From 4cc5d10f5120dfb9c947ed642cc6de1cdb04b016 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:58:07 +0200 Subject: [PATCH 1/3] fix(backup): bundle the whole forgejo /data (app.ini + ssh host keys) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The backup tarred only /data/git, but CONTRACT_004 §4.2 requires the git repos AND app.ini AND the host SSH keys — without app.ini a restored Forgejo has no DB/S3 config and won't start. Discovered during the T13 DR rehearsal: restore reached Forgejo and it had nothing to configure from. Tar the whole /data volume (git/, gitea/conf/app.ini, ssh/ssh_host_*). It is ~1 MB at Layer 0 — the DB and LFS/packages are externalised to Postgres + RustFS, so /data holds no large recreatable state. Restored end-to-end on a fresh VM: Forgejo comes up fully configured against the restored PG + RustFS. Co-Authored-By: Claude Opus 4.8 (1M context) --- backup/backup-remote.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backup/backup-remote.sh b/backup/backup-remote.sh index a32ce3b..07e5c50 100755 --- a/backup/backup-remote.sh +++ b/backup/backup-remote.sh @@ -30,9 +30,12 @@ mkdir -p "$W" echo "[backup] postgres pg_dumpall" >&2 docker exec foundation-postgres pg_dumpall -U postgres | gzip > "$W/postgres.sql.gz" -echo "[backup] forgejo git repos (tar.zst)" >&2 -# Forgejo keeps repos under /data/git; use the container's own tar (no extra image). -docker exec foundation-forgejo sh -c 'tar -C /data -cf - git' | zstd -q -T0 > "$W/forgejo-repos.tar.zst" +echo "[backup] forgejo data volume (tar.zst)" >&2 +# CONTRACT_004 §4.2 needs the git repos AND app.ini AND the host SSH keys — all live +# in the /data volume (git/, gitea/conf/app.ini, ssh/ssh_host_*). Tar the WHOLE volume +# (it is ~1 MB; the DB + LFS/packages are externalised to PG + RustFS, so /data has no +# large recreatable state at Layer 0). Use the container's own tar (no extra image). 
+docker exec foundation-forgejo sh -c 'tar -C /data -cf - .' | zstd -q -T0 > "$W/forgejo-repos.tar.zst" echo "[backup] vault raft snapshot" >&2 docker exec -e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN="$VAULT_TOKEN" foundation-vault \ From d807a45c79a65dda9e133b9c5d53fb728b0c1506 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:58:07 +0200 Subject: [PATCH 2/3] feat(dr): disaster restore to a fresh VM + runbook (T13) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rehearsed and validated. The destructive sibling of backup/restore.sh: rebuilds the ENTIRE egg on a fresh, Docker-equipped VM from the offsite, age-encrypted bundle, in the mandated order (CONTRACT_004 §4.4): Vault -> Postgres -> RustFS -> Forgejo. - restore-to-fresh-vm.sh (operator): pulls the disaster-survivable secret set from passphrase-encrypted config (age identity + Vault OLD unseal keys/root token), ships VERSIONS + the VM-side restorer, runs it (secrets on stdin). - restore-to-fresh-vm-remote.sh (VM-side): decrypt+verify bundle; restore Vault (init throwaway -> raft snapshot restore -force -> re-unseal with OLD keys, with a settle+retry loop because -force re-seals asynchronously); read every other service's creds back out of the restored Vault; restore Postgres, RustFS (buckets + scoped service account + blobs), and Forgejo (full /data incl. app.ini); publish git :22 only when free. - RUNBOOK.md: the human procedure, the {repo+passphrase+offsite} trust chain, and §5 re-establish-ingress (DNS, Caddy, runner, re-key). Rehearsal (throwaway cx33, offsite source, then destroyed): DR RESTORE OK — Vault unsealed with OLD keys, postgres rows=2, forge healthy against restored DB+S3, `git clone ssh://git@:2222/olsitec/foundation.git` returns all 28 commits, ai-baseline present. Trust chain proven end-to-end. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- dr/.gitkeep | 0 dr/RUNBOOK.md | 107 +++++++++++++++++++ dr/restore-to-fresh-vm-remote.sh | 169 +++++++++++++++++++++++++++++++ dr/restore-to-fresh-vm.sh | 58 +++++++++++ 4 files changed, 334 insertions(+) delete mode 100644 dr/.gitkeep create mode 100644 dr/RUNBOOK.md create mode 100755 dr/restore-to-fresh-vm-remote.sh create mode 100755 dr/restore-to-fresh-vm.sh diff --git a/dr/.gitkeep b/dr/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/dr/RUNBOOK.md b/dr/RUNBOOK.md new file mode 100644 index 0000000..f3838ee --- /dev/null +++ b/dr/RUNBOOK.md @@ -0,0 +1,107 @@ +# DR RUNBOOK — rebuild the foundation egg on a fresh VM (T13) + +**Realises** CONTRACT_004 §4.4 (restore order) · **Companion**: `dr/restore-to-fresh-vm.sh` +(orchestrator) + `dr/restore-to-fresh-vm-remote.sh` (VM-side). This is the +**destructive** sibling of `backup/restore.sh` (the non-destructive scratch verifier). + +## 0. When you are here + +The Helsinki VM (or its Vault/data) is gone. You still have: + +- **this git repo** (`olsitec/foundation`) — including `bootstrap/Pulumi.foundation.yaml`, + whose passphrase-encrypted `secure:` values hold the Vault **OLD unseal keys + root + token** (CONTRACT_002 §2.4) and the **age identity** (CONTRACT_004 §4.3); +- the **master passphrase** (`pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE`) — the + one external secret; without it nothing below decrypts; +- the **offsite bundle** (self-hosted MinIO `olsitec-foundation/<ts>/`) — RustFS is + assumed lost, so restore reads **offsite** (`--source off`). + +`{repo + passphrase + offsite bundle}` is sufficient to fully reconstitute the egg. +Nothing else is needed — that is the whole point of the trust chain (PLAN-002 §4.1). + +## 1. Pick the bundle + +List candidates in the offsite store and choose the newest verified one: + +``` +ts=<timestamp> # e.g. 
the latest in olsitec-foundation/ +``` + +A bundle is trustworthy only if `backup/restore.sh <ts> off` has passed for it (the +weekly verify job, CONTRACT_004 §4.6). Prefer the last green one. + +## 2. Provision a fresh VM + +**Real DR** — use the provision stack so the new VM becomes the managed home: + +``` +cd provision +export HCLOUD_TOKEN="$(pass olsicloud4/HCLOUD_TOKEN)" +# edit the server name if the old one still exists in Hetzner; the cloud-init already +# installs docker + age + zstd (jq is a base package) — backupTools in provision/index.ts. +PULUMI_CONFIG_PASSPHRASE=dev-validation-throwaway pulumi up +``` + +The VM must have, at first boot: **docker, age, zstd, jq** (the cloud-init provides +them). SSH is on **:222** for a provision-stack VM (the vendored cloud-init moves it); +pass `--port 222` below. + +**Rehearsal** — a throwaway VM created directly via the Hetzner API (cx33, debian-12, +ssh key `foundation-test-ssh-key`, cloud-init installing docker+age+zstd+jq), sshd on +**:22**. Destroy it immediately after (`DELETE /v1/servers/<id>`). + +## 3. Restore (Vault → Postgres → RustFS → Forgejo) + +``` +./dr/restore-to-fresh-vm.sh --host <vm-ip> --port <22|222> --ts "$ts" --source off +``` + +What it does, in the **mandated order** (CONTRACT_004 §4.4 — starting Forgejo before +1–3 is a defect): + +1. **Decrypt** the bundle with the age identity; verify every artifact's MANIFEST sha256. +2. **Vault** — start a fresh raft node, init a throwaway node, `raft snapshot restore + -force`, then **unseal with the OLD keys** from config. Vault is now the source of + truth again; the OLD root token authenticates. All other creds are read back out. +3. **Postgres** — start with the super-password from Vault, restore `postgres.sql.gz` + (recreates the `forgejo` role + DB; asserts `"user"` rows ≥ 1). +4. 
**RustFS** — start with the admin keys from Vault, recreate the four buckets + the + scoped service account (the exact `serviceKeyId/Secret` Forgejo's app.ini expects), + sync `rustfs-blobs` back into the buckets. +5. **Forgejo** — extract `forgejo-repos.tar.zst` into the data volume (git repos + + `app.ini`, which already carries DB/S3 creds + INTERNAL_TOKEN/JWTs), inject + `SECRET_KEY` from Vault, start. Asserts healthz pass + `olsitec/foundation.git` present. + +On success it prints `DR RESTORE OK (<ts>): …`. + +## 4. What is NOT restored (recreatable — CONTRACT_004 §4.5) + +Container images (re-pulled by digest), Caddy ACME data (re-issued), the runner's +ephemeral registration, search indexes/caches. These come back in §5. + +## 5. Re-establish ingress + management + +1. **DNS** — repoint `forge/git/s3/vault.olsitec.net` A records at the new IP + (Cloudflare; the bootstrap's `deployDns` does this once the stack is re-adopted). +2. **Caddy + runner** — re-adopt the stack so IaC manages the new VM: + ``` + cd bootstrap + pulumi config set foundation:vm.host <new-ip> + pulumi config set foundation:vm.sshPort <22|222> + ./run.sh up # creates Caddy (re-issues LE cert via DNS-01), re-registers the runner + ``` + `up` is idempotent against the already-restored containers it can adopt, and creates + the recreatable ones (Caddy, runner). Verify: `https://forge.olsitec.net` = 200, + `git clone git@git.olsitec.net:olsitec/foundation.git`. +3. **Re-key reminder (D2)** — after a real disaster, rotate the Vault root token + the + offsite creds (they were materialised on a possibly-compromised host): `pulumi up + --replace` the relevant credential resources, then re-run a backup. + +## 6. Gotchas (discovered during the T13 rehearsal) + +- The **docker gid** on the host is host-specific; the runner mounts the host socket + (PLAN-002 R5). On a fresh VM re-check the gid before trusting runner jobs. 
+- `raft snapshot restore -force` **re-seals** the node (it swaps in the snapshot's + keyring) — you MUST unseal again with the OLD keys, not the throwaway init keys. +- Restore reads **offsite** by default. RustFS on the new VM starts EMPTY; its blobs + come from the bundle, not from a surviving RustFS. diff --git a/dr/restore-to-fresh-vm-remote.sh b/dr/restore-to-fresh-vm-remote.sh new file mode 100755 index 0000000..08e5c4d --- /dev/null +++ b/dr/restore-to-fresh-vm-remote.sh @@ -0,0 +1,169 @@ +#!/bin/sh +# restore-to-fresh-vm-remote.sh — the VM-side DESTRUCTIVE disaster restore +# (CONTRACT_004 §4.4). Shipped + run by dr/restore-to-fresh-vm.sh; NOT run directly. +# This is the destructive sibling of backup/restore-remote.sh: it rebuilds the live +# platform on a FRESH VM from an offsite, age-encrypted bundle, in the mandated +# restore order: +# +# Vault -> Postgres -> RustFS -> Forgejo (Runner re-registers; Caddy re-issues) +# +# Trust chain (PLAN-002 §4.1): the bundle is decrypted with the age identity, and +# Vault is unsealed with the OLD unseal keys — BOTH travel in passphrase-encrypted +# config with the repo, so {repo + passphrase + offsite bundle} fully reconstitutes +# the egg even after total loss of the original VM AND Vault. Once Vault is back, it +# is the source of truth for every other service's credentials. +# +# Secrets on stdin (never argv); non-secrets are args. Image pins are sourced from +# the shipped VERSIONS. Idempotent-ish: safe to re-run after a failed attempt (it +# removes any half-built foundation-* containers first). +set -eu +IFS= read -r VAULT_UNSEAL_KEYS_JSON # OLD keys (JSON array) from config +IFS= read -r VAULT_ROOT_TOKEN # OLD root token from config +IFS= read -r AGE_IDENTITY # decrypts the bundle +IFS= read -r OFF_EP +IFS= read -r OFF_AK +IFS= read -r OFF_SK +TS="$1"; SRC="${2:-off}"; NET="${3:-foundation-net}"; SUBNET="${4:-172.30.0.0/24}" +OFFSITE_BUCKET=olsitec-foundation +DR=/tmp/foundation-dr-$TS +. 
/tmp/foundation-dr-VERSIONS # IMAGE_* pins +fail() { echo "DR RESTORE FAIL: $1" >&2; exit 1; } +log() { echo "[dr] $1" >&2; } + +# ── clean any partial prior attempt (NOT the data volumes — those are the restore) ─ +for c in foundation-vault foundation-postgres foundation-rustfs foundation-forgejo foundation-runner; do + docker rm -f "$c" >/dev/null 2>&1 || true +done +rm -rf "$DR"; (umask 077; mkdir -p "$DR") +docker network create --driver bridge --subnet "$SUBNET" "$NET" >/dev/null 2>&1 || true + +# ── 0. pull + decrypt the bundle from offsite ──────────────────────────────────── +log "pull bundle $TS from $SRC + age-decrypt" +( umask 077; printf '%s\n' "$AGE_IDENTITY" > "$DR/age.key" ) +docker run --rm -v "$DR":/w -e OFF_EP="$OFF_EP" -e OFF_AK="$OFF_AK" -e OFF_SK="$OFF_SK" \ + -e TS="$TS" -e OFFB="$OFFSITE_BUCKET" --entrypoint sh "$IMAGE_MC" -c ' + set -e + mc alias set off "$OFF_EP" "$OFF_AK" "$OFF_SK" >/dev/null + mc cp -r "off/$OFFB/$TS/" /w/ >/dev/null' +[ -f "$DR/MANIFEST.json" ] || { [ -d "$DR/$TS" ] && mv "$DR/$TS"/* "$DR"/; } +[ -f "$DR/MANIFEST.json" ] || fail "MANIFEST.json missing from offsite bundle" +for name in $(jq -r '.artifacts[].name' "$DR/MANIFEST.json"); do + [ -f "$DR/$name.age" ] || fail "$name.age missing" + age -d -i "$DR/age.key" -o "$DR/$name" "$DR/$name.age" 2>/dev/null || fail "decrypt $name" + got=$(sha256sum "$DR/$name" | cut -d' ' -f1) + want=$(jq -r --arg n "$name" '.artifacts[]|select(.name==$n).sha256' "$DR/MANIFEST.json") + [ "$got" = "$want" ] || fail "sha mismatch $name" +done +log "bundle decrypted + verified" + +# ── 1. 
VAULT: start fresh raft node, restore snapshot, unseal with OLD keys ─────── +log "restore Vault" +VLC='{"storage":{"raft":{"path":"/vault/file","node_id":"foundation-vault"}},"listener":{"tcp":{"address":"0.0.0.0:8200","tls_disable":true}},"api_addr":"http://foundation-vault:8200","cluster_addr":"http://foundation-vault:8201","ui":true,"disable_mlock":false}' +docker run -d --name foundation-vault --hostname foundation-vault --restart unless-stopped \ + --network "$NET" --cap-add IPC_LOCK -v foundation-vault-data:/vault/file \ + -e VAULT_LOCAL_CONFIG="$VLC" -e VAULT_API_ADDR=http://foundation-vault:8200 \ + "$IMAGE_VAULT" server >/dev/null +VE='-e VAULT_ADDR=http://127.0.0.1:8200' +vstat() { docker exec $VE foundation-vault vault status -format=json 2>/dev/null; } +i=0; until vstat >/dev/null 2>&1 || [ "$(vstat | jq -r '.initialized' 2>/dev/null)" = false ]; do + i=$((i+1)); [ "$i" -gt 40 ] && fail "vault not reachable"; sleep 2; done +# A raft snapshot can only be restored into an initialised, unsealed node. Init a +# THROWAWAY node, unseal it, restore -force (this swaps in the OLD data + keyring), +# then re-unseal with the OLD keys the snapshot expects. +if [ "$(vstat | jq -r '.initialized')" = false ]; then + TMP=$(docker exec $VE foundation-vault vault operator init -key-shares=1 -key-threshold=1 -format=json) + docker exec $VE foundation-vault vault operator unseal "$(printf '%s' "$TMP" | jq -r '.unseal_keys_b64[0]')" >/dev/null + TMP_ROOT=$(printf '%s' "$TMP" | jq -r '.root_token') + docker cp "$DR/vault-raft.snap" foundation-vault:/tmp/v.snap + docker exec $VE -e VAULT_TOKEN="$TMP_ROOT" foundation-vault vault operator raft snapshot restore -force /tmp/v.snap + docker exec foundation-vault rm -f /tmp/v.snap +fi +# `restore -force` swaps in the snapshot's keyring and re-seals the node — but the +# re-seal lands a moment AFTER the command returns (a race that silently skips the +# unseal if checked too early). 
Wait for the node to settle, then unseal with the +# OLD keys, RETRYING until it actually opens (the keys are correct; timing isn't). +unsealed= +for _ in $(seq 1 30); do + vstat >/dev/null 2>&1 || { sleep 2; continue; } + if [ "$(vstat | jq -r '.sealed')" = false ]; then unsealed=1; break; fi + printf '%s' "$VAULT_UNSEAL_KEYS_JSON" | jq -r '.[]' | while IFS= read -r k; do + docker exec $VE foundation-vault vault operator unseal "$k" >/dev/null 2>&1 || true + done + sleep 2 +done +[ "$unsealed" = 1 ] || fail "vault still sealed after restore (OLD keys rejected?)" +# OLD root token must now authenticate against the restored data. +docker exec $VE -e VAULT_TOKEN="$VAULT_ROOT_TOKEN" foundation-vault vault kv get -format=json foundation/postgres/service-credentials >/dev/null 2>&1 \ + || fail "restored Vault rejects OLD root token / missing creds" +log "Vault restored + unsealed; reading service creds" +gv() { docker exec $VE -e VAULT_TOKEN="$VAULT_ROOT_TOKEN" foundation-vault vault kv get -field="$2" "foundation/$1" 2>/dev/null; } +PG_SUPER_PW=$(gv postgres/service-credentials postgresSuperPassword) +RUSTFS_AK=$(gv rustfs/service-credentials rustfsAdminUser) +RUSTFS_SK=$(gv rustfs/service-credentials rustfsAdminPassword) +RUSTFS_SVC_ID=$(gv rustfs/service-credentials rustfsServiceKeyId) +RUSTFS_SVC_SECRET=$(gv rustfs/service-credentials rustfsServiceKeySecret) +FORGEJO_SECRET_KEY=$(gv forgejo/service-credentials forgejoSecretKey) + +# ── 2. 
POSTGRES: start, restore the dump (recreates forgejo role+DB) ────────────── +log "restore Postgres" +docker run -d --name foundation-postgres --hostname foundation-postgres --restart unless-stopped \ + --network "$NET" -v foundation-postgres-data:/var/lib/postgresql/data \ + -e POSTGRES_PASSWORD="$PG_SUPER_PW" "$IMAGE_POSTGRES" >/dev/null +i=0; until docker exec foundation-postgres pg_isready -U postgres >/dev/null 2>&1; do + i=$((i+1)); [ "$i" -gt 40 ] && fail "postgres not ready"; sleep 2; done +gunzip < "$DR/postgres.sql.gz" | docker exec -i foundation-postgres psql -U postgres -q >/dev/null 2>&1 || true +ROWS=$(docker exec foundation-postgres psql -U postgres -d forgejo -tAc 'SELECT count(*) FROM "user"' 2>/dev/null || echo 0) +[ "${ROWS:-0}" -ge 1 ] || fail "restored forgejo DB has no users" +log "Postgres restored: forgejo.\"user\" rows=$ROWS" + +# ── 3. RUSTFS: start, recreate buckets + service account, sync blobs back ───────── +log "restore RustFS" +docker run -d --name foundation-rustfs --hostname foundation-rustfs --restart unless-stopped \ + --network "$NET" -v foundation-rustfs-data:/data \ + -e RUSTFS_ACCESS_KEY="$RUSTFS_AK" -e RUSTFS_SECRET_KEY="$RUSTFS_SK" "$IMAGE_RUSTFS" >/dev/null +tar -C "$DR" -xf - <"/dev/null" 2>/dev/null || true +mkdir -p "$DR/blobs"; zstd -dc "$DR/rustfs-blobs.tar.zst" | tar -C "$DR/blobs" -xf - 2>/dev/null || true +docker run --rm --network "$NET" -v "$DR/blobs":/blobs \ + -e RAK="$RUSTFS_AK" -e RSK="$RUSTFS_SK" -e SID="$RUSTFS_SVC_ID" -e SSEC="$RUSTFS_SVC_SECRET" \ + --entrypoint sh "$IMAGE_MC" -c ' + set -e + for i in $(seq 1 30); do mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null 2>&1 && mc ls rfs >/dev/null 2>&1 && break; sleep 2; done + for b in forgejo-packages forgejo-artifacts forgejo-lfs foundation-backups; do mc mb --ignore-existing "rfs/$b" >/dev/null 2>&1 || true; done + mc admin user svcacct add --access-key "$SID" --secret-key "$SSEC" rfs "$RAK" >/dev/null 2>&1 || true + for b in 
forgejo-packages forgejo-artifacts forgejo-lfs; do + [ -d "/blobs/$b" ] && mc mirror --overwrite --quiet "/blobs/$b" "rfs/$b" >/dev/null 2>&1 || true + done' || fail "rustfs restore" +log "RustFS restored (buckets + service account + blobs)" + +# ── 4. FORGEJO: restore the /data volume (repos + app.ini + ssh keys), then start ── +log "restore Forgejo" +# Decompress on the HOST (zstd is installed there) to a plain tar, then extract into +# the volume with the forgejo image's busybox tar (no zstd inside the container). The +# restored /data carries app.ini (DB + S3 creds + INTERNAL_TOKEN/JWTs) and the SSH +# host keys, so Forgejo comes up fully configured against the restored PG + RustFS. +zstd -d -f "$DR/forgejo-repos.tar.zst" -o "$DR/forgejo-repos.tar" +docker run --rm -v foundation-forgejo-data:/data -v "$DR":/dr --entrypoint sh "$IMAGE_FORGEJO" -c 'cd /data && tar -xf /dr/forgejo-repos.tar' +# Publish the canonical git endpoint on host :22 only when it's free. On a real DR VM +# the provision cloud-init puts admin sshd on :222, so :22 is available (matching live); +# on a VM whose admin sshd sits on :22 we skip it (Forgejo is still reachable on :2222). +P22="-p 22:22" +if ss -Hltn 'sport = :22' 2>/dev/null | grep -q . || netstat -ltn 2>/dev/null | grep -q ':22 '; then + P22=""; log "host :22 is in use (admin sshd?) 
— publishing Forgejo git on :2222 only" +fi +docker run -d --name foundation-forgejo --hostname foundation-forgejo --restart unless-stopped \ + --network "$NET" -v foundation-forgejo-data:/data \ + -e USER_UID=1000 -e USER_GID=1000 \ + -e FORGEJO__security__INSTALL_LOCK=true \ + -e FORGEJO__server__START_SSH_SERVER=false \ + -e "FORGEJO__security__SECRET_KEY=$FORGEJO_SECRET_KEY" \ + $P22 -p 2222:22 "$IMAGE_FORGEJO" >/dev/null +i=0; until docker exec foundation-forgejo wget -qO- http://127.0.0.1:3000/api/healthz 2>/dev/null | head -3 | grep -q '"status": "pass"'; do + i=$((i+1)); [ "$i" -gt 60 ] && { docker logs --tail 40 foundation-forgejo >&2; fail "forgejo not healthy"; }; sleep 4; done +log "Forgejo healthy against restored DB + S3" + +# ── verify the restore reconstituted the forge ─────────────────────────────────── +REPO_OK=$(docker exec foundation-forgejo sh -c '[ -d /data/git/repositories/olsitec/foundation.git ] && echo yes || echo no') +[ "$REPO_OK" = yes ] || fail "olsitec/foundation.git not present after restore" +USERS=$(docker exec foundation-forgejo curl -s http://127.0.0.1:3000/api/v1/repos/olsitec/foundation | jq -r '.owner.login' 2>/dev/null) +rm -rf "$DR" +echo "DR RESTORE OK ($TS): vault unsealed, postgres rows=$ROWS, forge healthy, repo present, org=$USERS" diff --git a/dr/restore-to-fresh-vm.sh b/dr/restore-to-fresh-vm.sh new file mode 100755 index 0000000..20e1214 --- /dev/null +++ b/dr/restore-to-fresh-vm.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# restore-to-fresh-vm.sh — DISASTER RECOVERY orchestrator (CONTRACT_004 §4.4; T13). +# +# ./dr/restore-to-fresh-vm.sh --host [--port 22] [--key ] \ +# --ts [--source off|rfs] +# +# Rebuilds the ENTIRE platform on a FRESH, Docker-equipped VM from an offsite, +# age-encrypted bundle — the destructive sibling of backup/restore.sh. Unlike that +# scratch verifier, this stands the egg back UP (Vault->Postgres->RustFS->Forgejo). 
+# +# The only inputs are {this repo + the master passphrase + a reachable fresh VM}: +# - the age IDENTITY and the Vault OLD unseal keys/root token come from +# passphrase-encrypted config (they travel with the repo — CONTRACT_004 §4.3, +# CONTRACT_002 §2.4), so the bundle decrypts and Vault unseals even though the +# original VM and its Vault are gone; +# - everything else is read back out of the restored Vault on the new VM. +# +# Prereqs on the fresh VM: docker, age, zstd, jq (the provision cloud-init installs +# them — dr/RUNBOOK.md §2). DNS/Caddy/runner are re-established afterwards (RUNBOOK §5). +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +DIR="$ROOT/bootstrap" +HOST=""; PORT=22; KEY="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}"; TS=""; SRC=off +while [ $# -gt 0 ]; do case "$1" in + --host) HOST="$2"; shift 2;; --port) PORT="$2"; shift 2;; --key) KEY="$2"; shift 2;; + --ts) TS="$2"; shift 2;; --source) SRC="$2"; shift 2;; + *) echo "unknown arg: $1" >&2; exit 2;; esac; done +[ -n "$HOST" ] || { echo "usage: restore-to-fresh-vm.sh --host --ts [--port N] [--key P] [--source off|rfs]" >&2; exit 2; } +[ -n "$TS" ] || { echo "--ts required (a bundle in the offsite bucket)" >&2; exit 2; } + +export PULUMI_BACKEND_URL="file://${DIR}/state" +export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +cd "$DIR"; pulumi stack select foundation >/dev/null + +# Secrets from passphrase-encrypted config (the disaster-survivable set). 
+UK=$(pulumi config get vaultCredentials:unsealKeys) +RTOK=$(pulumi config get vaultCredentials:rootToken) +AGE_ID=$(pulumi config get foundation:backup.ageIdentity) +OFF_EP=$(pulumi config get foundation:backup.offsiteEndpoint) +OFF_AK=$(pulumi config get foundation:backup.offsiteAccessKey) +OFF_SK=$(pulumi config get foundation:backup.offsiteSecretKey) +NET=$(pulumi config get foundation:network.name) +SUBNET=$(pulumi config get foundation:network.subnet) + +SSHX="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=20 -i $KEY -p $PORT root@$HOST" + +echo "dr: restoring bundle $TS ($SRC) onto fresh VM $HOST:$PORT" +$SSHX "for t in docker age zstd jq; do command -v \$t >/dev/null || { echo \"missing \$t on target VM\" >&2; exit 1; }; done" \ + || { echo "dr: target VM missing prereqs (docker/age/zstd/jq) — see dr/RUNBOOK.md §2" >&2; exit 1; } + +# Ship the image pins + the VM-side restorer. +$SSHX "cat > /tmp/foundation-dr-VERSIONS" < "$ROOT/VERSIONS" +$SSHX "cat > /tmp/foundation-dr-remote-$TS.sh" < "$ROOT/dr/restore-to-fresh-vm-remote.sh" +# Secrets on stdin (never argv): unseal keys, root token, age identity, offsite creds. +printf '%s\n%s\n%s\n%s\n%s\n%s\n' "$UK" "$RTOK" "$AGE_ID" "$OFF_EP" "$OFF_AK" "$OFF_SK" \ + | $SSHX "sh /tmp/foundation-dr-remote-$TS.sh '$TS' '$SRC' '$NET' '$SUBNET'; rc=\$?; rm -f /tmp/foundation-dr-remote-$TS.sh /tmp/foundation-dr-VERSIONS; exit \$rc" +echo "dr: restore complete. Next (RUNBOOK §5): re-point DNS to $HOST, bring up Caddy + runner," +echo "dr: then re-adopt the stack with vm.host=$HOST (pulumi up) to resume IaC management." From dda83bdc870de549d344e62280dc586ad265a265 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Wed, 1 Jul 2026 00:15:01 +0200 Subject: [PATCH 3/3] feat(ci): baked CI image + runner config + self-check workflow (T14) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stand up the foundation's own CI on its Forgejo runner. 
The committed scope here is the self-contained half (toolchain + typecheck); the stack-state-dependent pipelines (pulumi preview, backup-verify) need CI secrets + a state fetch and land next. - containers/ci-image/Dockerfile + VERSIONS IMAGE_CI: one baked image carrying exactly what preflight validates (pulumi/bun/node/docker/git/age/zstd/jq/vault/ psql/mc). Built on the VM (like caddy-cloudflare) and used LOCALLY by the runner. - runner.ts: give act_runner a config.yaml — container.network=foundation-net (so job containers reach foundation-forgejo:3000 for checkout + the data plane) and force_pull=false (use the local foundation-ci image, no registry). Self-heals on up. - .forgejo/workflows/ci.yml: preflight (tools + versions vs VERSIONS pins) + typecheck (bun install + tsc --noEmit on bootstrap). Gates every push. - run.sh / backup.sh / restore.sh / dr: take PULUMI_CONFIG_PASSPHRASE from env when set (CI secret), falling back to `pass` (operator) — so the scripts run pass-free in CI. Reusable-workflows architecture (per the chosen direction) — the ecosystem CI (semantic-release, docker/npm/bun builds, eslint/yamllint over the 999_testing.md candidates) builds on this image + runner next phase. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .forgejo/workflows/.gitkeep | 0 .forgejo/workflows/ci.yml | 32 ++++++++++++++++++ VERSIONS | 7 ++++ backup/backup.sh | 2 +- backup/restore.sh | 2 +- bootstrap/components/runner.ts | 23 +++++++++++-- bootstrap/run.sh | 2 +- containers/ci-image/Dockerfile | 61 ++++++++++++++++++++++++++++++++++ dr/restore-to-fresh-vm.sh | 2 +- 9 files changed, 125 insertions(+), 6 deletions(-) delete mode 100644 .forgejo/workflows/.gitkeep create mode 100644 .forgejo/workflows/ci.yml create mode 100644 containers/ci-image/Dockerfile diff --git a/.forgejo/workflows/.gitkeep b/.forgejo/workflows/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/.forgejo/workflows/ci.yml b/.forgejo/workflows/ci.yml new file mode 100644 index 0000000..da2ebed --- /dev/null +++ b/.forgejo/workflows/ci.yml @@ -0,0 +1,32 @@ +# CI — foundation self-checks (T14). Runs on the foundation's own runner, in the +# baked foundation-ci image (VERSIONS IMAGE_CI; force_pull:false → local image). +# These two jobs are self-contained (checkout + toolchain only) — no stack state or +# secrets needed, so they gate every push. The stack-state-dependent pipelines +# (pulumi preview, backup-verify) live in their own files and need CI secrets + +# a state fetch (see those workflows' headers). 
+name: CI +on: + push: + pull_request: + +jobs: + preflight: + runs-on: docker + container: + image: foundation-ci:latest + steps: + - uses: actions/checkout@v4 + - name: Toolchain preflight (tools present + >= VERSIONS pins) + run: ./preflight/preflight.sh tools versions + + typecheck: + runs-on: docker + container: + image: foundation-ci:latest + steps: + - uses: actions/checkout@v4 + - name: Install workspace deps + run: bun install --frozen-lockfile || bun install + - name: Typecheck bootstrap (tsc --noEmit) + working-directory: bootstrap + run: bunx tsc --noEmit diff --git a/VERSIONS b/VERSIONS index cd2c4f0..8fa8ab5 100644 --- a/VERSIONS +++ b/VERSIONS @@ -72,6 +72,13 @@ IMAGE_REGISTRY=registry:2@sha256:PIN_DIGEST # (T04) and backup put/get (T12). RustFS speaks enough of the MinIO admin API. IMAGE_MC=minio/mc:latest@sha256:a7fe349ef4bd8521fb8497f55c6042871b2ae640607cf99d9bede5e9bdf11727 +# CI toolchain image (T14): the baked image the foundation's own .forgejo/workflows +# run in (pulumi/bun/node/docker/git/age/zstd/jq/vault/psql/mc). Built ON the VM from +# containers/ci-image/Dockerfile (like caddy-cloudflare) and used LOCALLY by the runner +# (runner config force_pull:false) — not pulled from a registry, so the tag is the ref. +# Rebuild: scp the Dockerfile + `docker build -t foundation-ci:latest .` on the VM. +IMAGE_CI=foundation-ci:latest + # NOTE on specific images: # IMAGE_RUSTFS uses `latest` because RustFS does not (yet) publish stable # semver tags reliably (PLAN-002 R3 — RustFS is young). MUST be pinned by diff --git a/backup/backup.sh b/backup/backup.sh index 6a99317..11acfe4 100755 --- a/backup/backup.sh +++ b/backup/backup.sh @@ -13,7 +13,7 @@ ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" DIR="$ROOT/bootstrap" TS="${1:-$(date -u +%Y%m%dT%H%M%SZ)}" export PULUMI_BACKEND_URL="file://${DIR}/state" -export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +export PULUMI_CONFIG_PASSPHRASE="${PULUMI_CONFIG_PASSPHRASE:-$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)}" KEY="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}" MC_IMAGE="$(grep '^IMAGE_MC=' "$ROOT/VERSIONS" | cut -d= -f2-)" cd "$DIR" diff --git a/backup/restore.sh b/backup/restore.sh index 6206690..9d333e0 100755 --- a/backup/restore.sh +++ b/backup/restore.sh @@ -12,7 +12,7 @@ DIR="$ROOT/bootstrap" TS="${1:?usage: restore.sh [rfs|off]}" SRC="${2:-rfs}" export PULUMI_BACKEND_URL="file://${DIR}/state" -export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +export PULUMI_CONFIG_PASSPHRASE="${PULUMI_CONFIG_PASSPHRASE:-$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)}" KEY="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}" MC_IMAGE="$(grep '^IMAGE_MC=' "$ROOT/VERSIONS" | cut -d= -f2-)" PG_IMAGE="$(grep '^IMAGE_POSTGRES=' "$ROOT/VERSIONS" | cut -d= -f2-)" diff --git a/bootstrap/components/runner.ts b/bootstrap/components/runner.ts index f4a076d..9c5a8cb 100644 --- a/bootstrap/components/runner.ts +++ b/bootstrap/components/runner.ts @@ -42,6 +42,24 @@ export function deployRunner( { provider, retainOnDelete: true }, // holds .runner registration secret ); + // act_runner config (T14): job containers must join foundation-net to reach + // foundation-forgejo:3000 (checkout) + the data plane, and must NOT force-pull — + // the CI toolchain image (foundation-ci, VERSIONS IMAGE_CI) is built locally on the + // VM, not in a registry. valid_volumes allows jobs to mount the host docker socket + // (docker-label builds). Re-written on every up so config drift self-heals. 
+ const RUNNER_CONFIG = `log: + level: info +runner: + capacity: 2 + timeout: 30m + fetch_interval: 2s +container: + network: foundation-net + force_pull: false + valid_volumes: + - /var/run/docker.sock +`; + const register = new command.remote.Command( "foundation-runner-register", { @@ -51,6 +69,7 @@ VOL=foundation-runner-data IMG='${img}' LABELS='${labels}' docker volume inspect "$VOL" >/dev/null 2>&1 || docker volume create "$VOL" >/dev/null +printf '%s' '${RUNNER_CONFIG}' | docker run --rm -i --entrypoint sh -v "$VOL":/data "$IMG" -c 'cat > /data/config.yaml' if docker run --rm --entrypoint sh -v "$VOL":/data "$IMG" -c '[ -s /data/.runner ]'; then echo "runner already registered" else @@ -60,7 +79,7 @@ else echo "runner registered" fi`, addPreviousOutputInEnv: false, - triggers: [forgejo.ready.id, labels], + triggers: [forgejo.ready.id, labels, RUNNER_CONFIG], }, { dependsOn: [forgejo.ready] }, ); @@ -73,7 +92,7 @@ fi`, hostname: "foundation-runner", restart: "unless-stopped", entrypoints: ["/bin/forgejo-runner"], - command: ["daemon"], + command: ["daemon", "-c", "/data/config.yaml"], // T14 runner config (network/force_pull) // The image runs as uid 1000; add the host docker group (gid of // /var/run/docker.sock) so the daemon can reach the socket without running // as root. NOTE: 996 is THIS host's docker gid — re-check on DR to a new VM diff --git a/bootstrap/run.sh b/bootstrap/run.sh index 9b8c88c..e00231c 100755 --- a/bootstrap/run.sh +++ b/bootstrap/run.sh @@ -5,7 +5,7 @@ DIR="$(cd "$(dirname "$0")" && pwd)" # Pin the backend PER-PROCESS via env — NEVER `pulumi login` (that mutates the # GLOBAL backend pointer in ~/.pulumi and would misdirect other projects' run.sh). 
export PULUMI_BACKEND_URL="file://${DIR}/state" -export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +export PULUMI_CONFIG_PASSPHRASE="${PULUMI_CONFIG_PASSPHRASE:-$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)}" export SSH_PRIVATE_KEY_PATH="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}" cd "$DIR" pulumi stack select foundation 2>/dev/null || pulumi stack init foundation diff --git a/containers/ci-image/Dockerfile b/containers/ci-image/Dockerfile new file mode 100644 index 0000000..4269dd9 --- /dev/null +++ b/containers/ci-image/Dockerfile @@ -0,0 +1,61 @@ +# foundation-ci — the baked CI toolchain image (T14). +# +# A single, pinnable image carrying every tool the foundation's own pipelines need +# so jobs don't install a toolchain on each run. Referenced by .forgejo/workflows/* +# via `container: foundation-ci:`. Built on the VM (like caddy-cloudflare) and +# used locally by the runner (force_pull:false) — see runner.ts / VERSIONS IMAGE_CI. +# +# Carries exactly what preflight/checks/tools.sh validates: pulumi, bun, node, +# docker (cli), git, age, zstd, jq, vault, psql, pg_dump, ssh, mc — plus pass-free +# operation (PULUMI_CONFIG_PASSPHRASE + SSH key arrive as CI secrets/env). +FROM node:20-bookworm + +ARG PULUMI_VERSION=3.145.0 +ARG VAULT_VERSION=1.18.5 +ARG MC_RELEASE=RELEASE.2025-04-03T17-07-56Z +ARG TARGETARCH=amd64 + +ENV DEBIAN_FRONTEND=noninteractive +# Install pulumi + bun into /usr/local/bin so they're on PATH for ANY shell/user +# (a login shell resets PATH, and jobs may not run as root). 
+ENV BUN_INSTALL=/usr/local + +# --- base apt tools: git, ssh, age, zstd, jq, postgresql-client, docker CLI ---------- +RUN set -eux; \ + install -m 0755 -d /etc/apt/keyrings; \ + apt-get update; \ + apt-get install -y --no-install-recommends \ + ca-certificates curl gnupg lsb-release unzip \ + git openssh-client age zstd jq; \ + # docker CE CLI (jobs build/push images via the mounted host socket) + curl -fsSL https://download.docker.com/linux/debian/gpg -o /etc/apt/keyrings/docker.asc; \ + chmod a+r /etc/apt/keyrings/docker.asc; \ + echo "deb [arch=$TARGETARCH signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/debian bookworm stable" > /etc/apt/sources.list.d/docker.list; \ + # postgresql-client 16 (psql + pg_dump) from pgdg + curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /etc/apt/keyrings/pgdg.gpg; \ + echo "deb [signed-by=/etc/apt/keyrings/pgdg.gpg] https://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list; \ + apt-get update; \ + apt-get install -y --no-install-recommends docker-ce-cli postgresql-client-16; \ + rm -rf /var/lib/apt/lists/* + +# --- pulumi (pinned) → copy binaries to /usr/local/bin ------------------------------- +RUN set -eux; curl -fsSL https://get.pulumi.com | sh -s -- --version "$PULUMI_VERSION"; \ + cp /root/.pulumi/bin/* /usr/local/bin/; rm -rf /root/.pulumi; \ + pulumi version + +# --- bun (official installer, version NOT pinned; BUN_INSTALL=/usr/local) ------------ +RUN set -eux; curl -fsSL https://bun.sh/install | bash; bun --version + +# --- vault CLI (pinned) -------------------------------------------------------------- +RUN set -eux; \ + curl -fsSL "https://releases.hashicorp.com/vault/${VAULT_VERSION}/vault_${VAULT_VERSION}_linux_${TARGETARCH}.zip" -o /tmp/vault.zip; \ + unzip -d /usr/local/bin /tmp/vault.zip; rm -f /tmp/vault.zip; vault --version + +# --- minio client mc (pinned release) ------------------------------------------------
+RUN set -eux; \ + curl -fsSL "https://dl.min.io/client/mc/release/linux-${TARGETARCH}/archive/mc.${MC_RELEASE}" -o /usr/local/bin/mc; \ + chmod +x /usr/local/bin/mc; mc --version + +# Forgejo Actions overrides the entrypoint with its job script; keep a sane default. +WORKDIR /workspace +CMD ["bash"] diff --git a/dr/restore-to-fresh-vm.sh b/dr/restore-to-fresh-vm.sh index 20e1214..4324ac0 100755 --- a/dr/restore-to-fresh-vm.sh +++ b/dr/restore-to-fresh-vm.sh @@ -29,7 +29,7 @@ while [ $# -gt 0 ]; do case "$1" in [ -n "$TS" ] || { echo "--ts required (a bundle in the offsite bucket)" >&2; exit 2; } export PULUMI_BACKEND_URL="file://${DIR}/state" -export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +export PULUMI_CONFIG_PASSPHRASE="${PULUMI_CONFIG_PASSPHRASE:-$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)}" cd "$DIR"; pulumi stack select foundation >/dev/null # Secrets from passphrase-encrypted config (the disaster-survivable set).