#!/bin/sh
# restore-to-fresh-vm-remote.sh — the VM-side DESTRUCTIVE disaster restore
# (CONTRACT_004 §4.4). Shipped + run by dr/restore-to-fresh-vm.sh; NOT run directly.
# This is the destructive sibling of backup/restore-remote.sh: it rebuilds the live
# platform on a FRESH VM from an offsite, age-encrypted bundle, in the mandated
# restore order:
#
#   Vault -> Postgres -> RustFS -> Forgejo   (Runner re-registers; Caddy re-issues)
#
# Trust chain (PLAN-002 §4.1): the bundle is decrypted with the age identity, and
# Vault is unsealed with the OLD unseal keys — BOTH travel in passphrase-encrypted
# config with the repo, so {repo + passphrase + offsite bundle} fully reconstitutes
# the egg even after total loss of the original VM AND Vault. Once Vault is back, it
# is the source of truth for every other service's credentials.
#
# Usage:  restore-to-fresh-vm-remote.sh TS [SRC] [NET] [SUBNET]
#   TS      bundle id: the offsite prefix to pull, also names the scratch dir
#   SRC     label for the bundle source, used in log output only (default: off)
#   NET     docker network to create/use            (default: foundation-net)
#   SUBNET  subnet for that network                 (default: 172.30.0.0/24)
#
# Stdin (one value per line, in this order):
#   OLD Vault unseal keys (JSON array), OLD Vault root token, age identity,
#   offsite endpoint, offsite access key, offsite secret key.
#
# Secrets arrive on stdin (never argv) and are handed to docker through the
# environment (`-e NAME`, value inherited) so they do not show up in the docker
# CLI's argv. Image pins are sourced from the shipped VERSIONS.
# Idempotent-ish: safe to re-run after a failed attempt — it removes any half-built
# foundation-* containers first, and a Vault volume that is already initialised is
# only unsealed, not re-initialised.
set -eu

fail() { printf '%s\n' "DR RESTORE FAIL: $1" >&2; exit 1; }
log()  { printf '%s\n' "[dr] $1" >&2; }

# ── inputs ────────────────────────────────────────────────────────────────────────
# `read` returns non-zero at EOF; accept a final line without a trailing newline
# but report a short stdin instead of letting `set -e` kill the script silently.
VAULT_UNSEAL_KEYS_JSON=; VAULT_ROOT_TOKEN=; AGE_IDENTITY=; OFF_EP=; OFF_AK=; OFF_SK=
IFS= read -r VAULT_UNSEAL_KEYS_JSON || [ -n "$VAULT_UNSEAL_KEYS_JSON" ] || fail "stdin: Vault unseal keys missing"
IFS= read -r VAULT_ROOT_TOKEN       || [ -n "$VAULT_ROOT_TOKEN" ]       || fail "stdin: Vault root token missing"
IFS= read -r AGE_IDENTITY           || [ -n "$AGE_IDENTITY" ]           || fail "stdin: age identity missing"
IFS= read -r OFF_EP                 || [ -n "$OFF_EP" ]                 || fail "stdin: offsite endpoint missing"
IFS= read -r OFF_AK                 || [ -n "$OFF_AK" ]                 || fail "stdin: offsite access key missing"
IFS= read -r OFF_SK                 || [ -n "$OFF_SK" ]                 || fail "stdin: offsite secret key missing"

[ "$#" -ge 1 ] || fail "usage: $0 TS [SRC] [NET] [SUBNET]"
TS=$1; SRC=${2:-off}; NET=${3:-foundation-net}; SUBNET=${4:-172.30.0.0/24}
# TS becomes part of a path that is `rm -rf`ed — refuse anything that could escape /tmp.
case $TS in
  ''|.|..|*/*) fail "invalid bundle id: '$TS'" ;;
esac

# Catch malformed keys BEFORE anything destructive happens.
printf '%s' "$VAULT_UNSEAL_KEYS_JSON" | jq -e 'type == "array" and length > 0' >/dev/null 2>&1 \
  || fail "Vault unseal keys are not a non-empty JSON array"

OFFSITE_BUCKET=olsitec-foundation
DR=/tmp/foundation-dr-$TS
VAULT_INTERNAL_ADDR=http://127.0.0.1:8200

[ -r /tmp/foundation-dr-VERSIONS ] || fail "/tmp/foundation-dr-VERSIONS missing (image pins)"
. /tmp/foundation-dr-VERSIONS   # IMAGE_* pins
: "${IMAGE_MC:?not set by VERSIONS}" "${IMAGE_VAULT:?not set by VERSIONS}" \
  "${IMAGE_POSTGRES:?not set by VERSIONS}" "${IMAGE_RUSTFS:?not set by VERSIONS}" \
  "${IMAGE_FORGEJO:?not set by VERSIONS}"

# The scratch dir holds the age identity and the DECRYPTED bundle: remove it on
# every exit path, not only on success.
cleanup() { rm -rf "$DR"; }
trap cleanup EXIT
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

# ── clean any partial prior attempt (NOT the data volumes — those are the restore) ─
for c in foundation-vault foundation-postgres foundation-rustfs foundation-forgejo foundation-runner; do
  docker rm -f "$c" >/dev/null 2>&1 || true
done
rm -rf "$DR"
( umask 077; mkdir -p "$DR" )
# Best-effort: the network may already exist from a previous attempt.
docker network create --driver bridge --subnet "$SUBNET" "$NET" >/dev/null 2>&1 || true

# ── 0. pull + decrypt the bundle from offsite ────────────────────────────────────
log "pull bundle $TS from $SRC + age-decrypt"
( umask 077; printf '%s\n' "$AGE_IDENTITY" > "$DR/age.key" )
OFF_EP="$OFF_EP" OFF_AK="$OFF_AK" OFF_SK="$OFF_SK" \
  docker run --rm -v "$DR":/w -e OFF_EP -e OFF_AK -e OFF_SK \
    -e TS="$TS" -e OFFB="$OFFSITE_BUCKET" --entrypoint sh "$IMAGE_MC" -c '
      set -e
      mc alias set off "$OFF_EP" "$OFF_AK" "$OFF_SK" >/dev/null
      mc cp -r "off/$OFFB/$TS/" /w/ >/dev/null' \
  || fail "offsite pull of bundle $TS failed"

# mc may copy the prefix as a sub-directory; flatten it if so.
if [ ! -f "$DR/MANIFEST.json" ] && [ -d "$DR/$TS" ]; then
  mv "$DR/$TS"/* "$DR"/
fi
[ -f "$DR/MANIFEST.json" ] || fail "MANIFEST.json missing from offsite bundle"

# Decrypt every artifact listed in the manifest and check it against the recorded
# sha256. Names are read line-by-line on fd 3 so nothing in the loop can eat them
# and names containing spaces are not word-split.
jq -r '.artifacts[].name' "$DR/MANIFEST.json" > "$DR/.artifacts.list" \
  || fail "MANIFEST.json unreadable"
while IFS= read -r name <&3; do
  [ -n "$name" ] || continue
  [ -f "$DR/$name.age" ] || fail "$name.age missing"
  age -d -i "$DR/age.key" -o "$DR/$name" "$DR/$name.age" 2>/dev/null || fail "decrypt $name"
  got=$(sha256sum "$DR/$name" | cut -d' ' -f1)
  want=$(jq -r --arg n "$name" '.artifacts[]|select(.name==$n).sha256' "$DR/MANIFEST.json")
  [ "$got" = "$want" ] || fail "sha mismatch $name"
done 3< "$DR/.artifacts.list"
# The identity is not needed past this point — do not keep it on disk.
rm -f "$DR/age.key" "$DR/.artifacts.list"
log "bundle decrypted + verified"

# ── 1. VAULT: start fresh raft node, restore snapshot, unseal with OLD keys ───────
log "restore Vault"
VLC='{"storage":{"raft":{"path":"/vault/file","node_id":"foundation-vault"}},"listener":{"tcp":{"address":"0.0.0.0:8200","tls_disable":true}},"api_addr":"http://foundation-vault:8200","cluster_addr":"http://foundation-vault:8201","ui":true,"disable_mlock":false}'
docker run -d --name foundation-vault --hostname foundation-vault --restart unless-stopped \
  --network "$NET" --cap-add IPC_LOCK -v foundation-vault-data:/vault/file \
  -e VAULT_LOCAL_CONFIG="$VLC" -e VAULT_API_ADDR=http://foundation-vault:8200 \
  "$IMAGE_VAULT" server >/dev/null

# vault_cmd ARGS… — run `vault ARGS…` inside the container, unauthenticated.
vault_cmd() {
  docker exec -e VAULT_ADDR="$VAULT_INTERNAL_ADDR" foundation-vault vault "$@" </dev/null
}
# vault_as TOKEN ARGS… — same, authenticated as TOKEN. The token reaches docker via
# the environment (`-e VAULT_TOKEN` with no value), not via argv.
vault_as() {
  _vault_tok=$1; shift
  VAULT_TOKEN="$_vault_tok" docker exec -e VAULT_ADDR="$VAULT_INTERNAL_ADDR" -e VAULT_TOKEN \
    foundation-vault vault "$@" </dev/null
}
# `vault status` exits 2 when the node is sealed (0 = unsealed, 1 = error), so its
# exit code must NOT be used as a reachability test: read the JSON it prints.
vstat()  { vault_cmd status -format=json 2>/dev/null; }
# vfield JQ_FILTER — print one field of the status JSON (empty if unreachable).
vfield() { vstat | jq -r "$1" 2>/dev/null; }

# Reachable == status JSON parses and reports an initialised flag, sealed or not.
i=0
while :; do
  initialized=$(vfield '.initialized') || initialized=
  case $initialized in true|false) break ;; esac
  i=$((i+1))
  if [ "$i" -gt 40 ]; then fail "vault not reachable"; fi
  sleep 2
done

# A raft snapshot can only be restored into an initialised, unsealed node. Init a
# THROWAWAY node, unseal it, restore -force (this swaps in the OLD data + keyring),
# then re-unseal with the OLD keys the snapshot expects. An already-initialised
# volume (re-run after a partial attempt) skips straight to the unseal.
if [ "$initialized" = false ]; then
  [ -f "$DR/vault-raft.snap" ] || fail "vault-raft.snap missing from bundle"
  TMP=$(vault_cmd operator init -key-shares=1 -key-threshold=1 -format=json) \
    || fail "throwaway vault init failed"
  # NOTE(review): unseal keys are still passed as a `vault` argument here and in the
  # loop below; only the docker -e values were moved off argv.
  vault_cmd operator unseal "$(printf '%s' "$TMP" | jq -r '.unseal_keys_b64[0]')" >/dev/null
  TMP_ROOT=$(printf '%s' "$TMP" | jq -r '.root_token')
  docker cp "$DR/vault-raft.snap" foundation-vault:/tmp/v.snap
  vault_as "$TMP_ROOT" operator raft snapshot restore -force /tmp/v.snap
  docker exec foundation-vault rm -f /tmp/v.snap
fi

# old_creds_readable — true once the OLD root token can read the restored data.
old_creds_readable() {
  vault_as "$VAULT_ROOT_TOKEN" kv get -format=json \
    foundation/postgres/service-credentials >/dev/null 2>&1
}

# `restore -force` swaps in the snapshot's keyring and re-seals the node — but the
# re-seal lands a moment AFTER the command returns. "sealed=false" alone is therefore
# not proof of success (it may still be the throwaway node), so the loop only stops
# when the node is unsealed AND the OLD root token reads the restored creds. While
# the node reports sealed, keep submitting the OLD keys (the keys are correct; the
# timing isn't).
unsealed=
sealed=
n=0
while [ "$n" -lt 30 ]; do
  n=$((n+1))
  sealed=$(vfield '.sealed') || sealed=
  case $sealed in
    false)
      if old_creds_readable; then unsealed=1; break; fi
      ;;
    true)
      printf '%s' "$VAULT_UNSEAL_KEYS_JSON" | jq -r '.[]' | while IFS= read -r k; do
        vault_cmd operator unseal "$k" >/dev/null 2>&1 || true
      done
      ;;
  esac
  sleep 2
done
if [ "$unsealed" != 1 ]; then
  if [ "$sealed" = false ]; then
    fail "restored Vault rejects OLD root token / missing creds"
  fi
  fail "vault still sealed after restore (OLD keys rejected?)"
fi
log "Vault restored + unsealed; reading service creds"

# gv PATH FIELD — print one field of foundation/PATH using the OLD root token.
gv() { vault_as "$VAULT_ROOT_TOKEN" kv get -field="$2" "foundation/$1" 2>/dev/null; }
PG_SUPER_PW=$(gv postgres/service-credentials postgresSuperPassword) \
  || fail "Vault: postgres/service-credentials postgresSuperPassword unreadable"
RUSTFS_AK=$(gv rustfs/service-credentials rustfsAdminUser) \
  || fail "Vault: rustfs/service-credentials rustfsAdminUser unreadable"
RUSTFS_SK=$(gv rustfs/service-credentials rustfsAdminPassword) \
  || fail "Vault: rustfs/service-credentials rustfsAdminPassword unreadable"
RUSTFS_SVC_ID=$(gv rustfs/service-credentials rustfsServiceKeyId) \
  || fail "Vault: rustfs/service-credentials rustfsServiceKeyId unreadable"
RUSTFS_SVC_SECRET=$(gv rustfs/service-credentials rustfsServiceKeySecret) \
  || fail "Vault: rustfs/service-credentials rustfsServiceKeySecret unreadable"
FORGEJO_SECRET_KEY=$(gv forgejo/service-credentials forgejoSecretKey) \
  || fail "Vault: forgejo/service-credentials forgejoSecretKey unreadable"

# ── 2. POSTGRES: start, restore the dump (recreates forgejo role+DB) ──────────────
log "restore Postgres"
POSTGRES_PASSWORD="$PG_SUPER_PW" \
  docker run -d --name foundation-postgres --hostname foundation-postgres --restart unless-stopped \
    --network "$NET" -v foundation-postgres-data:/var/lib/postgresql/data \
    -e POSTGRES_PASSWORD "$IMAGE_POSTGRES" >/dev/null
i=0
until docker exec foundation-postgres pg_isready -U postgres >/dev/null 2>&1; do
  i=$((i+1))
  if [ "$i" -gt 40 ]; then fail "postgres not ready"; fi
  sleep 2
done
# Best-effort replay: the outcome is judged by the row check below, not by psql's
# exit status.
gunzip < "$DR/postgres.sql.gz" \
  | docker exec -i foundation-postgres psql -U postgres -q >/dev/null 2>&1 \
  || log "WARNING: psql exited non-zero while replaying the dump (row check follows)"
ROWS=$(docker exec foundation-postgres psql -U postgres -d forgejo -tAc 'SELECT count(*) FROM "user"' 2>/dev/null || echo 0)
[ "${ROWS:-0}" -ge 1 ] || fail "restored forgejo DB has no users"
log "Postgres restored: forgejo.\"user\" rows=$ROWS"

# ── 3. RUSTFS: start, recreate buckets + service account, sync blobs back ─────────
log "restore RustFS"
RUSTFS_ACCESS_KEY="$RUSTFS_AK" RUSTFS_SECRET_KEY="$RUSTFS_SK" \
  docker run -d --name foundation-rustfs --hostname foundation-rustfs --restart unless-stopped \
    --network "$NET" -v foundation-rustfs-data:/data \
    -e RUSTFS_ACCESS_KEY -e RUSTFS_SECRET_KEY "$IMAGE_RUSTFS" >/dev/null
mkdir -p "$DR/blobs"
if [ -f "$DR/rustfs-blobs.tar.zst" ]; then
  # Still non-fatal, but no longer silent.
  zstd -dc "$DR/rustfs-blobs.tar.zst" | tar -C "$DR/blobs" -xf - \
    || log "WARNING: could not fully unpack rustfs-blobs.tar.zst — blobs may be incomplete"
else
  log "no rustfs-blobs.tar.zst in bundle — buckets will be recreated empty"
fi
# Bucket/service-account creation and the mirror stay best-effort, but RustFS never
# becoming reachable is a hard failure: otherwise every step below is skipped by its
# `|| true` and the restore would report success with nothing restored.
RAK="$RUSTFS_AK" RSK="$RUSTFS_SK" SID="$RUSTFS_SVC_ID" SSEC="$RUSTFS_SVC_SECRET" \
  docker run --rm --network "$NET" -v "$DR/blobs":/blobs -e RAK -e RSK -e SID -e SSEC \
    --entrypoint sh "$IMAGE_MC" -c '
      set -e
      ready=
      for i in $(seq 1 30); do
        if mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null 2>&1 && mc ls rfs >/dev/null 2>&1; then
          ready=1
          break
        fi
        sleep 2
      done
      [ -n "$ready" ] || { echo "rustfs not reachable" >&2; exit 1; }
      for b in forgejo-packages forgejo-artifacts forgejo-lfs foundation-backups; do
        mc mb --ignore-existing "rfs/$b" >/dev/null 2>&1 || true
      done
      mc admin user svcacct add --access-key "$SID" --secret-key "$SSEC" rfs "$RAK" >/dev/null 2>&1 || true
      for b in forgejo-packages forgejo-artifacts forgejo-lfs; do
        if [ -d "/blobs/$b" ]; then
          mc mirror --overwrite --quiet "/blobs/$b" "rfs/$b" >/dev/null 2>&1 || echo "WARNING: mirror of $b reported errors" >&2
        fi
      done' \
  || fail "rustfs restore"
log "RustFS restored (buckets + service account + blobs)"

# ── 4. FORGEJO: restore the /data volume (repos + app.ini + ssh keys), then start ──
log "restore Forgejo"
# Decompress on the HOST (zstd is installed there) to a plain tar, then extract into
# the volume with the forgejo image's busybox tar (no zstd inside the container). The
# restored /data carries app.ini (DB + S3 creds + INTERNAL_TOKEN/JWTs) and the SSH
# host keys, so Forgejo comes up fully configured against the restored PG + RustFS.
[ -f "$DR/forgejo-repos.tar.zst" ] || fail "forgejo-repos.tar.zst missing from bundle"
zstd -d -f "$DR/forgejo-repos.tar.zst" -o "$DR/forgejo-repos.tar" \
  || fail "decompress forgejo-repos.tar.zst"
docker run --rm -v foundation-forgejo-data:/data -v "$DR":/dr --entrypoint sh "$IMAGE_FORGEJO" \
  -c 'cd /data && tar -xf /dr/forgejo-repos.tar' \
  || fail "extract forgejo-repos.tar into the data volume"

# Publish the canonical git endpoint on host :22 only when it's free. On a real DR VM
# the provision cloud-init puts admin sshd on :222, so :22 is available (matching live);
# on a VM whose admin sshd sits on :22 we skip it (Forgejo is still reachable on :2222).
# The publish flags are built in "$@" (the script's own arguments are consumed by now)
# so no unquoted expansion is needed.
set -- -p 2222:22
if ss -Hltn 'sport = :22' 2>/dev/null | grep -q . || netstat -ltn 2>/dev/null | grep -q ':22 '; then
  log "host :22 is in use (admin sshd?) — publishing Forgejo git on :2222 only"
else
  set -- -p 22:22 "$@"
fi
FORGEJO__security__SECRET_KEY="$FORGEJO_SECRET_KEY" \
  docker run -d --name foundation-forgejo --hostname foundation-forgejo --restart unless-stopped \
    --network "$NET" -v foundation-forgejo-data:/data \
    -e USER_UID=1000 -e USER_GID=1000 \
    -e FORGEJO__security__INSTALL_LOCK=true \
    -e FORGEJO__server__START_SSH_SERVER=false \
    -e FORGEJO__security__SECRET_KEY \
    "$@" "$IMAGE_FORGEJO" >/dev/null
i=0
until docker exec foundation-forgejo wget -qO- http://127.0.0.1:3000/api/healthz 2>/dev/null \
        | head -3 | grep -q '"status": "pass"'; do
  i=$((i+1))
  if [ "$i" -gt 60 ]; then
    docker logs --tail 40 foundation-forgejo >&2
    fail "forgejo not healthy"
  fi
  sleep 4
done
log "Forgejo healthy against restored DB + S3"

# ── verify the restore reconstituted the forge ───────────────────────────────────
REPO_OK=$(docker exec foundation-forgejo sh -c '[ -d /data/git/repositories/olsitec/foundation.git ] && echo yes || echo no') \
  || REPO_OK=no
[ "$REPO_OK" = yes ] || fail "olsitec/foundation.git not present after restore"
# Informational only: a probe failure must not abort an otherwise finished restore.
USERS=$(docker exec foundation-forgejo curl -s http://127.0.0.1:3000/api/v1/repos/olsitec/foundation | jq -r '.owner.login' 2>/dev/null) \
  || USERS=
rm -rf "$DR"
echo "DR RESTORE OK ($TS): vault unsealed, postgres rows=$ROWS, forge healthy, repo present, org=$USERS"