From d5c53ce9a2e3954c4f73ec4fbd796572d386829a Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 22:31:13 +0200 Subject: [PATCH 01/10] feat(provision): open :22 for the canonical git endpoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The VM's admin sshd is on :222, so :22 is free for Forgejo's git-over-SSH. Opening it makes the scp-form clone `git@git.olsitec.net:olsitec/...` work — Forgejo's sshd ForceCommands `serv`, so :22 only ever does git (like github.com:22). :2222 stays open too (CONTRACT_001 forgeSshPort). Co-Authored-By: Claude Opus 4.8 (1M context) --- provision/index.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/provision/index.ts b/provision/index.ts index 303acac..8bc5b53 100644 --- a/provision/index.ts +++ b/provision/index.ts @@ -76,6 +76,10 @@ const firewall = new hcloud.Firewall( { direction: "in", protocol: "tcp", port: "443", sourceIps: ["0.0.0.0/0", "::/0"] }, // Forgejo git-over-SSH (CONTRACT_001 forgeSshPort) { direction: "in", protocol: "tcp", port: "2222", sourceIps: ["0.0.0.0/0", "::/0"] }, + // Forgejo git-over-SSH on :22 too — the VM's admin sshd is on 222, so :22 is + // free for the canonical git endpoint, making the scp-form clone + // `git@git.olsitec.net:olsitec/...` work (Forgejo sshd ForceCommands `serv`). + { direction: "in", protocol: "tcp", port: "22", sourceIps: ["0.0.0.0/0", "::/0"] }, { direction: "in", protocol: "icmp", sourceIps: ["0.0.0.0/0", "::/0"] }, ], }, From 9618da1421f871ffb17fcf2668bd291bfecb28e0 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 22:38:37 +0200 Subject: [PATCH 02/10] feat(bootstrap): forgejo actions runner (T10) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit foundation-runner (forgejo/runner:6, digest-pinned). 
Registration is idempotent (ADR-007): it reuses /data/.runner if present, else mints a token via `forgejo actions generate-runner-token` and consumes it with `forgejo-runner register` (the token never leaves the VM). The daemon runs as uid 1000 with the host docker group (gid 996) added for socket access — root-equivalent and co-located, the documented day-zero compromise (PLAN-002 R5 / PLAN-001 §4a); a fenced or separate runner VM is the steady state. Live on cx33 Helsinki: runner declared (labels docker,dind) and polling; a hello-world `runs-on: docker` workflow pushed to olsitec/foundation ran to success (workflow run #1). Acceptance T10 met. Co-Authored-By: Claude Opus 4.8 (1M context) --- VERSIONS | 2 +- bootstrap/components/runner.ts | 103 +++++++++++++++++++++++++++++++++ bootstrap/index.ts | 6 +- 3 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 bootstrap/components/runner.ts diff --git a/VERSIONS b/VERSIONS index 6befa05..cd2c4f0 100644 --- a/VERSIONS +++ b/VERSIONS @@ -64,7 +64,7 @@ IMAGE_FORGEJO=codeberg.org/forgejo/forgejo:11@sha256:d98d860ea64fd36cb0aabf0b46b IMAGE_POSTGRES=postgres:17@sha256:5c855ad7b85e68e48a62f34662853f38b57c1c1d80f3a927ab58034fd6d31c5e IMAGE_VAULT=hashicorp/vault:1.18@sha256:750bb37c1638fa194ab37053a81618c61bb0491ddec6fccac87c07a8e6cd8166 IMAGE_RUSTFS=rustfs/rustfs:latest@sha256:fa19210ac4697c79d7ccca1ec9b0eb91aebacc6691991ffb14014bb3c67e6cc3 -IMAGE_ACT_RUNNER=code.forgejo.org/forgejo/runner:6@sha256:PIN_DIGEST +IMAGE_ACT_RUNNER=code.forgejo.org/forgejo/runner:6@sha256:e8dd2880f2fc81984d2308b93f1bc064dfb41187942300676536c09a3b30043d IMAGE_REGISTRY=registry:2@sha256:PIN_DIGEST # Tool image: MinIO client `mc` — used transiently (never a long-running service) diff --git a/bootstrap/components/runner.ts b/bootstrap/components/runner.ts new file mode 100644 index 0000000..f4a076d --- /dev/null +++ b/bootstrap/components/runner.ts @@ -0,0 +1,103 @@ +// components/runner.ts (T10) +// +// foundation-runner — a Forgejo 
Actions runner (CONTRACT_003 §3.2). Registration +// is idempotent (ADR-007): if the persistent /data/.runner config already exists it +// is reused; otherwise a fresh registration token is minted with +// `forgejo actions generate-runner-token` and consumed by `forgejo-runner register` +// (the token never leaves the VM). The daemon then polls Forgejo for jobs. +// +// SECURITY (PLAN-002 R5 / PLAN-001 §4a): this runner shares the VM and is given the +// host Docker socket so `docker`-label jobs can spawn containers — i.e. it is +// root-equivalent on the host and is NOT fenced from the forge's trust boundary. +// That is the documented day-zero compromise; the steady-state is a throwaway, +// separate privileged runner VM. Do not run untrusted workflows here until fenced. +import * as pulumi from "@pulumi/pulumi"; +import * as docker from "@pulumi/docker"; +import * as command from "@pulumi/command"; +import { DeployCtx } from "../lib/context"; +import { vmConnection } from "../lib/remote"; +import { ForgejoOutputs } from "./forgejo"; + +export interface RunnerOutputs { + container: docker.Container; + registered: command.remote.Command; +} + +export function deployRunner( + ctx: DeployCtx, + forgejo: ForgejoOutputs, +): RunnerOutputs { + const { cfg, provider, network } = ctx; + const img = ctx.image("ACT_RUNNER"); + const labels = cfg.runner.labels.join(","); + + const image = new docker.RemoteImage( + "foundation-runner-image", + { name: img, keepLocally: true }, + { provider }, + ); + const volume = new docker.Volume( + "foundation-runner-data", + { name: "foundation-runner-data" }, + { provider, retainOnDelete: true }, // holds .runner registration secret + ); + + const register = new command.remote.Command( + "foundation-runner-register", + { + connection: vmConnection(ctx), + create: pulumi.interpolate`set -eu +VOL=foundation-runner-data +IMG='${img}' +LABELS='${labels}' +docker volume inspect "$VOL" >/dev/null 2>&1 || docker volume create "$VOL" >/dev/null 
+if docker run --rm --entrypoint sh -v "$VOL":/data "$IMG" -c '[ -s /data/.runner ]'; then + echo "runner already registered" +else + TOKEN=$(docker exec -u git foundation-forgejo forgejo actions generate-runner-token) + docker run --rm --network foundation-net --entrypoint /bin/forgejo-runner -v "$VOL":/data "$IMG" \\ + register --no-interactive --instance http://foundation-forgejo:3000 --token "$TOKEN" --name foundation-runner --labels "$LABELS" >/dev/null + echo "runner registered" +fi`, + addPreviousOutputInEnv: false, + triggers: [forgejo.ready.id, labels], + }, + { dependsOn: [forgejo.ready] }, + ); + + const container = new docker.Container( + "foundation-runner", + { + name: "foundation-runner", + image: image.imageId, + hostname: "foundation-runner", + restart: "unless-stopped", + entrypoints: ["/bin/forgejo-runner"], + command: ["daemon"], + // The image runs as uid 1000; add the host docker group (gid of + // /var/run/docker.sock) so the daemon can reach the socket without running + // as root. NOTE: 996 is THIS host's docker gid — re-check on DR to a new VM + // (`stat -c %g /var/run/docker.sock`). Socket access is root-equivalent + // regardless (see the security note above). 
+ groupAdds: ["996"], + envs: ["DOCKER_HOST=unix:///var/run/docker.sock"], + volumes: [ + { volumeName: volume.name, containerPath: "/data" }, + { + hostPath: "/var/run/docker.sock", + containerPath: "/var/run/docker.sock", + }, + ], + networksAdvanced: [{ name: network.name, aliases: ["foundation-runner"] }], + logDriver: "json-file", + logOpts: { "max-size": "10m", "max-file": "3" }, + }, + { + provider, + dependsOn: [network, register], + deleteBeforeReplace: true, + }, + ); + + return { container, registered: register }; +} diff --git a/bootstrap/index.ts b/bootstrap/index.ts index 5cf7982..1c1ed20 100644 --- a/bootstrap/index.ts +++ b/bootstrap/index.ts @@ -19,6 +19,7 @@ import { deployRustfs } from "./components/rustfs"; import { deployVault } from "./components/vault"; import { deployProxy } from "./components/proxy"; import { deployForgejo, bootstrapForgejo } from "./components/forgejo"; +import { deployRunner } from "./components/runner"; import * as fs from "fs"; const cfg = loadConfig(); @@ -70,18 +71,19 @@ const forgejoBootstrap = bootstrapForgejo(ctx, { repoName: "foundation", sshPublicKey, }); -// const runner = deployRunner(ctx, { forgejo, credentials }); +const runner = cfg.features.runner ? deployRunner(ctx, forgejo) : undefined; // ============================================================================= // Stack outputs (extended as phases land). // vaultCreds (T06) is a gate for Forgejo (T08) — it has no output to export yet. 
void vaultCreds; -export const phase = "T09-forge-bootstrap"; // admin + org + repo + operator key +export const phase = "T10-runner"; // forge + CI runner live export const caddyImageId = proxy.imageId; export const forgejoEndpoint = forgejo.endpoint; export const cloneUrl = pulumi.interpolate`git@${cfg.hosts.git}:${cfg.forgejo.orgName}/foundation.git`; void forgejoBootstrap; // GATE B consumer; no secret output to export +void runner; // CI runner (feature-flagged) export const networkName = network.name; export const vmTarget = `${cfg.vm.user}@${cfg.vm.host}`; export const postgresEndpoint = postgres.endpoint; From 41172b35116140ce4587cb64ddf62c58a6421d80 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 22:46:51 +0200 Subject: [PATCH 03/10] feat(backup): backup + restore-verify with offsite replication (T12) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit backup/backup.sh (operator orchestrator) + backup-remote.sh (VM assembler) produce a CONTRACT_004 bundle in RustFS foundation-backups/<ts>/ and replicate it to the offsite olsitec-foundation bucket: pg_dumpall, forgejo git repos (tar.zst), vault raft snapshot, pulumi state, rustfs blobs, MANIFEST.json (sha256 + restore order). The timestamp is caller-supplied (§4.1); secrets travel on stdin (never argv, ADR-007); mc runs containerized. restore.sh + restore-remote.sh are the §4.6 verifier: pull a bundle (rfs or offsite), check MANIFEST shas, then NON-DESTRUCTIVELY reconstruct into scratch resources and assert (postgres users>0, olsitec/foundation.git present, vault snapshot non-empty). Live on cx33 Helsinki: bundle written to RustFS + offsite; restore-verify PASSES from BOTH sources (forgejo.user rows=2, repo present, 16KB vault snapshot). Known gap: at-rest age encryption (§4.3) not yet applied — both destinations are private/access-controlled; adding age (generate key + encrypt-before-upload) is the next hardening. Acceptance T12 met. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- backup/backup-remote.sh | 67 ++++++++++++++++++++++++++++++++++++++ backup/backup.sh | 40 +++++++++++++++++++++++ backup/restore-remote.sh | 70 ++++++++++++++++++++++++++++++++++++++++ backup/restore.sh | 33 +++++++++++++++++++ 4 files changed, 210 insertions(+) create mode 100755 backup/backup-remote.sh create mode 100755 backup/backup.sh create mode 100755 backup/restore-remote.sh create mode 100755 backup/restore.sh diff --git a/backup/backup-remote.sh b/backup/backup-remote.sh new file mode 100755 index 0000000..0ec19f4 --- /dev/null +++ b/backup/backup-remote.sh @@ -0,0 +1,67 @@ +#!/bin/sh +# backup-remote.sh — the VM-side bundle assembler (CONTRACT_004 producer half). +# Shipped + run by backup/backup.sh; NOT run directly. Secrets arrive on stdin +# (never argv); non-secrets ($TS, $MC_IMAGE) are args. pulumi-state.json is already +# in $W (the operator placed it there before invoking this). +# +# Produces foundation-backups// in RustFS and replicates it to the offsite +# bucket. Artifacts per CONTRACT_004 §4.2: postgres.sql.gz, forgejo-repos.tar.zst, +# vault-raft.snap, pulumi-state.json, rustfs-blobs/, MANIFEST.json. +# +# NOTE: at-rest age encryption (CONTRACT_004 §4.3) is NOT yet applied — both +# destinations are private (RustFS internal, offsite scoped creds). Adding age is +# the next hardening (generate the key, encrypt each artifact before `mc cp`). +set -eu +IFS= read -r VAULT_TOKEN +IFS= read -r OFF_EP +IFS= read -r OFF_AK +IFS= read -r OFF_SK +IFS= read -r BUCKET +TS="$1" +MC_IMAGE="$2" +OFFSITE_BUCKET=olsitec-foundation +W="/tmp/foundation-backup-$TS" +mkdir -p "$W" + +echo "[backup] postgres pg_dumpall" >&2 +docker exec foundation-postgres pg_dumpall -U postgres | gzip > "$W/postgres.sql.gz" + +echo "[backup] forgejo git repos (tar.zst)" >&2 +# Forgejo keeps repos under /data/git; use the container's own tar (no extra image). 
+docker exec foundation-forgejo sh -c 'tar -C /data -cf - git' | zstd -q -T0 > "$W/forgejo-repos.tar.zst" + +echo "[backup] vault raft snapshot" >&2 +docker exec -e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN="$VAULT_TOKEN" foundation-vault \ + sh -c 'vault operator raft snapshot save /tmp/v.snap >/dev/null 2>&1 && cat /tmp/v.snap && rm -f /tmp/v.snap' > "$W/vault-raft.snap" + +echo "[backup] MANIFEST.json" >&2 +( cd "$W" + jq -n --arg ts "$TS" \ + --argjson files "$(for f in postgres.sql.gz forgejo-repos.tar.zst vault-raft.snap pulumi-state.json; do + [ -f "$f" ] || continue + jq -n --arg n "$f" --arg sha "$(sha256sum "$f" | cut -d' ' -f1)" --argjson sz "$(stat -c %s "$f")" \ + '{name:$n, sha256:$sha, size:$sz}' + done | jq -s '.')" \ + '{timestamp:$ts, restoreOrder:["vault","postgres","rustfs","forgejo"], artifacts:$files}' > MANIFEST.json +) + +# RustFS root creds from the running container (VM-trusted). +RAK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_ACCESS_KEY=//p') +RSK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_SECRET_KEY=//p') + +echo "[backup] upload to RustFS $BUCKET/$TS + replicate offsite" >&2 +docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ + -e RAK="$RAK" -e RSK="$RSK" -e OFF_EP="$OFF_EP" -e OFF_AK="$OFF_AK" -e OFF_SK="$OFF_SK" \ + -e BUCKET="$BUCKET" -e TS="$TS" -e OFFB="$OFFSITE_BUCKET" \ + "$MC_IMAGE" -c ' + set -e + mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null + mc alias set off "$OFF_EP" "$OFF_AK" "$OFF_SK" >/dev/null + mc cp -r /w/ "rfs/$BUCKET/$TS/" >/dev/null + for b in forgejo-packages forgejo-artifacts forgejo-lfs; do + mc mirror --overwrite --quiet "rfs/$b" "rfs/$BUCKET/$TS/rustfs-blobs/$b" >/dev/null 2>&1 || true + done + mc mirror --overwrite --quiet "rfs/$BUCKET/$TS" "off/$OFFB/$TS" >/dev/null + ' +rm -rf "$W" +echo "[backup] complete: rfs/$BUCKET/$TS (+ 
offsite $OFFSITE_BUCKET/$TS)" >&2 diff --git a/backup/backup.sh b/backup/backup.sh new file mode 100755 index 0000000..c2190fb --- /dev/null +++ b/backup/backup.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +# backup.sh — CONTRACT_004 backup producer (operator orchestrator). +# +# ./backup/backup.sh [UTC-timestamp] +# +# The timestamp is supplied by the caller (CI/cron) per CONTRACT_004 §4.1; it +# defaults to now for manual runs. The operator contributes the Pulumi state +# (local file backend) and the secrets (from passphrase-encrypted config); the +# heavy lifting runs on the VM via backup-remote.sh. Result: a bundle in RustFS +# foundation-backups/<ts>/ replicated to the offsite bucket. +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." && pwd)" +DIR="$ROOT/bootstrap" +TS="${1:-$(date -u +%Y%m%dT%H%M%SZ)}" +export PULUMI_BACKEND_URL="file://${DIR}/state" +export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +KEY="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}" +MC_IMAGE="$(grep '^IMAGE_MC=' "$ROOT/VERSIONS" | cut -d= -f2-)" +cd "$DIR" +pulumi stack select foundation >/dev/null + +RT=$(pulumi config get vaultCredentials:rootToken) +OFF_EP=$(pulumi config get foundation:backup.offsiteEndpoint) +OFF_AK=$(pulumi config get foundation:backup.offsiteAccessKey) +OFF_SK=$(pulumi config get foundation:backup.offsiteSecretKey) +BUCKET=$(pulumi config get foundation:backup.bucket) +HOST=$(pulumi config get foundation:vm.host) +PORT=$(pulumi config get foundation:vm.sshPort) +SUSER=$(pulumi config get foundation:vm.user) +SSHX="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=15 -i $KEY -p $PORT $SUSER@$HOST" +W="/tmp/foundation-backup-$TS" + +echo "backup: $TS -> rfs/$BUCKET/$TS (+ offsite)" +# Pulumi state + the assembler script onto the VM. 
+pulumi stack export | $SSHX "mkdir -p $W && cat > $W/pulumi-state.json" +$SSHX "cat > /tmp/backup-remote-$TS.sh" < "$ROOT/backup/backup-remote.sh" +# Run the assembler: secrets on stdin (never argv), TS + MC_IMAGE as args. +printf '%s\n%s\n%s\n%s\n%s\n' "$RT" "$OFF_EP" "$OFF_AK" "$OFF_SK" "$BUCKET" \ + | $SSHX "sh /tmp/backup-remote-$TS.sh '$TS' '$MC_IMAGE'; rm -f /tmp/backup-remote-$TS.sh" +echo "backup: done ($TS)" diff --git a/backup/restore-remote.sh b/backup/restore-remote.sh new file mode 100755 index 0000000..12a10b0 --- /dev/null +++ b/backup/restore-remote.sh @@ -0,0 +1,70 @@ +#!/bin/sh +# restore-remote.sh — VM-side SCRATCH restore verifier (CONTRACT_004 consumer half, +# §4.6 "a backup is not trusted until restored"). Shipped + run by backup/restore.sh. +# NON-DESTRUCTIVE: it reconstructs into throwaway scratch resources and asserts the +# bundle is restorable — it never touches the live containers/volumes. A real +# disaster restore (overwriting live, restore order Vault->Postgres->RustFS->Forgejo) +# is dr/restore-to-fresh-vm.sh (T13), out of scope here. +# +# Secrets on stdin; non-secrets ($TS, $MC_IMAGE, $PG_IMAGE, $SRC) as args. 
+set -eu +IFS= read -r OFF_EP +IFS= read -r OFF_AK +IFS= read -r OFF_SK +IFS= read -r BUCKET +TS="$1"; MC_IMAGE="$2"; PG_IMAGE="$3"; SRC="${4:-rfs}" +OFFSITE_BUCKET=olsitec-foundation +W="/tmp/foundation-restore-$TS" +rm -rf "$W"; mkdir -p "$W" +fail() { echo "RESTORE VERIFY FAIL: $1" >&2; docker rm -f foundation-restore-pg >/dev/null 2>&1 || true; exit 1; } + +RAK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_ACCESS_KEY=//p') +RSK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_SECRET_KEY=//p') + +echo "[restore] pull bundle $TS from $SRC" >&2 +docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ + -e RAK="$RAK" -e RSK="$RSK" -e OFF_EP="$OFF_EP" -e OFF_AK="$OFF_AK" -e OFF_SK="$OFF_SK" \ + -e BUCKET="$BUCKET" -e TS="$TS" -e SRC="$SRC" -e OFFB="$OFFSITE_BUCKET" "$MC_IMAGE" -c ' + set -e + mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null + if [ "$SRC" = off ]; then + mc alias set off "$OFF_EP" "$OFF_AK" "$OFF_SK" >/dev/null + mc cp -r "off/$OFFB/$TS/" /w/ >/dev/null + else + mc cp -r "rfs/$BUCKET/$TS/" /w/ >/dev/null + fi' +# mc cp -r nests under $TS/ — flatten if needed +[ -f "$W/MANIFEST.json" ] || { [ -d "$W/$TS" ] && mv "$W/$TS"/* "$W"/; } +[ -f "$W/MANIFEST.json" ] || fail "MANIFEST.json missing from pulled bundle" + +echo "[restore] verify MANIFEST sha256" >&2 +cd "$W" +jq -r '.artifacts[] | "\(.sha256) \(.name)"' MANIFEST.json | while read -r sha name; do + [ -f "$name" ] || { echo "missing $name" >&2; exit 1; } + got=$(sha256sum "$name" | cut -d' ' -f1) + [ "$got" = "$sha" ] || { echo "sha mismatch $name" >&2; exit 1; } +done || fail "MANIFEST sha verification failed" + +echo "[restore] scratch Postgres restore + assert" >&2 +docker rm -f foundation-restore-pg >/dev/null 2>&1 || true +docker run -d --name foundation-restore-pg -e POSTGRES_PASSWORD=scratch "$PG_IMAGE" >/dev/null +i=0; until docker 
exec foundation-restore-pg pg_isready -U postgres >/dev/null 2>&1; do + i=$((i+1)); [ "$i" -gt 30 ] && fail "scratch postgres not ready"; sleep 2; done +gunzip < postgres.sql.gz | docker exec -i foundation-restore-pg psql -U postgres -q >/dev/null 2>&1 || fail "psql restore errored" +ROWS=$(docker exec foundation-restore-pg psql -U postgres -d forgejo -tAc 'SELECT count(*) FROM "user"' 2>/dev/null || echo 0) +[ "${ROWS:-0}" -ge 1 ] || fail "restored forgejo DB has no users (got '$ROWS')" +echo "[restore] postgres OK: forgejo.\"user\" rows=$ROWS" >&2 +docker rm -f foundation-restore-pg >/dev/null 2>&1 || true + +echo "[restore] extract forgejo repos + assert olsitec/foundation present" >&2 +mkdir -p repos +zstd -dc forgejo-repos.tar.zst | tar -C repos -xf - 2>/dev/null || fail "forgejo tar extract failed" +[ -d repos/git/repositories/olsitec/foundation.git ] || fail "olsitec/foundation.git not in repo bundle" +echo "[restore] forgejo repos OK: olsitec/foundation.git present" >&2 + +echo "[restore] vault snapshot sanity" >&2 +[ -s vault-raft.snap ] || fail "vault-raft.snap empty" +echo "[restore] vault snapshot OK: $(stat -c %s vault-raft.snap) bytes" >&2 + +rm -rf "$W" +echo "RESTORE VERIFY PASS ($TS from $SRC)" diff --git a/backup/restore.sh b/backup/restore.sh new file mode 100755 index 0000000..d8efe1d --- /dev/null +++ b/backup/restore.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +# restore.sh — CONTRACT_004 §4.6 restore verifier (operator orchestrator). +# +# ./backup/restore.sh [rfs|off] +# +# Pulls the bundle (default from RustFS; `off` checks the offsite copy) and asserts +# it reconstructs into scratch resources — NON-DESTRUCTIVE, it never touches the +# live platform. The real disaster restore is dr/restore-to-fresh-vm.sh (T13). +set -euo pipefail +ROOT="$(cd "$(dirname "$0")/.." 
&& pwd)" +DIR="$ROOT/bootstrap" +TS="${1:?usage: restore.sh [rfs|off]}" +SRC="${2:-rfs}" +export PULUMI_BACKEND_URL="file://${DIR}/state" +export PULUMI_CONFIG_PASSPHRASE="$(pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE)" +KEY="${SSH_PRIVATE_KEY_PATH:-${HOME}/.ssh/foundation-test_ed25519}" +MC_IMAGE="$(grep '^IMAGE_MC=' "$ROOT/VERSIONS" | cut -d= -f2-)" +PG_IMAGE="$(grep '^IMAGE_POSTGRES=' "$ROOT/VERSIONS" | cut -d= -f2-)" +cd "$DIR" +pulumi stack select foundation >/dev/null + +OFF_EP=$(pulumi config get foundation:backup.offsiteEndpoint) +OFF_AK=$(pulumi config get foundation:backup.offsiteAccessKey) +OFF_SK=$(pulumi config get foundation:backup.offsiteSecretKey) +BUCKET=$(pulumi config get foundation:backup.bucket) +HOST=$(pulumi config get foundation:vm.host) +PORT=$(pulumi config get foundation:vm.sshPort) +SUSER=$(pulumi config get foundation:vm.user) +SSHX="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=15 -i $KEY -p $PORT $SUSER@$HOST" + +$SSHX "cat > /tmp/restore-remote-$TS.sh" < "$ROOT/backup/restore-remote.sh" +printf '%s\n%s\n%s\n%s\n' "$OFF_EP" "$OFF_AK" "$OFF_SK" "$BUCKET" \ + | $SSHX "sh /tmp/restore-remote-$TS.sh '$TS' '$MC_IMAGE' '$PG_IMAGE' '$SRC'; rm -f /tmp/restore-remote-$TS.sh" From 0e5d1e2fee5d05882da6ec2e31c620818b22a1d5 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 22:48:15 +0200 Subject: [PATCH 04/10] =?UTF-8?q?docs(session):=20SESSION=5F2026-06-30=5F0?= =?UTF-8?q?02=20=E2=80=94=20Wave=202=20complete,=20egg=20is=20live?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Data plane (postgres/rustfs/vault) → creds-in-Vault → Caddy DNS-01 → Forgejo → admin/org/repo → runner → backup, all deployed live and validated. The goal is met: git clone git@git.olsitec.net:olsitec/foundation.git works. Records state, the ADR-007 control-plane mechanism, known gaps (age encryption, refresh ipam diff), and the remaining PLAN-002 tasks (T11/T13/T14/T15). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../sessions/SESSION_2026-06-30_002.md | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 documentation/sessions/SESSION_2026-06-30_002.md diff --git a/documentation/sessions/SESSION_2026-06-30_002.md b/documentation/sessions/SESSION_2026-06-30_002.md new file mode 100644 index 0000000..845832d --- /dev/null +++ b/documentation/sessions/SESSION_2026-06-30_002.md @@ -0,0 +1,61 @@ +# Session 2026-06-30 #002 — Wave 2: data plane → forge → CI → backup (egg is LIVE) + +## What was done +Built and deployed **all of Wave 2** live to the Helsinki VM (cx33, `204.168.234.72`, +SSH :222). The egg now runs **6 containers** and **`git clone git@git.olsitec.net:olsitec/foundation.git` +works**. Each task is a reviewable commit (atomic, conventional). + +- **ADR-007** — control-plane ops via `@pulumi/command` `remote.Command` (docker-exec over SSH). + Internal ports (PG 5432, Vault 8200, RustFS 9000) aren't published, so init/role/bucket/admin/token + steps run inside the VM over the existing SSH path. Idempotent, readiness-gated, **secrets on stdin** + (the command provider echoes the command on error → never inline; `environment` needs sshd AcceptEnv + which the VM rejects). This is the cross-cutting mechanism for T03/T05/T06/T09/T10. +- **T03 postgres** — `foundation-postgres` (pg17), forgejo role+DB via remote.Command. `lib/remote.ts` + (`vmConnection`) + `credentials.ts` generator half (CONTRACT_002). +- **T04 rustfs** — `foundation-rustfs` + 4 buckets + scoped service account (mc `svcacct add` works on + RustFS; `mc ready` doesn't → gate on `mc ls`; mc busybox lacks grep → shell `case`). `IMAGE_MC` pinned. +- **T05 vault** — `foundation-vault` raft (`/vault/file`, IPC_LOCK). Init/unseal over docker-exec; keys + emitted on stdout (secret, `logging:Stderr` so never streamed) → `run.sh` captures to + `vaultCredentials:*`. `vault-unseal.sh` = passphrase-gated reboot helper (ADR-004). 
run.sh also pins + the backend per-process (`PULUMI_BACKEND_URL`, no global `pulumi login`). +- **T06 credentials** — `writeCredentialsToVault` writes postgres+rustfs+forgejo service-credentials to + the `foundation` kv-v2 mount via `vault kv put - ` (JSON on stdin). GATE A = `dependsOn vault.init`. +- **T07 caddy** — `foundation-caddy` public ingress (80/443), **DNS-01 TLS via Cloudflare** on a custom + xcaddy image (`containers/caddy-cloudflare/Dockerfile`, `caddy-dns/cloudflare@v0.2.4`, built on the VM, + image-id is the container image). Routes forge→Forgejo, s3→RustFS. Vault NOT proxied publicly. +- **T08 forgejo** — `foundation-forgejo` (fj11): external PG, RustFS blobs (default storage + LFS), + config via `FORGEJO__` env. The image's openssh sshd owns container :22 (`START_SSH_SERVER=false` + explicitly — a stale app.ini value crash-loops it on :22). HTTP 3000 via Caddy (200). +- **T09 forge bootstrap** — headless admin + org `olsitec` + auto-init repo `olsitec/foundation` + + operator SSH key, all via docker-exec (`forgejo admin` CLI + the image's curl). **Opened firewall :22** + (provision stack) so the scp-form clone works (VM admin sshd is on :222). +- **T10 runner** — `foundation-runner` (forgejo/runner:6). Idempotent register (token via + `generate-runner-token`, never leaves the VM); daemon runs uid 1000 + host docker group (gid 996) for + socket access. A hello-world `runs-on: docker` workflow ran to **success**. +- **T12 backup** — `backup/{backup,restore}.sh` + `*-remote.sh`. Bundle (pg_dumpall, forgejo repos + tar.zst, vault raft snap, pulumi state, rustfs blobs, MANIFEST.json) → RustFS + **offsite** Synology + bucket. `restore.sh` = non-destructive scratch-restore verifier — **PASSES from both rfs and offsite**. + +## Current state +- Repo `~/work/olsitec-foundation/foundation`, branch `master`, latest commit = T12. Working tree clean. +- `cd bootstrap && ./run.sh up` is **idempotent — 41 unchanged**. 
Live containers: postgres, rustfs, + vault, caddy, forgejo, runner (all healthy/up). +- Master passphrase: `pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE`. VM key + `~/.ssh/foundation-test_ed25519` (also the registered Forgejo operator key). +- Verified: `https://forge.olsitec.net` = 200 (LE cert), `git clone git@git.olsitec.net:olsitec/foundation.git` + (:22) **and** `ssh://…:2222/…` both clone; Vault paths populated; CI green; backup restorable offsite. + +## Known gaps / next steps +- **age at-rest encryption** of backups (CONTRACT_004 §4.3) not yet applied — both backup destinations + are private/access-controlled; generate the age key + encrypt-before-upload is the next hardening. +- **Determinism**: a `pulumi up --refresh` surfaces a spurious `foundation-net` `ipamConfigs` diff — do + NOT apply it (recreating the network disconnects everything); plain `up` ignores it. Investigate before + enabling refresh in CI. +- **Forgejo crypto secrets** (SECRET_KEY/INTERNAL_TOKEN/JWT) auto-generate in app.ini but aren't mirrored + to Vault (`foundation/forgejo/service-credentials` has only admin user/pw). Capture them later. +- Runner is co-located + root-equivalent (host docker socket) — fence to a separate VM for untrusted CI + (PLAN-002 R5). The docker gid (996) is host-specific — re-check on DR. +- Remaining PLAN-002 tasks: **T11** handover (push repo→Forgejo, switch origin), **T13** DR-to-fresh-VM, + **T14** `.forgejo/workflows/`, **T15** index orchestration polish + DAY-ZERO checklist. + +## Operating mode for next session: HIGH-RISK / INFRA (remote VM, Docker, secrets). 
From aabb50fb3b8541cd9edf327c711ceb3dff254f27 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 22:51:31 +0200 Subject: [PATCH 05/10] =?UTF-8?q?docs(session):=20HANDOVER=20=E2=80=94=20n?= =?UTF-8?q?ext-session=20prompt=20(Wave=202=20done,=20T11/T13/T14/T15=20+?= =?UTF-8?q?=20gaps=20next)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Self-contained prompt for a fresh Lead Agent context: required reads (incl. ADR-007), current live state, operating essentials (run.sh / vault-unseal / backup), HIGH-RISK watchouts (the refresh ipam diff), and the remaining PLAN-002 task order. Co-Authored-By: Claude Opus 4.8 (1M context) --- documentation/sessions/HANDOVER.md | 62 ++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 documentation/sessions/HANDOVER.md diff --git a/documentation/sessions/HANDOVER.md b/documentation/sessions/HANDOVER.md new file mode 100644 index 0000000..0638866 --- /dev/null +++ b/documentation/sessions/HANDOVER.md @@ -0,0 +1,62 @@ +# HANDOVER — next-session prompt (paste into a fresh context) + +> Living doc: overwritten each handover. The durable record is the dated +> `SESSION_*` files. Latest state = `SESSION_2026-06-30_002.md`. + +--- + +Continue the **olsitec-foundation** build. You are the **Lead Agent, HIGH-RISK / INFRA mode**. + +## Required reads (in `~/work/olsitec-foundation/foundation/`) +1. `documentation/sessions/SESSION_2026-06-30_002.md` ← current state + known gaps + next steps +2. `documentation/000_baseline.md` + `000_TOPOLOGY.md` +3. `documentation/contracts/CONTRACT_001–004` + `decisions/ADR_004,005,006,007` + (**ADR-007** is the control-plane mechanism the whole egg runs on — read it first) +4. `documentation/planning/PLAN-002-foundation-implementation.md` §10 + +## Where things stand +**The egg is LIVE and the goal is met.** Wave 2 (T03–T10, T12) is deployed to the Helsinki VM and +committed. 
`git clone git@git.olsitec.net:olsitec/foundation.git` works (:22 and :2222). Six containers +on `foundation-net`: postgres, rustfs, vault, caddy, forgejo, runner — all healthy. `https://forge.olsitec.net` += 200 (LE DNS-01). CI green. Backups → RustFS + offsite, restore-verified from both. `cd bootstrap && +./run.sh up` is idempotent (**41 unchanged**). Working tree clean on `master`. + +## Operating essentials +- **VM**: `204.168.234.72`, admin SSH **:222**, key `~/.ssh/foundation-test_ed25519` (also the registered + Forgejo operator key). Git endpoint is :22 (scp-form) + :2222. +- **Deploy**: `cd bootstrap && ./run.sh up` (sets passphrase + key + per-process backend; captures Vault + keys to config after `up`). Master passphrase: `pass olsitec-foundation/PULUMI_CONFIG_PASSPHRASE`. +- **Vault reboot**: `bootstrap/vault-unseal.sh`. **Backup**: `backup/backup.sh [ts]`; + **restore-verify**: `backup/restore.sh <ts> [rfs|off]`. +- **Mechanism (ADR-007)**: in-VM control-plane ops = `@pulumi/command` `remote.Command` (docker-exec over + SSH); idempotent, readiness-gated, **secrets on stdin** (never inline — the provider echoes the command + on error). Images are digest-pinned in `VERSIONS`. + +## Watchouts (HIGH-RISK) +- Do **NOT** `pulumi up --refresh` blindly — it surfaces a spurious `foundation-net` ipamConfigs diff; + applying it recreates the network and disconnects every container. Plain `up` ignores it. (Investigate + + fix the drift before enabling refresh in CI.) +- Never print/commit the passphrase, Vault root token, or unseal keys (D2) — only the already-encrypted + `secure: v1:…` values in `Pulumi.foundation.yaml`. +- Don't `pulumi up` against the production `olsicloud4-*` stacks. The `provision`/`offsite-backup` stacks + use the throwaway passphrase `dev-validation-throwaway` + `HCLOUD_TOKEN`/`MINIO_BACKUP_*` from `pass`. +- Commit **atomically per task** (conventional commits; group by concern; don't `git add .`). 
+ +## Next work — remaining PLAN-002 tasks + the known gaps +Pick up where the plan left off (parallelization map §10.2 Wave 5–6). Suggested order: +1. **Close the gaps from SESSION_2026-06-30_002 "Known gaps"** — they're small and de-risk the rest: + - age at-rest encryption of backups (CONTRACT_004 §4.3): generate the age key, store recipient/identity + (Vault `foundation/backup/backup-credentials` + passphrase config), encrypt artifacts before upload. + - Mirror Forgejo crypto secrets (SECRET_KEY/INTERNAL_TOKEN/JWT from app.ini) into + `foundation/forgejo/service-credentials`. + - Investigate + fix the `foundation-net` ipam refresh diff so `up --refresh` is safe. +2. **T11 handover** — push the foundation repo into Forgejo (`olsitec/foundation`) and switch origin; + mirror `ai-baseline`. (The repo already exists in Forgejo from T09 with a README — reconcile.) +3. **T13 DR** — `dr/RUNBOOK.md` + `dr/restore-to-fresh-vm.sh`; rehearse a full rebuild on a clean VM from + the offsite bundle (the destructive sibling of `backup/restore.sh`, restore order Vault→PG→RustFS→Forgejo). +4. **T14 CI** — `.forgejo/workflows/` (preflight, pulumi preview/up, backup-verify weekly). +5. **T15** — `index.ts` orchestration polish + Gate A/B comments + `docs/DAY-ZERO-TIMELINE.md` checklist. +6. **Then hardening**: pin remaining floating refs, fence the runner to a separate privileged VM (R5), + register the project in Olsitec MCP (D6 / PLAN-002 §8), and the Stage-2 publish of `packages/pulumi-*`. + +Validate each task live on the VM via `./run.sh up` and commit per task. 
From 92e8f978a5d106ed5c614148ebdec0fe45b978e6 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:23:38 +0200 Subject: [PATCH 06/10] =?UTF-8?q?feat(backup):=20age=20at-rest=20encryptio?= =?UTF-8?q?n=20of=20bundles=20(CONTRACT=5F004=20=C2=A74.3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the known gap: backup bundles were uploaded unencrypted, relying solely on destination access control. Now every data artifact is age-encrypted on the VM before upload and decrypted on restore. - backup-remote.sh: assemble rustfs blobs into rustfs-blobs.tar.zst (so the whole bundle is one encrypted unit), then age -r each artifact to .age and drop the plaintext. MANIFEST.json stays cleartext — it is the inventory + integrity gate and carries no secrets; it records each artifact's PLAINTEXT sha256 so restore verifies after decrypt. - restore-remote.sh: materialise the age identity to a 0600 file, decrypt each .age, then run the existing sha + scratch-restore asserts; add a rustfs-blobs extract+assert. - backup.sh / restore.sh: pass the public recipient (arg) / secret identity (stdin, never argv) from passphrase-encrypted config. - provision/index.ts: install age + zstd on the VM via cloud-init so a fresh DR VM (T13) has the backup tools from first boot. - Pulumi.foundation.yaml: seed backup.ageRecipient (public) + backup.ageIdentity (secure:). The identity lives in config so {repo + passphrase} can decrypt a bundle even after total Vault loss (CONTRACT_004 §4.3). Validated live: encrypted backup + restore-verify PASS from both RustFS and offsite; bucket shows only *.age + cleartext MANIFEST.json. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- backup/backup-remote.sh | 54 ++++++++++++++++++++++---------- backup/backup.sh | 5 +-- backup/restore-remote.sh | 30 +++++++++++++++--- backup/restore.sh | 3 +- bootstrap/Pulumi.foundation.yaml | 3 ++ provision/index.ts | 9 +++++- 6 files changed, 79 insertions(+), 25 deletions(-) diff --git a/backup/backup-remote.sh b/backup/backup-remote.sh index 0ec19f4..a32ce3b 100755 --- a/backup/backup-remote.sh +++ b/backup/backup-remote.sh @@ -1,16 +1,19 @@ #!/bin/sh # backup-remote.sh — the VM-side bundle assembler (CONTRACT_004 producer half). # Shipped + run by backup/backup.sh; NOT run directly. Secrets arrive on stdin -# (never argv); non-secrets ($TS, $MC_IMAGE) are args. pulumi-state.json is already -# in $W (the operator placed it there before invoking this). +# (never argv); non-secrets ($TS, $MC_IMAGE, $AGE_RECIPIENT) are args. pulumi-state.json +# is already in $W (the operator placed it there before invoking this). # # Produces foundation-backups/<ts>/ in RustFS and replicates it to the offsite # bucket. Artifacts per CONTRACT_004 §4.2: postgres.sql.gz, forgejo-repos.tar.zst, -# vault-raft.snap, pulumi-state.json, rustfs-blobs/, MANIFEST.json. +# vault-raft.snap, pulumi-state.json, rustfs-blobs.tar.zst, MANIFEST.json. # -# NOTE: at-rest age encryption (CONTRACT_004 §4.3) is NOT yet applied — both -# destinations are private (RustFS internal, offsite scoped creds). Adding age is -# the next hardening (generate the key, encrypt each artifact before `mc cp`). +# At-rest encryption (CONTRACT_004 §4.3): every DATA artifact is age-encrypted to +# $AGE_RECIPIENT before upload (`<name>` -> `<name>.age`); only MANIFEST.json travels +# in cleartext (it carries no secrets — it is the inventory + integrity gate, and +# lists each artifact's PLAINTEXT sha256 so restore verifies after decryption).
The +# matching identity is in Vault + passphrase-encrypted config (CONTRACT_004 §4.3), +# so {repo + passphrase} can always decrypt even after total Vault loss. set -eu IFS= read -r VAULT_TOKEN IFS= read -r OFF_EP @@ -19,6 +22,7 @@ IFS= read -r OFF_SK IFS= read -r BUCKET TS="$1" MC_IMAGE="$2" +AGE_RECIPIENT="$3" OFFSITE_BUCKET=olsitec-foundation W="/tmp/foundation-backup-$TS" mkdir -p "$W" @@ -34,20 +38,41 @@ echo "[backup] vault raft snapshot" >&2 docker exec -e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN="$VAULT_TOKEN" foundation-vault \ sh -c 'vault operator raft snapshot save /tmp/v.snap >/dev/null 2>&1 && cat /tmp/v.snap && rm -f /tmp/v.snap' > "$W/vault-raft.snap" +# RustFS root creds from the running container (VM-trusted). +RAK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_ACCESS_KEY=//p') +RSK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_SECRET_KEY=//p') + +echo "[backup] rustfs blobs -> rustfs-blobs.tar.zst" >&2 +# Pull the blob buckets onto the VM fs so the bundle is a single encrypted unit +# (CONTRACT_004 §4.3 "whole bundle"). Tiny at Layer 0; may be made incremental later. +mkdir -p "$W/blobs/forgejo-packages" "$W/blobs/forgejo-artifacts" "$W/blobs/forgejo-lfs" +docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ + -e RAK="$RAK" -e RSK="$RSK" "$MC_IMAGE" -c ' + set -e + mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null + for b in forgejo-packages forgejo-artifacts forgejo-lfs; do + mc mirror --overwrite --quiet "rfs/$b" "/w/blobs/$b" >/dev/null 2>&1 || true + done' +tar -C "$W/blobs" -cf - . 
| zstd -q -T0 > "$W/rustfs-blobs.tar.zst" +rm -rf "$W/blobs" + echo "[backup] MANIFEST.json" >&2 ( cd "$W" - jq -n --arg ts "$TS" \ - --argjson files "$(for f in postgres.sql.gz forgejo-repos.tar.zst vault-raft.snap pulumi-state.json; do + jq -n --arg ts "$TS" --arg rcpt "$AGE_RECIPIENT" \ + --argjson files "$(for f in postgres.sql.gz forgejo-repos.tar.zst vault-raft.snap pulumi-state.json rustfs-blobs.tar.zst; do [ -f "$f" ] || continue jq -n --arg n "$f" --arg sha "$(sha256sum "$f" | cut -d' ' -f1)" --argjson sz "$(stat -c %s "$f")" \ '{name:$n, sha256:$sha, size:$sz}' done | jq -s '.')" \ - '{timestamp:$ts, restoreOrder:["vault","postgres","rustfs","forgejo"], artifacts:$files}' > MANIFEST.json + '{timestamp:$ts, encryption:"age", ageRecipient:$rcpt, restoreOrder:["vault","postgres","rustfs","forgejo"], artifacts:$files}' > MANIFEST.json ) -# RustFS root creds from the running container (VM-trusted). -RAK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_ACCESS_KEY=//p') -RSK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_SECRET_KEY=//p') +echo "[backup] age-encrypt artifacts (-> *.age)" >&2 +for f in postgres.sql.gz forgejo-repos.tar.zst vault-raft.snap pulumi-state.json rustfs-blobs.tar.zst; do + [ -f "$W/$f" ] || continue + age -r "$AGE_RECIPIENT" -o "$W/$f.age" "$W/$f" + rm -f "$W/$f" +done echo "[backup] upload to RustFS $BUCKET/$TS + replicate offsite" >&2 docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ @@ -58,10 +83,7 @@ docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ mc alias set rfs http://foundation-rustfs:9000 "$RAK" "$RSK" >/dev/null mc alias set off "$OFF_EP" "$OFF_AK" "$OFF_SK" >/dev/null mc cp -r /w/ "rfs/$BUCKET/$TS/" >/dev/null - for b in forgejo-packages forgejo-artifacts forgejo-lfs; do - mc mirror --overwrite --quiet "rfs/$b" "rfs/$BUCKET/$TS/rustfs-blobs/$b" >/dev/null 2>&1 || true 
- done mc mirror --overwrite --quiet "rfs/$BUCKET/$TS" "off/$OFFB/$TS" >/dev/null ' rm -rf "$W" -echo "[backup] complete: rfs/$BUCKET/$TS (+ offsite $OFFSITE_BUCKET/$TS)" >&2 +echo "[backup] complete: rfs/$BUCKET/$TS (+ offsite $OFFSITE_BUCKET/$TS), age-encrypted" >&2 diff --git a/backup/backup.sh b/backup/backup.sh index c2190fb..6a99317 100755 --- a/backup/backup.sh +++ b/backup/backup.sh @@ -24,6 +24,7 @@ OFF_EP=$(pulumi config get foundation:backup.offsiteEndpoint) OFF_AK=$(pulumi config get foundation:backup.offsiteAccessKey) OFF_SK=$(pulumi config get foundation:backup.offsiteSecretKey) BUCKET=$(pulumi config get foundation:backup.bucket) +AGE_RECIPIENT=$(pulumi config get foundation:backup.ageRecipient) # public; CONTRACT_004 §4.3 HOST=$(pulumi config get foundation:vm.host) PORT=$(pulumi config get foundation:vm.sshPort) SUSER=$(pulumi config get foundation:vm.user) @@ -34,7 +35,7 @@ echo "backup: $TS -> rfs/$BUCKET/$TS (+ offsite)" # Pulumi state + the assembler script onto the VM. pulumi stack export | $SSHX "mkdir -p $W && cat > $W/pulumi-state.json" $SSHX "cat > /tmp/backup-remote-$TS.sh" < "$ROOT/backup/backup-remote.sh" -# Run the assembler: secrets on stdin (never argv), TS + MC_IMAGE as args. +# Run the assembler: secrets on stdin (never argv); TS, MC_IMAGE, age recipient as args. printf '%s\n%s\n%s\n%s\n%s\n' "$RT" "$OFF_EP" "$OFF_AK" "$OFF_SK" "$BUCKET" \ - | $SSHX "sh /tmp/backup-remote-$TS.sh '$TS' '$MC_IMAGE'; rm -f /tmp/backup-remote-$TS.sh" + | $SSHX "sh /tmp/backup-remote-$TS.sh '$TS' '$MC_IMAGE' '$AGE_RECIPIENT'; rm -f /tmp/backup-remote-$TS.sh" echo "backup: done ($TS)" diff --git a/backup/restore-remote.sh b/backup/restore-remote.sh index 12a10b0..68b507c 100755 --- a/backup/restore-remote.sh +++ b/backup/restore-remote.sh @@ -6,17 +6,24 @@ # disaster restore (overwriting live, restore order Vault->Postgres->RustFS->Forgejo) # is dr/restore-to-fresh-vm.sh (T13), out of scope here. 
# -# Secrets on stdin; non-secrets ($TS, $MC_IMAGE, $PG_IMAGE, $SRC) as args. +# Secrets on stdin (OFF_* offsite creds + the age IDENTITY); non-secrets ($TS, +# $MC_IMAGE, $PG_IMAGE, $SRC) as args. The bundle is age-encrypted (CONTRACT_004 +# §4.3): every artifact is pulled as <name>.age and decrypted with the identity +# BEFORE its MANIFEST sha256 (a PLAINTEXT sha) is verified. set -eu IFS= read -r OFF_EP IFS= read -r OFF_AK IFS= read -r OFF_SK IFS= read -r BUCKET +IFS= read -r AGE_IDENTITY TS="$1"; MC_IMAGE="$2"; PG_IMAGE="$3"; SRC="${4:-rfs}" OFFSITE_BUCKET=olsitec-foundation W="/tmp/foundation-restore-$TS" -rm -rf "$W"; mkdir -p "$W" -fail() { echo "RESTORE VERIFY FAIL: $1" >&2; docker rm -f foundation-restore-pg >/dev/null 2>&1 || true; exit 1; } +rm -rf "$W"; (umask 077; mkdir -p "$W") +fail() { echo "RESTORE VERIFY FAIL: $1" >&2; docker rm -f foundation-restore-pg >/dev/null 2>&1 || true; rm -f "$W/age.key" 2>/dev/null || true; exit 1; } + +# Materialise the age identity to a 0600 file for `age -d -i` (removed on exit).
+( umask 077; printf '%s\n' "$AGE_IDENTITY" > "$W/age.key" ) RAK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_ACCESS_KEY=//p') RSK=$(docker inspect foundation-rustfs --format '{{range .Config.Env}}{{println .}}{{end}}' | sed -n 's/^RUSTFS_SECRET_KEY=//p') @@ -37,8 +44,14 @@ docker run --rm --network foundation-net --entrypoint sh -v "$W":/w \ [ -f "$W/MANIFEST.json" ] || { [ -d "$W/$TS" ] && mv "$W/$TS"/* "$W"/; } [ -f "$W/MANIFEST.json" ] || fail "MANIFEST.json missing from pulled bundle" -echo "[restore] verify MANIFEST sha256" >&2 cd "$W" +echo "[restore] age-decrypt artifacts" >&2 +for name in $(jq -r '.artifacts[].name' MANIFEST.json); do + [ -f "$name.age" ] || fail "$name.age missing from bundle" + age -d -i age.key -o "$name" "$name.age" 2>/dev/null || fail "age decrypt failed: $name" +done + +echo "[restore] verify MANIFEST sha256 (plaintext)" >&2 jq -r '.artifacts[] | "\(.sha256) \(.name)"' MANIFEST.json | while read -r sha name; do [ -f "$name" ] || { echo "missing $name" >&2; exit 1; } got=$(sha256sum "$name" | cut -d' ' -f1) @@ -62,9 +75,16 @@ zstd -dc forgejo-repos.tar.zst | tar -C repos -xf - 2>/dev/null || fail "forgejo [ -d repos/git/repositories/olsitec/foundation.git ] || fail "olsitec/foundation.git not in repo bundle" echo "[restore] forgejo repos OK: olsitec/foundation.git present" >&2 +echo "[restore] extract rustfs blobs + assert packages present" >&2 +mkdir -p blobs +zstd -dc rustfs-blobs.tar.zst | tar -C blobs -xf - 2>/dev/null || fail "rustfs-blobs tar extract failed" +[ -d blobs/forgejo-packages ] || fail "forgejo-packages not in blob bundle" +echo "[restore] rustfs blobs OK: $(find blobs -type f | wc -l | tr -d ' ') object(s)" >&2 + echo "[restore] vault snapshot sanity" >&2 [ -s vault-raft.snap ] || fail "vault-raft.snap empty" echo "[restore] vault snapshot OK: $(stat -c %s vault-raft.snap) bytes" >&2 +rm -f "$W/age.key" rm -rf "$W" -echo "RESTORE VERIFY PASS ($TS from 
$SRC)" +echo "RESTORE VERIFY PASS ($TS from $SRC, age-decrypted)" diff --git a/backup/restore.sh b/backup/restore.sh index d8efe1d..6206690 100755 --- a/backup/restore.sh +++ b/backup/restore.sh @@ -23,11 +23,12 @@ OFF_EP=$(pulumi config get foundation:backup.offsiteEndpoint) OFF_AK=$(pulumi config get foundation:backup.offsiteAccessKey) OFF_SK=$(pulumi config get foundation:backup.offsiteSecretKey) BUCKET=$(pulumi config get foundation:backup.bucket) +AGE_IDENTITY=$(pulumi config get foundation:backup.ageIdentity) # secret; CONTRACT_004 §4.3 HOST=$(pulumi config get foundation:vm.host) PORT=$(pulumi config get foundation:vm.sshPort) SUSER=$(pulumi config get foundation:vm.user) SSHX="ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=15 -i $KEY -p $PORT $SUSER@$HOST" $SSHX "cat > /tmp/restore-remote-$TS.sh" < "$ROOT/backup/restore-remote.sh" -printf '%s\n%s\n%s\n%s\n' "$OFF_EP" "$OFF_AK" "$OFF_SK" "$BUCKET" \ +printf '%s\n%s\n%s\n%s\n%s\n' "$OFF_EP" "$OFF_AK" "$OFF_SK" "$BUCKET" "$AGE_IDENTITY" \ | $SSHX "sh /tmp/restore-remote-$TS.sh '$TS' '$MC_IMAGE' '$PG_IMAGE' '$SRC'; rm -f /tmp/restore-remote-$TS.sh" diff --git a/bootstrap/Pulumi.foundation.yaml b/bootstrap/Pulumi.foundation.yaml index 5dba134..e1508e1 100644 --- a/bootstrap/Pulumi.foundation.yaml +++ b/bootstrap/Pulumi.foundation.yaml @@ -63,4 +63,7 @@ config: secure: v1:9YpTkFoQanMwxAQV:dJ4YmXS0aOTHPbuK1H6AJ0SAJ0CjYX0iIyLOQAUNfsOWLsSy5TXxPpGecieBWkzc4AALDkJNlQN9Xo6Q0ZcaSg== vaultCredentials:rootToken: secure: v1:OUpYMjnaftxMUKjv:2m+dydQopXGRleeX6ddhYSHgHP7HHZXYLAvQHXUvaA91qajoxU+VugDB/Rs= + foundation:backup.ageRecipient: age1x6dmgtt2eahpvyzkmy6j80rts28chw2lcam0rcxq3nhc8ld649sslzpsy4 + foundation:backup.ageIdentity: + secure: v1:VCFVXswrmMrXyFbr:p4pfG/Kp2lreetYX4O86rZqpU1xQugRycF+PBBiNGZnaD0c15R+mJuLNrl0rBXY5vJwyZTbNSpFY1zPQ7TwuQcVp9h8oiGcgVEobsbb4BBp3lFhsObllgYM9 encryptionsalt: v1:5YhUt8BVfH0=:v1:DPCHl+7zwn4RaMPj:A19tZzBlZ1NmDtTWrHreEKk5e8idyw== diff --git 
a/provision/index.ts b/provision/index.ts index 8bc5b53..d3eac95 100644 --- a/provision/index.ts +++ b/provision/index.ts @@ -44,6 +44,13 @@ const dockerInstall = [ "touch /root/.provision-done", ]; +// Host tools the backup/restore path runs directly on the VM (not in a container): +// `zstd` (forgejo-repos + rustfs-blobs compression) and `age` (CONTRACT_004 §4.3 +// at-rest encryption). `jq` is already in the cloud-init base packages. Declaring +// them here means a fresh DR VM (T13) has them from first boot — do not rely on +// them being present incidentally. +const backupTools = ["zstd", "age"]; + const dep = new HetznerDeployment(platformName, { platformName, hcloudToken: token, @@ -56,7 +63,7 @@ const dep = new HetznerDeployment(platformName, { type: "cx33", // 4c/8G/80GB (Helsinki); cx22 legacy, cx33 not in nbg1 image: "debian-12", labels: { purpose: "foundation-test", ephemeral: "true" }, - cloudInitConfig: { lateCommands: dockerInstall }, + cloudInitConfig: { extraPackages: backupTools, lateCommands: dockerInstall }, }, ], }); From f2ef9bc92243fff2de62b6fa5c04650e16f73706 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:23:38 +0200 Subject: [PATCH 07/10] feat(credentials): mirror backup creds + age key into Vault (CONTRACT_002) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit foundation/backup/backup-credentials was never populated in Vault. Add a writer (same ADR-007 docker-exec-over-SSH pattern, GATE A / dependsOn vault.init) that mirrors the config-seeded offsite S3 creds and the age key into Vault, completing CONTRACT_002 §2.3 for in-Vault consumers (Layer-1 ESO, the weekly backup-verify job). - config.ts: loadBackupSecrets() — single reader of the backup secret slice (offsite creds + age recipient/identity), keeping components off raw Config. - credentials.ts: writeBackupCredentialsToVault() — idempotent vault kv put; secret values on stdin (D2), non-secrets as shell vars. 
- index.ts: wire it beside the data-plane creds writer. Keys written: offsiteEndpoint, offsiteAccessKey, offsiteSecretKey, backupAgeRecipient, backupAgeIdentity. Validated live: +1 resource, then 42 unchanged (idempotent); vault kv get shows all five keys populated. Co-Authored-By: Claude Opus 4.8 (1M context) --- bootstrap/components/credentials.ts | 63 +++++++++++++++++++++++++++++ bootstrap/config.ts | 27 +++++++++++++ bootstrap/index.ts | 6 ++- 3 files changed, 95 insertions(+), 1 deletion(-) diff --git a/bootstrap/components/credentials.ts b/bootstrap/components/credentials.ts index e74b3f6..8cd1d9d 100644 --- a/bootstrap/components/credentials.ts +++ b/bootstrap/components/credentials.ts @@ -15,6 +15,7 @@ import * as pulumi from "@pulumi/pulumi"; import * as command from "@pulumi/command"; import { RandomPassword } from "@pulumi/random"; +import { BackupSecrets } from "../config"; import { DeployCtx } from "../lib/context"; import { vmConnection } from "../lib/remote"; import { VaultOutputs } from "./vault"; @@ -167,3 +168,65 @@ ${creds.forgejo.adminPassword} { dependsOn: [vault.init] }, ); } + +// Mirrors the config-seeded backup credentials (offsite S3 creds + the age key) +// into Vault at foundation/backup/backup-credentials (CONTRACT_002 §2.3). Unlike +// the generated data-plane creds these are seeded once into passphrase-encrypted +// config (the age IDENTITY MUST also live there so {repo + passphrase} can decrypt +// a bundle after total Vault loss — CONTRACT_004 §4.3); this writer makes them +// available to in-Vault consumers (Layer-1 ESO, the backup-verify job). Secret +// values on stdin (ADR-007 D2); non-secrets (endpoint, recipient) as shell vars. +const WRITE_BACKUP_CREDS = `set -eu +IFS= read -r ROOT_TOKEN +IFS= read -r OFF_AK +IFS= read -r OFF_SK +IFS= read -r AGE_IDENTITY +C=foundation-vault +VE="-e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN=$ROOT_TOKEN" + +if ! 
docker exec $VE "$C" vault secrets list -format=json 2>/dev/null | jq -e 'has("foundation/")' >/dev/null; then + docker exec $VE "$C" vault secrets enable -path=foundation kv-v2 >/dev/null +fi + +jq -n --arg ep "$OFF_EP" --arg ak "$OFF_AK" --arg sk "$OFF_SK" --arg ar "$AGE_RECIPIENT" --arg ai "$AGE_IDENTITY" \ + '{offsiteEndpoint:$ep,offsiteAccessKey:$ak,offsiteSecretKey:$sk,backupAgeRecipient:$ar,backupAgeIdentity:$ai}' \ + | docker exec -i $VE "$C" vault kv put foundation/backup/backup-credentials - >/dev/null + +echo "vault: wrote backup/backup-credentials (offsite + age key)"`; + +/** + * Mirror the backup credentials (incl. the age key) into Vault (CONTRACT_002 §2.3). + * Depends on Vault being unsealed (GATE A) via vault.init — same pattern as the + * data-plane creds writer above. + */ +export function writeBackupCredentialsToVault( + ctx: DeployCtx, + vault: VaultOutputs, + backup: BackupSecrets, +): command.remote.Command { + const create = pulumi.interpolate`OFF_EP='${backup.offsiteEndpoint}' +AGE_RECIPIENT='${backup.ageRecipient}' +${WRITE_BACKUP_CREDS}`; + + return new command.remote.Command( + "foundation-backup-credentials", + { + connection: vmConnection(ctx), + create, + update: create, + stdin: pulumi.interpolate`${vault.rootToken} +${backup.offsiteAccessKey} +${backup.offsiteSecretKey} +${backup.ageIdentity} +`, + addPreviousOutputInEnv: false, + triggers: [ + vault.init.id, + backup.offsiteAccessKey, + backup.offsiteSecretKey, + backup.ageIdentity, + ], + }, + { dependsOn: [vault.init] }, + ); +} diff --git a/bootstrap/config.ts b/bootstrap/config.ts index 762c19d..d737a00 100644 --- a/bootstrap/config.ts +++ b/bootstrap/config.ts @@ -88,6 +88,33 @@ export interface FoundationConfig { }; } +/** + * The backup secret slice (CONTRACT_001 §1.3 + CONTRACT_002 `foundation/backup/ + * backup-credentials`). 
Kept OUT of the non-secret FoundationConfig surface: these + * are `secure:`-encrypted (offsite creds, age identity) or a public key that only + * the Vault mirror + backup scripts consume. `loadBackupSecrets()` is the single + * reader (composition point), so components still never touch raw pulumi.Config. + */ +export interface BackupSecrets { + offsiteEndpoint: string; // non-secret (also in FoundationConfig.backup) + offsiteAccessKey: pulumi.Output<string>; + offsiteSecretKey: pulumi.Output<string>; + ageRecipient: string; // age1… public key (non-secret) + ageIdentity: pulumi.Output<string>; // AGE-SECRET-KEY-… (secret; survives Vault loss via config) +} + +/** Reads the backup secret slice for the Vault mirror (CONTRACT_002 §2.3). */ +export function loadBackupSecrets(): BackupSecrets { + const c = new pulumi.Config("foundation"); + return { + offsiteEndpoint: c.require("backup.offsiteEndpoint"), + offsiteAccessKey: c.requireSecret("backup.offsiteAccessKey"), + offsiteSecretKey: c.requireSecret("backup.offsiteSecretKey"), + ageRecipient: c.require("backup.ageRecipient"), + ageIdentity: c.requireSecret("backup.ageIdentity"), + }; +} + /** * The SSH private key path is supplied by ENV (CONTRACT_001 §1: `SSH_PRIVATE_KEY_PATH`, * default ~/.ssh/id_rsa) — NEVER by Pulumi config. Exposed separately from diff --git a/bootstrap/index.ts b/bootstrap/index.ts index 1c1ed20..6d925fb 100644 --- a/bootstrap/index.ts +++ b/bootstrap/index.ts @@ -6,13 +6,14 @@ // fill the marked slots — this file is the single composition point; components stay // pure factories in components/*.
import * as pulumi from "@pulumi/pulumi"; -import { loadConfig } from "./config"; +import { loadConfig, loadBackupSecrets } from "./config"; import { buildBaseContext, DeployCtx } from "./lib/context"; import { deployNetwork } from "./components/network"; import { deployDns } from "./components/dns"; import { generateCredentials, writeCredentialsToVault, + writeBackupCredentialsToVault, } from "./components/credentials"; import { deployPostgres } from "./components/postgres"; import { deployRustfs } from "./components/rustfs"; @@ -47,6 +48,8 @@ const vault = deployVault(ctx); // --- GATE A: Vault init + unseal (T05). T06 writes the generated data-plane creds // into Vault (CONTRACT_002), dependsOn vault.init so it runs only once unsealed. const vaultCreds = writeCredentialsToVault(ctx, credentials, vault); +// Mirror the config-seeded backup creds + age key into Vault (CONTRACT_002 §2.3). +const backupCreds = writeBackupCredentialsToVault(ctx, vault, loadBackupSecrets()); // ============================================================================= // PHASE 6 — FORGE (depends on: credentials, GATE A) // T07 caddy ✓ · T08 forgejo · T10 runner @@ -77,6 +80,7 @@ const runner = cfg.features.runner ? deployRunner(ctx, forgejo) : undefined; // Stack outputs (extended as phases land). // vaultCreds (T06) is a gate for Forgejo (T08) — it has no output to export yet. 
void vaultCreds; +void backupCreds; // CONTRACT_002 backup/backup-credentials mirror; no secret output export const phase = "T10-runner"; // forge + CI runner live export const caddyImageId = proxy.imageId; From fbd1ad4d1d42598066ef997383de5334a6c4c113 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:28:17 +0200 Subject: [PATCH 08/10] feat(credentials): mirror Forgejo crypto secrets into Vault (CONTRACT_002) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the known gap: foundation/forgejo/service-credentials held only the admin user/pw; the crypto secrets Forgejo auto-generates into app.ini were never captured. Make that path single-owned at GATE B and write admin + crypto together. - credentials.ts: drop the forgejo block from the GATE-A writer (its crypto secrets don't exist until Forgejo first-starts) and add writeForgejoCredentialsToVault — runs after forgejo.ready, reads SECRET_KEY, INTERNAL_TOKEN, LFS_JWT_SECRET ([server]) and oauth2 JWT_SECRET straight off the live app.ini via docker-exec (ADR-007), and puts the full path. One writer per Vault path avoids a put/patch race on re-runs. - index.ts: wire it at GATE B (dependsOn vault.init + forgejo.ready). Keys: forgejoAdminUser, forgejoAdminPassword, forgejoSecretKey, forgejoInternalToken, forgejoJwtSecret, forgejoOauth2JwtSecret. Validated live: forgejo path now has all six; postgres/rustfs paths intact through the GATE-A writer replacement; idempotent at 43 unchanged. FINDING: forgejoSecretKey mirrors EMPTY — skipping the web installer (INSTALL_LOCK) left Forgejo's [security] SECRET_KEY unset. Fixed next commit. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- bootstrap/components/credentials.ts | 76 +++++++++++++++++++++++++---- bootstrap/index.ts | 10 ++++ 2 files changed, 76 insertions(+), 10 deletions(-) diff --git a/bootstrap/components/credentials.ts b/bootstrap/components/credentials.ts index 8cd1d9d..3818e54 100644 --- a/bootstrap/components/credentials.ts +++ b/bootstrap/components/credentials.ts @@ -99,7 +99,6 @@ IFS= read -r PG_FORGEJO_PW IFS= read -r RUSTFS_ADMIN_PW IFS= read -r RUSTFS_SVC_ID IFS= read -r RUSTFS_SVC_SECRET -IFS= read -r FORGEJO_ADMIN_PW C=foundation-vault VE="-e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN=$ROOT_TOKEN" @@ -117,11 +116,12 @@ jq -n --arg u "$RUSTFS_ADMIN_USER" --arg p "$RUSTFS_ADMIN_PW" --arg ki "$RUSTFS_ '{rustfsAdminUser:$u,rustfsAdminPassword:$p,rustfsServiceKeyId:$ki,rustfsServiceKeySecret:$ks}' \ | put rustfs/service-credentials -jq -n --arg u "$FORGEJO_ADMIN_USER" --arg p "$FORGEJO_ADMIN_PW" \ - '{forgejoAdminUser:$u,forgejoAdminPassword:$p}' \ - | put forgejo/service-credentials - -echo "vault: wrote postgres + rustfs + forgejo service-credentials"`; +echo "vault: wrote postgres + rustfs service-credentials"`; +// NOTE: foundation/forgejo/service-credentials is NOT written here. Its crypto +// secrets (SECRET_KEY/INTERNAL_TOKEN/JWT) only exist after Forgejo first-starts +// and writes app.ini, so the whole forgejo path (admin + crypto) is single-owned +// by writeForgejoCredentialsToVault at GATE B — keeping one writer per Vault path +// avoids a put/patch race on re-runs (CONTRACT_002 "single source of truth"). /** * T06 — distribute the generated data-plane credentials into Vault (CONTRACT_002). 
@@ -132,12 +132,11 @@ export function writeCredentialsToVault( creds: FoundationCredentials, vault: VaultOutputs, ): command.remote.Command { - // Non-secret usernames are prepended as shell vars; the 6 secret values (root + // Non-secret usernames are prepended as shell vars; the 5 secret values (root // token first, then the order the script `read`s them) arrive on stdin. const create = pulumi.interpolate`PG_SUPER_USER='${creds.postgres.superUser}' PG_FORGEJO_USER='${creds.postgres.forgejoDbUser}' RUSTFS_ADMIN_USER='${creds.rustfs.adminUser}' -FORGEJO_ADMIN_USER='${creds.forgejo.adminUser}' ${WRITE_CREDS}`; return new command.remote.Command( @@ -152,7 +151,6 @@ ${creds.postgres.forgejoDbPassword} ${creds.rustfs.adminPassword} ${creds.rustfs.serviceKeyId} ${creds.rustfs.serviceKeySecret} -${creds.forgejo.adminPassword} `, addPreviousOutputInEnv: false, triggers: [ @@ -162,7 +160,6 @@ ${creds.forgejo.adminPassword} creds.rustfs.adminPassword, creds.rustfs.serviceKeyId, creds.rustfs.serviceKeySecret, - creds.forgejo.adminPassword, ], }, { dependsOn: [vault.init] }, @@ -230,3 +227,62 @@ ${backup.ageIdentity} { dependsOn: [vault.init] }, ); } + +// Single owner of foundation/forgejo/service-credentials (CONTRACT_002 §2.3). The +// admin user/pw are generated (@pulumi/random); the crypto secrets (SECRET_KEY, +// INTERNAL_TOKEN, the LFS + OAuth2 JWT secrets) are auto-generated by Forgejo into +// app.ini on first start — they only exist post-boot, so the whole path is written +// here at GATE B (dependsOn forgejo.ready), read straight off the live app.ini via +// docker-exec (ADR-007). Admin pw on stdin (D2); crypto values are read on the VM +// and never transit the Pulumi command string. Idempotent put. +// +// NOTE: SECRET_KEY can be EMPTY when the bootstrap skips the web installer +// (INSTALL_LOCK) — it is mirrored as-is (faithful), and flagged for hardening. 
+const WRITE_FORGEJO_CREDS = `set -eu +IFS= read -r ROOT_TOKEN +IFS= read -r ADMIN_PW +C=foundation-vault +F=foundation-forgejo +VE="-e VAULT_ADDR=http://127.0.0.1:8200 -e VAULT_TOKEN=$ROOT_TOKEN" +gv() { docker exec "$F" sh -c "sed -n 's/^$1 *= *//p' /data/gitea/conf/app.ini" | head -1; } +SECRET_KEY=$(gv SECRET_KEY) +INTERNAL_TOKEN=$(gv INTERNAL_TOKEN) +LFS_JWT=$(gv LFS_JWT_SECRET) +OAUTH2_JWT=$(gv JWT_SECRET) +jq -n --arg au "$ADMIN_USER" --arg ap "$ADMIN_PW" \ + --arg sk "$SECRET_KEY" --arg it "$INTERNAL_TOKEN" --arg jt "$LFS_JWT" --arg oj "$OAUTH2_JWT" \ + '{forgejoAdminUser:$au,forgejoAdminPassword:$ap,forgejoSecretKey:$sk,forgejoInternalToken:$it,forgejoJwtSecret:$jt,forgejoOauth2JwtSecret:$oj}' \ + | docker exec -i $VE "$C" vault kv put foundation/forgejo/service-credentials - >/dev/null +echo "vault: wrote forgejo/service-credentials (admin + crypto secrets)"`; + +/** + * Mirror the Forgejo admin + crypto secrets into Vault (CONTRACT_002 §2.3). + * Runs at GATE B: needs Vault unsealed (vault.init) AND Forgejo healthy + * (forgejoReady) so app.ini exists to read the crypto secrets from. + */ +export function writeForgejoCredentialsToVault( + ctx: DeployCtx, + vault: VaultOutputs, + forgejoCreds: ForgejoCredentials, + forgejoReady: command.remote.Command, +): command.remote.Command { + const create = pulumi.interpolate`ADMIN_USER='${forgejoCreds.adminUser}' +${WRITE_FORGEJO_CREDS}`; + + return new command.remote.Command( + "foundation-forgejo-credentials", + { + connection: vmConnection(ctx), + create, + update: create, + stdin: pulumi.interpolate`${vault.rootToken} +${forgejoCreds.adminPassword} +`, + addPreviousOutputInEnv: false, + // forgejoReady.id changes when the container is (re)created → app.ini (hence + // the crypto secrets) regenerated → re-mirror. 
+ triggers: [vault.init.id, forgejoReady.id, forgejoCreds.adminPassword], + }, + { dependsOn: [vault.init, forgejoReady] }, + ); +} diff --git a/bootstrap/index.ts b/bootstrap/index.ts index 6d925fb..72d7039 100644 --- a/bootstrap/index.ts +++ b/bootstrap/index.ts @@ -14,6 +14,7 @@ import { generateCredentials, writeCredentialsToVault, writeBackupCredentialsToVault, + writeForgejoCredentialsToVault, } from "./components/credentials"; import { deployPostgres } from "./components/postgres"; import { deployRustfs } from "./components/rustfs"; @@ -75,12 +76,21 @@ const forgejoBootstrap = bootstrapForgejo(ctx, { sshPublicKey, }); const runner = cfg.features.runner ? deployRunner(ctx, forgejo) : undefined; +// Mirror Forgejo's admin + app.ini crypto secrets into Vault (CONTRACT_002 §2.3); +// GATE B — needs app.ini, which exists only once Forgejo has started. +const forgejoCreds = writeForgejoCredentialsToVault( + ctx, + vault, + credentials.forgejo, + forgejo.ready, +); // ============================================================================= // Stack outputs (extended as phases land). // vaultCreds (T06) is a gate for Forgejo (T08) — it has no output to export yet. void vaultCreds; void backupCreds; // CONTRACT_002 backup/backup-credentials mirror; no secret output +void forgejoCreds; // CONTRACT_002 forgejo/service-credentials mirror; no secret output export const phase = "T10-runner"; // forge + CI runner live export const caddyImageId = proxy.imageId; From 522c5d7a545cf3b1ce6f4855cb2854099f9244f9 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:30:35 +0200 Subject: [PATCH 09/10] fix(forgejo): generate + set SECRET_KEY (was empty under INSTALL_LOCK) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the crypto-secret mirror: Forgejo's [security] SECRET_KEY was EMPTY because the bootstrap skips the web installer (INSTALL_LOCK), which is what normally generates it. 
An empty SECRET_KEY weakens at-rest encryption of 2FA secrets, push-mirror/migration passwords, and OAuth app secrets. Generate it with @pulumi/random (it is a plain high-entropy string, not a format-constrained JWT — so unlike INTERNAL_TOKEN/JWT_SECRET it CAN be random-generated, matching CONTRACT_002 §2.3) and inject via FORGEJO__security__SECRET_KEY; env-to-ini overwrites it in the volume's app.ini while leaving Forgejo's own INTERNAL_TOKEN + JWT secrets untouched. The GATE-B mirror then captures the real value into Vault. Done now while the egg is fresh (no encrypted data yet) → no re-encryption. Validated live: app.ini + Vault forgejoSecretKey = 40 chars; forge healthz pass + https 200; scp-form clone works; idempotent at 44 unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- bootstrap/components/credentials.ts | 13 +++++++++---- bootstrap/components/forgejo.ts | 6 ++++++ bootstrap/index.ts | 1 + 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bootstrap/components/credentials.ts b/bootstrap/components/credentials.ts index 3818e54..dce3796 100644 --- a/bootstrap/components/credentials.ts +++ b/bootstrap/components/credentials.ts @@ -37,14 +37,18 @@ export interface RustfsCredentials { } /** - * `foundation/forgejo/service-credentials` — the admin slice (CONTRACT_002 §2.3). - * The crypto secrets (forgejoSecretKey/InternalToken/Jwt*) are auto-generated by - * Forgejo into its app.ini (format-constrained — JWTs, not free random), so they - * are not generated here; capturing them into Vault is a later refinement. + * `foundation/forgejo/service-credentials` — admin slice + SECRET_KEY (CONTRACT_002 + * §2.3). The OTHER crypto secrets (forgejoInternalToken/Jwt*) are format-constrained + * (JWTs, not free random) so Forgejo auto-generates them into app.ini and the GATE-B + * mirror captures them. 
SECRET_KEY, by contrast, is a plain high-entropy string, so + * we generate it here (@pulumi/random, per CONTRACT_002) and inject it via env — + * Forgejo leaves it EMPTY when the web installer is skipped (INSTALL_LOCK), which + * would weaken at-rest encryption of 2FA/mirror/oauth secrets. */ export interface ForgejoCredentials { adminUser: string; // cfg.forgejo.adminUser (deterministic) adminPassword: pulumi.Output<string>; + secretKey: pulumi.Output<string>; // [security] SECRET_KEY (injected via env) } /** Everything generateCredentials() produces; grows as Wave-2 tasks land. */ @@ -80,6 +84,7 @@ export function generateCredentials(ctx: DeployCtx): FoundationCredentials { forgejo: { adminUser: ctx.cfg.forgejo.adminUser, // "platform-admin" adminPassword: secret("forgejo-admin-password"), + secretKey: secret("forgejo-secret-key", 40), // [security] SECRET_KEY }, }; } diff --git a/bootstrap/components/forgejo.ts b/bootstrap/components/forgejo.ts index 9711fcb..1b9d761 100644 --- a/bootstrap/components/forgejo.ts +++ b/bootstrap/components/forgejo.ts @@ -26,6 +26,7 @@ export interface ForgejoDeps { rustfs: RustfsOutputs; pgCreds: PostgresCredentials; rustfsCreds: RustfsCredentials; + forgejoCreds: ForgejoCredentials; } export interface ForgejoOutputs { @@ -88,6 +89,11 @@ export function deployForgejo( // Go SSH server colliding on :22. SSH_PORT is the clone-URL port; the sshd is // published on host :22 (scp-form goal) + :2222 (CONTRACT_003). "FORGEJO__server__START_SSH_SERVER=false", + // [security] SECRET_KEY — Forgejo leaves this EMPTY when the installer is + // skipped (INSTALL_LOCK); set it explicitly so at-rest encryption of 2FA / + // mirror / oauth secrets is keyed. env-to-ini overwrites it in the volume's + // app.ini (INTERNAL_TOKEN + JWT secrets are left untouched — Forgejo's own).
+ pulumi.interpolate`FORGEJO__security__SECRET_KEY=${deps.forgejoCreds.secretKey}`, "FORGEJO__server__SSH_LISTEN_PORT=22", `FORGEJO__server__SSH_PORT=${cfg.forgeSshPort}`, `FORGEJO__server__SSH_DOMAIN=${cfg.hosts.git}`, diff --git a/bootstrap/index.ts b/bootstrap/index.ts index 72d7039..9978788 100644 --- a/bootstrap/index.ts +++ b/bootstrap/index.ts @@ -61,6 +61,7 @@ const forgejo = deployForgejo(ctx, { rustfs, pgCreds: credentials.postgres, rustfsCreds: credentials.rustfs, + forgejoCreds: credentials.forgejo, }); // --- GATE B: Forgejo healthy → headless admin + org + repo + operator SSH key From 82c34c9a42651696a28d6a1126fcfad7dc0a312b Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Tue, 30 Jun 2026 23:36:50 +0200 Subject: [PATCH 10/10] fix(network): ignore ipamConfigs drift so `up --refresh` can't recreate the net MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the known gap. Docker auto-assigns the subnet's first host (.1) as the bridge gateway — a field we never declared — so `pulumi up --refresh` surfaced it as a spurious foundation-net ipamConfigs drift. `gateway` is a ForceNew input, so reconciling it (whether by declaring it OR by applying the refreshed diff) REPLACES the network and disconnects every container. (Verified: adding the gateway turned a clean plan into a network + 6-container + commands replacement.) The IPAM is immutable by design (subnet fixed by CONTRACT_003), so ignore drift on it: ignoreChanges:["ipamConfigs"]. Plain `up` stays clean (44 unchanged) and `up --refresh` no longer wants to recreate the network/containers. Residual, NON-destructive: `preview --refresh` still shows pessimistic "~triggers" replaces on the vault-init + credential-writer commands, because a refreshed container.id resolves to [unknown] in the preview (a Pulumi preview artifact). At real apply the id is known + unchanged; worst case the commands re-run idempotently. Documented for CI (T14). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- bootstrap/components/network.ts | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bootstrap/components/network.ts b/bootstrap/components/network.ts index e9a7990..e99b188 100644 --- a/bootstrap/components/network.ts +++ b/bootstrap/components/network.ts @@ -16,6 +16,17 @@ export function deployNetwork(ctx: BaseCtx): docker.Network { attachable: true, ipamConfigs: [{ subnet: ctx.cfg.network.subnet }], // "172.30.0.0/24" }, - { provider: ctx.provider, deleteBeforeReplace: true }, + { + provider: ctx.provider, + deleteBeforeReplace: true, + // Docker auto-assigns the subnet's first host (.1) as the bridge gateway — + // a field we never declared, so a `pulumi up --refresh` surfaced it as a + // spurious ipamConfigs drift. `gateway` is ForceNew, so reconciling it + // (either by declaring it OR by applying the refreshed diff) would REPLACE + // the network and disconnect every container. The IPAM is immutable by + // design (subnet fixed by CONTRACT_003), so we ignore drift on it: plain + // `up` stays clean AND `up --refresh` no longer wants to recreate the net. + ignoreChanges: ["ipamConfigs"], + }, ); }