diff --git a/.gitignore b/.gitignore index 61df145..2799a7a 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,3 @@ provision/state/ offsite-backup/state/ offsite-backup/Pulumi.prod.yaml provision/Pulumi.foundation-test.yaml -runners/state/ diff --git a/bootstrap/trigger.txt b/bootstrap/trigger.txt deleted file mode 100644 index ec0aca5..0000000 --- a/bootstrap/trigger.txt +++ /dev/null @@ -1 +0,0 @@ -1782867576 diff --git a/bun.lock b/bun.lock index cf12d41..f31b3f2 100644 --- a/bun.lock +++ b/bun.lock @@ -98,21 +98,6 @@ "typescript": "^5.0.0", }, }, - "runners": { - "name": "@olsitec/foundation-runners", - "version": "0.0.0", - "dependencies": { - "@pulumi/command": "^1.1.3", - "@pulumi/libvirt": "^0.5.3", - "@pulumi/pulumi": "^3.138.0", - "js-yaml": "^4.1.0", - }, - "devDependencies": { - "@types/js-yaml": "^4.0.9", - "@types/node": "^18", - "typescript": "^5.0.0", - }, - }, }, "packages": { "@eslint-community/eslint-utils": ["@eslint-community/eslint-utils@4.9.1", "", { "dependencies": { "eslint-visitor-keys": "^3.4.3" }, "peerDependencies": { "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" } }, "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ=="], @@ -195,8 +180,6 @@ "@olsitec/foundation-provision": ["@olsitec/foundation-provision@workspace:provision"], - "@olsitec/foundation-runners": ["@olsitec/foundation-runners@workspace:runners"], - "@olsitec/pulumi-docker": ["@olsitec/pulumi-docker@workspace:packages/pulumi-docker"], "@olsitec/pulumi-hetzner": ["@olsitec/pulumi-hetzner@workspace:packages/pulumi-hetzner"], @@ -269,8 +252,6 @@ "@pulumi/hcloud": ["@pulumi/hcloud@1.39.0", "", { "dependencies": { "@pulumi/pulumi": "^3.142.0" } }, "sha512-rrjOZ1bPliOpsuoGBrd6b9GOeM+CoNSLTJrd061JzwAREdztVP6vy8UEROQj7zIUypEI0+eCqXAA1bxYIQSwkQ=="], - "@pulumi/libvirt": ["@pulumi/libvirt@0.5.4", "", { "dependencies": { "@pulumi/pulumi": "^3.142.0" } }, "sha512-iStzokbaU71cySC05IS+OX9Rx+CpfZIYeRiehqSZ60DKpd4Ou4XgZEp7GmQE8E2Cd/Ou7HA/MUnDxyKk9TmsPQ=="], - "@pulumi/minio": ["@pulumi/minio@0.16.9", "", { "dependencies": { "@pulumi/pulumi": "^3.142.0" } }, "sha512-druJ9i1edmXbzTTyHaH2W5xK2BRB4k4O02jTV6FBk1cRp8na9y5dDIrzWjDTRTEqXSRjSNruEWzltyj6Bh2aVg=="], "@pulumi/pulumi": ["@pulumi/pulumi@3.248.0", "", { "dependencies": { "@grpc/grpc-js": "^1.10.1", "@logdna/tail-file": "^2.0.6", "@npmcli/arborist": "^9.0.0", "@opentelemetry/api": "^1.9", "@opentelemetry/exporter-trace-otlp-grpc": "^0.57", "@opentelemetry/exporter-zipkin": "^1.30", "@opentelemetry/instrumentation": "^0.57", "@opentelemetry/instrumentation-grpc": "^0.57", "@opentelemetry/resources": "^1.30", "@opentelemetry/sdk-trace-base": "^1.30", "@opentelemetry/sdk-trace-node": "^1.30", "@types/google-protobuf": "^3.15.5", "@types/semver": "^7.5.6", "@types/tmp": "^0.2.6", "execa": "^5.1.0", "fdir": "^6.5.0", "google-protobuf": "^3.21.4", "ini": "^2.0.0", "js-yaml": "^4.0.0", "minimist": "^1.2.6", "normalize-package-data": "^6.0.0", "picomatch": "^4.0.0", "require-from-string": "^2.0.1", "semver": "^7.5.2", "source-map-support": "^0.5.6", "tmp": "^0.2.4", "upath": "^1.1.0" }, "peerDependencies": { "ts-node": ">= 7.0.1 < 12", "typescript": ">= 3.8.3 < 7" }, "optionalPeers": ["ts-node", "typescript"] }, "sha512-EqgeHjVIqMS8voAM7F8SOzFAMHnVXUDdKTNF1o3Lg85YwVI0j4/eIlWG0iIVAWJl3DX0KOOM6++X0wLKHWWwmQ=="], diff --git a/package.json b/package.json index d0c7ee1..cb5bf80 100644 --- a/package.json +++ b/package.json @@ -7,8 +7,7 @@ "packages/*", "bootstrap", "provision", - "offsite-backup", - "runners" + "offsite-backup" ], "devDependencies": { "typescript": "^5.0.0" diff --git a/runners/Pulumi.yaml b/runners/Pulumi.yaml deleted file mode 100644 index 58ba161..0000000 --- a/runners/Pulumi.yaml +++ /dev/null @@ -1,12 +0,0 @@ -name: foundation-runners -description: >- - Step-0-after-foundation — the fenced Forgejo Actions runner fleet on libvirt - hosts (e.g. crunchy01), as an ISOLATED stack. Decoupled from `bootstrap` on - purpose: a @pulumi/libvirt provider dials the runner host on every up/refresh, - so keeping it here (never imported by bootstrap) means foundation ops never - require — and are never blocked by — the runner host. One-way dependency: needs - the forge up to mint a runner token (i.e. runs after the foundation stands). -runtime: - name: nodejs - options: - packagemanager: bun diff --git a/runners/README.md b/runners/README.md deleted file mode 100644 index 6ef02cd..0000000 --- a/runners/README.md +++ /dev/null @@ -1,70 +0,0 @@ -# foundation-runners — the fenced Actions runner fleet (isolated stack) - -**Step-0 *after* the foundation stands.** A separate Pulumi project/stack that -provisions runner VM(s) on a libvirt host (crunchy01) and registers Forgejo Actions -runners with a distinct label (`fenced`), so ecosystem/untrusted jobs (`runs-on: -fenced`) execute **off** the forge VM — the R5 fence. - -## Why a separate stack (decoupling) - -A `@pulumi/libvirt` provider dials the runner host on **every** `up`/`refresh`/`preview` -of the stack it lives in. If the runner VM lived in `bootstrap`, then crunchy01 being -down — or you not having access to it — would break `pulumi refresh`/`up` of the -**foundation itself** (the classic Terraform coupling trap). Pulumi isolates this at -the **stack boundary**: a provider only initializes when *its own* stack runs. So the -fleet is its own project; `bootstrap` never imports it. Consequences: - -- Foundation deploy/refresh **never touches** crunchy01. -- crunchy01 down ⇒ only *this* stack's refresh is affected, and only when you run it. -- One-way dependency: this stack mints a runner token *from* the forge, so it runs - **after** the foundation is up. - -## Host prep (one-time, kept OUT of this stack) - -The libvirt provider needs something to connect to, so install libvirt on the host -out-of-band (not via this stack), and ensure a LAN bridge exists: - -```sh -sudo apt-get update -sudo apt-get install -y qemu-kvm libvirt-daemon-system libvirt-clients \ - bridge-utils dnsmasq qemu-utils virtinst cloud-image-utils -sudo systemctl enable --now libvirtd -# a LAN bridge (br0) enslaving the physical NIC must already exist (crunchy01 had it). -``` - -## Deploy - -```sh -export RUNNER_SSH_KEY_PATH=~/.ssh/foundation-test_ed25519 # reaches host + VM (root) -cd runners -pulumi stack init crunchy # isolated file backend, like bootstrap/provision -pulumi config set host.address 192.168.1.2 -pulumi config set forge.address 204.168.234.72 -pulumi up -``` - -`pulumi up` will: apply the kube-router-proof FORWARD timer on the host, create an -Ubuntu VM on `br0` (docker + qemu-guest-agent via cloud-init), mint a runner token -from the forge, and register + run the `fenced` runner in the VM. Verify with a -`runs-on: fenced` job on any repo. - -> **Cutover note.** The first fenced runner was built by hand (SESSION_2026-07-01_003). -> A `pulumi up` here creates a *fresh* declarative VM; retire the hand-built -> `foundation-runner-01` (`virsh destroy/undefine`) at cutover, or point config at a -> new `vm.name` to run both. This code is committed + typechecked; the live `up` -> cutover is the remaining validation step. - -## Gotchas baked into the code (learned the hard way) - -- **k3s host firewall.** crunchy01 is a k3s node; kube-router sets `FORWARD policy - DROP` + `br_netfilter=1`, dropping bridged VM↔LAN traffic. Fix = `iptables -I FORWARD - -m physdev --physdev-is-bridged -j ACCEPT`, re-asserted by a **60s systemd timer** - (kube-router flushes iptables on resync, so a boot-only rule isn't enough). -- **Ubuntu, not Debian genericcloud.** Debian's cloud-init wrote netplan the image - never applied → no IPv4 (static *or* DHCP). Ubuntu 24.04 renders + applies cleanly. -- **PTY console.** The domain declares a `pty` serial console so `virsh console ` - works. (Don't back serial with a file — you lose interactive console.) -- **Docker socket gid.** act_runner runs as uid 1000; the daemon container gets - `--group-add ` so it can reach `/var/run/docker.sock`. -- **IP is optional.** The runner polls the forge outbound, so a fixed LAN IP isn't - required — set `vm.ipCidr` empty for DHCP. Default is a static `.15` for predictability. diff --git a/runners/index.ts b/runners/index.ts deleted file mode 100644 index 6512290..0000000 --- a/runners/index.ts +++ /dev/null @@ -1,272 +0,0 @@ -// runners/index.ts — Step-0-after-foundation: the fenced Actions runner fleet. -// -// ISOLATED STACK, decoupled from `bootstrap` on purpose (see Pulumi.yaml). A -// @pulumi/libvirt provider dials the runner HOST (e.g. crunchy01) on every -// up/refresh/preview; keeping that here — never imported by bootstrap — means -// foundation ops never require, and are never blocked by, the runner host being -// reachable. One-way dependency only: this stack mints a runner token FROM the -// forge, so it runs after the foundation stands. -// -// It codifies what SESSION_2026-07-01_003 built by hand (see runners/README.md): -// 1. an Ubuntu VM on the host's LAN bridge, with docker + qemu-guest-agent; -// 2. a kube-router-proof FORWARD accept (the host is a k3s node whose FORWARD -// policy is DROP — bridged VM traffic needs an idempotent, re-asserted rule); -// 3. a Forgejo Actions runner registered with a distinct label ("fenced") so -// ecosystem/untrusted jobs (runs-on: fenced) run OFF the forge VM (R5). -// -// PREREQUISITE (host, one-time, kept OUT of this stack so the libvirt provider -// always has something to connect to): qemu-kvm + libvirt-daemon-system + -// libvirt-clients + virtinst + cloud-image-utils installed on the host, libvirtd -// enabled, and a LAN bridge (br0). See runners/README.md §Host prep. -import * as pulumi from "@pulumi/pulumi"; -import * as libvirt from "@pulumi/libvirt"; -import * as command from "@pulumi/command"; -import * as fs from "fs"; -import * as yaml from "js-yaml"; - -const cfg = new pulumi.Config(); - -// --- runner host (libvirt over qemu+ssh) --- -const host = { - address: cfg.get("host.address") ?? "192.168.1.2", // crunchy01 - user: cfg.get("host.user") ?? "root", - bridge: cfg.get("host.bridge") ?? "br0", // LAN bridge the VM joins - pool: cfg.get("host.pool") ?? "default", // libvirt storage pool (nvme) -}; -// SSH key reaching the host AND the created VM (root). Path via ENV, never config. -const sshKeyPath = - process.env.RUNNER_SSH_KEY_PATH ?? - `${process.env.HOME}/.ssh/foundation-test_ed25519`; -const sshPrivateKey = pulumi.secret(fs.readFileSync(sshKeyPath, "utf8")); -const sshPublicKey = fs.readFileSync(`${sshKeyPath}.pub`, "utf8").trim(); -const hostConn: command.types.input.remote.ConnectionArgs = { - host: host.address, - port: cfg.getNumber("host.sshPort") ?? 22, - user: host.user, - privateKey: sshPrivateKey, -}; - -// --- the forge (mint a runner registration token; reached over SSH via docker) --- -const forge = { - address: cfg.get("forge.address") ?? "204.168.234.72", - sshPort: cfg.getNumber("forge.sshPort") ?? 222, - user: cfg.get("forge.user") ?? "root", - instanceUrl: cfg.get("forge.instanceUrl") ?? "https://forge.olsitec.net", - container: cfg.get("forge.container") ?? "foundation-forgejo", -}; -const forgeConn: command.types.input.remote.ConnectionArgs = { - host: forge.address, - port: forge.sshPort, - user: forge.user, - privateKey: sshPrivateKey, -}; - -// --- the VM + runner shape --- -const vm = { - name: cfg.get("vm.name") ?? "foundation-runner-01", - vcpu: cfg.getNumber("vm.vcpu") ?? 4, - memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 8192, - diskGiB: cfg.getNumber("vm.diskGiB") ?? 40, - // LAN address for the VM. Empty → DHCP (the runner polls the forge outbound, so a - // fixed address is optional). Default matches the hand-built VM. - ipCidr: cfg.get("vm.ipCidr") ?? "192.168.1.15/24", - gateway: cfg.get("vm.gateway") ?? "192.168.1.251", - nameservers: (cfg.getObject("vm.nameservers")) ?? [ - "192.168.1.251", - "1.1.1.1", - ], - ubuntuImageUrl: - cfg.get("vm.ubuntuImageUrl") ?? - "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img", -}; -// The runner label(s). `fenced` routes runs-on: fenced here; the schema maps it to a -// default job image on the VM's docker. -const runnerLabels = - cfg.get("runner.labels") ?? "fenced:docker://node:20-bookworm"; - -// ============================================================================= -// libvirt provider — qemu+ssh to the host. Lazy-connects when a libvirt resource -// is created; the host prerequisite (libvirtd) must already be satisfied. -// ============================================================================= -const provider = new libvirt.Provider("runner-host", { - uri: `qemu+ssh://${host.user}@${host.address}/system?sshauth=privkey&keyfile=${sshKeyPath}&known_hosts_verify=ignore`, -}); - -// --- host prep: the kube-router-proof bridged-FORWARD accept (idempotent timer) --- -// The host is a k3s node; kube-router sets FORWARD policy DROP and re-syncs iptables, -// which drops bridged VM↔LAN traffic and can flush a hand-added rule. A 60s systemd -// timer re-asserts it. This is a control-plane op on the HOST (not a libvirt resource). -const FIREWALL = `set -eu -cat > /etc/systemd/system/libvirt-bridge-forward.service <<'U' -[Unit] -Description=Ensure bridged VM traffic passes iptables FORWARD (libvirt on a kube-router host) -After=network-online.target -[Service] -Type=oneshot -ExecStart=/bin/sh -c 'iptables -C FORWARD -m physdev --physdev-is-bridged -j ACCEPT 2>/dev/null || iptables -I FORWARD 1 -m physdev --physdev-is-bridged -j ACCEPT' -U -cat > /etc/systemd/system/libvirt-bridge-forward.timer <<'U' -[Unit] -Description=Re-assert the bridged-FORWARD accept rule (kube-router flushes iptables on resync) -[Timer] -OnBootSec=30s -OnUnitActiveSec=60s -AccuracySec=5s -[Install] -WantedBy=timers.target -U -systemctl daemon-reload -systemctl enable --now libvirt-bridge-forward.timer >/dev/null -echo "bridged-FORWARD timer active"`; -const firewall = new command.remote.Command("runner-host-firewall", { - connection: hostConn, - create: FIREWALL, - update: FIREWALL, -}); - -// ============================================================================= -// The VM: Ubuntu base volume → backed domain disk → cloud-init → domain. -// Ubuntu (not the Debian genericcloud image) because Debian's cloud-init wrote -// netplan the image never applied (no IPv4); Ubuntu renders + applies it cleanly. -// ============================================================================= -const base = new libvirt.Volume( - `${vm.name}-base`, - { - name: `${vm.name}-ubuntu-base.img`, - source: vm.ubuntuImageUrl, - pool: host.pool, - format: "qcow2", - }, - { provider }, -); -const disk = new libvirt.Volume( - `${vm.name}-disk`, - { - name: `${vm.name}.qcow2`, - pool: host.pool, - format: "qcow2", - baseVolumeId: base.id, - size: vm.diskGiB * 1024 * 1024 * 1024, - }, - { provider }, -); - -// cloud-init user-data: docker + qemu-guest-agent + our SSH key + a marker. -const userData = - "#cloud-config\n" + - yaml.dump({ - hostname: vm.name, - manage_etc_hosts: true, - ssh_pwauth: false, - users: [ - { - name: "root", - lock_passwd: false, - ssh_authorized_keys: [sshPublicKey], - }, - ], - packages: ["ca-certificates", "curl", "jq", "qemu-guest-agent"], - runcmd: [ - ["sh", "-c", "curl -fsSL https://get.docker.com | sh"], - "systemctl enable --now docker qemu-guest-agent", - "touch /root/cloud-init-done", - ], - }); -// network-config v2: static on enp1s0 if ipCidr set, else DHCP. Ubuntu applies both. -const networkConfig = yaml.dump({ - version: 2, - ethernets: { - enp1s0: vm.ipCidr - ? { - dhcp4: false, - addresses: [vm.ipCidr], - routes: [{ to: "default", via: vm.gateway }], - nameservers: { addresses: vm.nameservers }, - } - : { dhcp4: true }, - }, -}); -const cloudinit = new libvirt.CloudInitDisk( - `${vm.name}-cloudinit`, - { name: `${vm.name}-cloudinit.iso`, pool: host.pool, userData, networkConfig }, - { provider }, -); - -const domain = new libvirt.Domain( - vm.name, - { - name: vm.name, - memory: vm.memoryMiB, - vcpu: vm.vcpu, - cpu: { mode: "host-passthrough" }, - autostart: true, - qemuAgent: true, - cloudinit: cloudinit.id, - disks: [{ volumeId: disk.id }], - networkInterfaces: [{ bridge: host.bridge }], - // A real PTY console so `virsh console ` works (learned the hard way). - consoles: [ - { type: "pty", targetPort: "0", targetType: "serial" }, - { type: "pty", targetPort: "1", targetType: "virtio" }, - ], - }, - { provider, dependsOn: [firewall] }, -); - -// ============================================================================= -// Register the Forgejo runner: mint a token on the forge, then register + run -// act_runner inside the VM (docker), reachable once cloud-init has installed docker. -// ============================================================================= -// 1) token — instance-scoped registration token, minted over SSH via docker exec. -const tokenCmd = new command.remote.Command("runner-token", { - connection: forgeConn, - create: `docker exec -u git ${forge.container} forgejo actions generate-runner-token`, - // Re-mint if the forge container id or the label set changes. - triggers: [runnerLabels], -}); -const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim())); - -// 2) register + run — connect to the VM (its static/DHCP IP). The script waits for -// cloud-init (docker) to be ready, registers idempotently, and runs the daemon with -// the host docker gid so uid-1000 act_runner can reach the socket. -const vmIp = vm.ipCidr ? vm.ipCidr.split("/")[0] : host.address; // static → known IP -const REGISTER = pulumi.interpolate`set -eu -IMG=code.forgejo.org/forgejo/runner:6 -for _ in $(seq 1 60); do [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1 && break; sleep 5; done -DGID=$(stat -c %g /var/run/docker.sock) -docker volume inspect crunchy-runner-data >/dev/null 2>&1 || docker volume create crunchy-runner-data >/dev/null -docker pull -q "$IMG" >/dev/null -if docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c '[ -s /data/.runner ]'; then - echo "already registered" -else - printf '%s' '${runnerToken}' | docker run --rm -i -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat > /tmp/t' - docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \ - register --no-interactive --instance ${forge.instanceUrl} --token "$(docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat /tmp/t')" \ - --name ${vm.name} --labels '${runnerLabels}' >/dev/null - echo "registered" -fi -docker rm -f forgejo-runner >/dev/null 2>&1 || true -docker run -d --name forgejo-runner --restart unless-stopped --group-add "$DGID" \ - -v crunchy-runner-data:/data -v /var/run/docker.sock:/var/run/docker.sock \ - --entrypoint /bin/forgejo-runner "$IMG" daemon >/dev/null -echo "runner daemon up"`; -const register = new command.remote.Command( - "runner-register", - { - connection: { - host: vmIp, - port: 22, - user: "root", - privateKey: sshPrivateKey, - }, - create: REGISTER, - triggers: [domain.id, runnerToken, runnerLabels], - }, - { dependsOn: [domain, tokenCmd] }, -); - -export const runnerHost = host.address; -export const runnerVmIp = vmIp; -export const runnerLabelsOut = runnerLabels; -export const forgeInstance = forge.instanceUrl; -void register; diff --git a/runners/package.json b/runners/package.json deleted file mode 100644 index 932e892..0000000 --- a/runners/package.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "name": "@olsitec/foundation-runners", - "private": true, - "version": "0.0.0", - "main": "index.ts", - "description": "Isolated stack: the fenced Forgejo Actions runner fleet on libvirt hosts.", - "dependencies": { - "@pulumi/pulumi": "^3.138.0", - "@pulumi/libvirt": "^0.5.3", - "@pulumi/command": "^1.1.3", - "js-yaml": "^4.1.0" - }, - "devDependencies": { - "@types/node": "^18", - "@types/js-yaml": "^4.0.9", - "typescript": "^5.0.0" - } -} diff --git a/runners/tsconfig.json b/runners/tsconfig.json deleted file mode 100644 index 97a45fd..0000000 --- a/runners/tsconfig.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "compilerOptions": { - "strict": true, "outDir": "bin", "target": "es2020", "module": "commonjs", - "moduleResolution": "node", "sourceMap": true, "experimentalDecorators": true, - "esModuleInterop": true, "skipLibCheck": true, "forceConsistentCasingInFileNames": true - }, - "files": ["index.ts"] -}