From 44a96d84eb7cd6aa421148193a502b2d38aab623 Mon Sep 17 00:00:00 2001 From: Andreas Niemann Date: Wed, 1 Jul 2026 03:35:06 +0200 Subject: [PATCH] fix(runners): live-validated the crunchy stack; cutover done MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes found running `pulumi up` live against crunchy01 (foundation-runner-02, static .16, 8c/32G — the new default sizing): - network-config matches the NIC by glob (`match: {name: "e*"}`) instead of a hardcoded enp1s0 — the libvirt.Domain enumerated it differently, leaving the VM with no IP. - drop `qemuAgent: true` — it blocks the provider on the guest agent (not up on a fresh boot) during create; we register over the static IP instead. - runner-register connection gets `dialErrorLimit: 30` so it waits ~5 min for the VM to boot + apply its IP, landing the runner in a single `up`. - fix the register token passing (the old /tmp/t hop was an ephemeral --rm container → empty token); pass it directly (pulumi redacts the secret). - README: host prep (root SSH + the `images` pool), the exact stack config, and the cutover marked DONE — a `runs-on: fenced` job ran green on the Pulumi-managed runner-02; the hand-built VM was retired. Co-Authored-By: Claude Opus 4.8 (1M context) --- .gitignore | 1 + runners/README.md | 28 +++++++++++++++++++----- runners/index.ts | 56 +++++++++++++++++++++++++++++------------------ 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index 61df145..f39f914 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ offsite-backup/state/ offsite-backup/Pulumi.prod.yaml provision/Pulumi.foundation-test.yaml runners/state/ +runners/Pulumi.crunchy.yaml diff --git a/runners/README.md b/runners/README.md index 6ef02cd..c1aa9ae 100644 --- a/runners/README.md +++ b/runners/README.md @@ -32,6 +32,13 @@ sudo systemctl enable --now libvirtd # a LAN bridge (br0) enslaving the physical NIC must already exist (crunchy01 had it). 
``` +Also required on the host, one-time: +- **root SSH via key** — the `@pulumi/libvirt` provider and the host firewall command + connect as `root` (add the operator pubkey to `/root/.ssh/authorized_keys`). +- **a libvirt storage pool** — crunchy01 already had one named `images` (at + `/var/lib/libvirt/images`), so the stack is configured with `host.pool images`. On a + host with the conventional `default` pool, leave `host.pool` at its default. + ## Deploy ```sh @@ -39,7 +46,10 @@ export RUNNER_SSH_KEY_PATH=~/.ssh/foundation-test_ed25519 # reaches host + VM cd runners pulumi stack init crunchy # isolated file backend, like bootstrap/provision pulumi config set host.address 192.168.1.2 +pulumi config set host.pool images # crunchy01's pool (see host prep) pulumi config set forge.address 204.168.234.72 +pulumi config set vm.name foundation-runner-02 +pulumi config set vm.ipCidr 192.168.1.16/24 pulumi up ``` @@ -48,11 +58,12 @@ Ubuntu VM on `br0` (docker + qemu-guest-agent via cloud-init), mint a runner tok from the forge, and register + run the `fenced` runner in the VM. Verify with a `runs-on: fenced` job on any repo. -> **Cutover note.** The first fenced runner was built by hand (SESSION_2026-07-01_003). -> A `pulumi up` here creates a *fresh* declarative VM; retire the hand-built -> `foundation-runner-01` (`virsh destroy/undefine`) at cutover, or point config at a -> new `vm.name` to run both. This code is committed + typechecked; the live `up` -> cutover is the remaining validation step. +> **Cutover: DONE.** `pulumi up` on the `crunchy` stack created `foundation-runner-02` +> (static `.16`, 8c/32G), registered the `fenced` runner, and a `runs-on: fenced` job +> ran on it green. The hand-built `foundation-runner-01` was then retired +> (`virsh destroy/undefine` + disk removed), so the Pulumi-managed runner-02 is the +> sole fenced runner. 
(A now-offline `crunchy-runner` registration from the hand-built +> VM may still be listed on the forge — harmless; deregister at leisure.) ## Gotchas baked into the code (learned the hard way) @@ -62,6 +73,13 @@ from the forge, and register + run the `fenced` runner in the VM. Verify with a (kube-router flushes iptables on resync, so a boot-only rule isn't enough). - **Ubuntu, not Debian genericcloud.** Debian's cloud-init wrote netplan the image never applied → no IPv4 (static *or* DHCP). Ubuntu 24.04 renders + applies cleanly. +- **NIC name-agnostic network-config.** The cloud-init network-config matches the NIC + by glob (`match: {name: "e*"}`), not a hardcoded `enp1s0` — the libvirt.Domain may + enumerate it as `ens3`/etc., which left the VM with no IP until matched generically. +- **No `qemuAgent: true`.** It makes the provider block on the guest agent (not up on a + fresh boot) during create. We register over the VM's static IP, so it's not needed. +- **Register dial window.** The runner-register command uses `dialErrorLimit: 30` so it + waits ~5 min for the VM to boot + apply its IP, landing the runner in a single `up`. - **PTY console.** The domain declares a `pty` serial console so `virsh console <name>` works. (Don't back serial with a file — you lose interactive console.) - **Docker socket gid.** act_runner runs as uid 1000; the daemon container gets diff --git a/runners/index.ts b/runners/index.ts index 6512290..f02e584 100644 --- a/runners/index.ts +++ b/runners/index.ts @@ -64,8 +64,8 @@ const forgeConn: command.types.input.remote.ConnectionArgs = { // --- the VM + runner shape --- const vm = { name: cfg.get("vm.name") ?? "foundation-runner-01", - vcpu: cfg.getNumber("vm.vcpu") ?? 4, - memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 8192, + vcpu: cfg.getNumber("vm.vcpu") ?? 8, + memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 32768, diskGiB: cfg.getNumber("vm.diskGiB") ?? 40, // LAN address for the VM. 
Empty → DHCP (the runner polls the forge outbound, so a // fixed address is optional). Default matches the hand-built VM. @@ -172,18 +172,24 @@ const userData = "touch /root/cloud-init-done", ], }); -// network-config v2: static on enp1s0 if ipCidr set, else DHCP. Ubuntu applies both. +// network-config v2. Match ANY ethernet by name-glob (the NIC may enumerate as +// enp1s0/ens3/… depending on the machine type — hardcoding enp1s0 left the VM with +// no IP). Static if ipCidr set (needed so the register command knows where to +// connect), else DHCP. Ubuntu (netplan-native) applies this at first boot. const networkConfig = yaml.dump({ version: 2, ethernets: { - enp1s0: vm.ipCidr - ? { - dhcp4: false, - addresses: [vm.ipCidr], - routes: [{ to: "default", via: vm.gateway }], - nameservers: { addresses: vm.nameservers }, - } - : { dhcp4: true }, + primary: { + match: { name: "e*" }, + ...(vm.ipCidr + ? { + dhcp4: false, + addresses: [vm.ipCidr], + routes: [{ to: "default", via: vm.gateway }], + nameservers: { addresses: vm.nameservers }, + } + : { dhcp4: true }), + }, }, }); const cloudinit = new libvirt.CloudInitDisk( @@ -200,7 +206,10 @@ const domain = new libvirt.Domain( vcpu: vm.vcpu, cpu: { mode: "host-passthrough" }, autostart: true, - qemuAgent: true, + // NB: do NOT set qemuAgent:true — it makes the provider block on the guest agent + // (not up on a fresh boot) during create. We register the runner over the VM's + // STATIC IP, so we don't need agent-discovered addresses. (guest-agent is still + // installed via cloud-init for `virsh domifaddr --source agent` convenience.) cloudinit: cloudinit.id, disks: [{ volumeId: disk.id }], networkInterfaces: [{ bridge: host.bridge }], @@ -232,19 +241,19 @@ const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim())); const vmIp = vm.ipCidr ? 
vm.ipCidr.split("/")[0] : host.address; // static → known IP const REGISTER = pulumi.interpolate`set -eu IMG=code.forgejo.org/forgejo/runner:6 +TOKEN='${runnerToken}' for _ in $(seq 1 60); do [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1 && break; sleep 5; done DGID=$(stat -c %g /var/run/docker.sock) docker volume inspect crunchy-runner-data >/dev/null 2>&1 || docker volume create crunchy-runner-data >/dev/null docker pull -q "$IMG" >/dev/null -if docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c '[ -s /data/.runner ]'; then - echo "already registered" -else - printf '%s' '${runnerToken}' | docker run --rm -i -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat > /tmp/t' - docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \ - register --no-interactive --instance ${forge.instanceUrl} --token "$(docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat /tmp/t')" \ - --name ${vm.name} --labels '${runnerLabels}' >/dev/null - echo "registered" -fi +# (Re)register — this command only re-runs when the token/domain/labels change +# (triggers), so a clean re-register each time is safe. Token passed directly +# (pulumi redacts it in its own output as a secret; short-lived on the VM's argv). 
+docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'rm -f /data/.runner' +docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \ + register --no-interactive --instance ${forge.instanceUrl} --token "$TOKEN" \ + --name ${vm.name} --labels '${runnerLabels}' >/dev/null +echo "registered" docker rm -f forgejo-runner >/dev/null 2>&1 || true docker run -d --name forgejo-runner --restart unless-stopped --group-add "$DGID" \ -v crunchy-runner-data:/data -v /var/run/docker.sock:/var/run/docker.sock \ @@ -258,6 +267,11 @@ const register = new command.remote.Command( port: 22, user: "root", privateKey: sshPrivateKey, + // The VM needs ~60-90s to boot + apply the static IP before sshd answers; + // wait up to ~5min rather than the default ~10 dials so a fresh create lands + // the runner in one `up`. + dialErrorLimit: 30, + perDialTimeout: 10, }, create: REGISTER, triggers: [domain.id, runnerToken, runnerLabels],