fix(runners): live-validated the crunchy stack; cutover done
All checks were successful
CI / preflight (push) Successful in 9s
CI / typecheck (push) Successful in 23s
pulumi-preview / preview (push) Successful in 26s

Fixes found running `pulumi up` live against crunchy01 (foundation-runner-02,
static .16, 8c/32G — the new default sizing):

- network-config matches the NIC by glob (`match: {name: "e*"}`) instead of a
  hardcoded enp1s0 — the libvirt.Domain enumerated it differently, leaving the VM
  with no IP.
- drop `qemuAgent: true` — it blocks the provider on the guest agent (not up on a
  fresh boot) during create; we register over the static IP instead.
- runner-register connection gets `dialErrorLimit: 30` and `perDialTimeout: 10`
  so it waits ~5 min for the VM to boot + apply its IP, landing the runner in a
  single `up`.
- fix the register token passing: the old /tmp/t hop wrote the token inside one
  ephemeral --rm container and read it back from another, so the file was gone
  and the token came back empty. Pass it directly instead (pulumi redacts the
  secret).
- README: host prep (root SSH + the `images` pool), the exact stack config, and
  the cutover marked DONE — a `runs-on: fenced` job ran green on the Pulumi-managed
  runner-02; the hand-built VM was retired.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Andreas Niemann 2026-07-01 03:35:06 +02:00
parent cfa71847ba
commit 44a96d84eb
3 changed files with 59 additions and 26 deletions

View file

@ -64,8 +64,8 @@ const forgeConn: command.types.input.remote.ConnectionArgs = {
// --- the VM + runner shape ---
const vm = {
name: cfg.get("vm.name") ?? "foundation-runner-01",
vcpu: cfg.getNumber("vm.vcpu") ?? 4,
memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 8192,
vcpu: cfg.getNumber("vm.vcpu") ?? 8,
memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 32768,
diskGiB: cfg.getNumber("vm.diskGiB") ?? 40,
// LAN address for the VM. Empty → DHCP (the runner polls the forge outbound, so a
// fixed address is optional). Default matches the hand-built VM.
@ -172,18 +172,24 @@ const userData =
"touch /root/cloud-init-done",
],
});
// network-config v2: static on enp1s0 if ipCidr set, else DHCP. Ubuntu applies both.
// network-config v2. Match ANY ethernet by name-glob (the NIC may enumerate as
// enp1s0/ens3/… depending on the machine type — hardcoding enp1s0 left the VM with
// no IP). Static if ipCidr set (needed so the register command knows where to
// connect), else DHCP. Ubuntu (netplan-native) applies this at first boot.
const networkConfig = yaml.dump({
version: 2,
ethernets: {
enp1s0: vm.ipCidr
? {
dhcp4: false,
addresses: [vm.ipCidr],
routes: [{ to: "default", via: vm.gateway }],
nameservers: { addresses: vm.nameservers },
}
: { dhcp4: true },
primary: {
match: { name: "e*" },
...(vm.ipCidr
? {
dhcp4: false,
addresses: [vm.ipCidr],
routes: [{ to: "default", via: vm.gateway }],
nameservers: { addresses: vm.nameservers },
}
: { dhcp4: true }),
},
},
});
const cloudinit = new libvirt.CloudInitDisk(
@ -200,7 +206,10 @@ const domain = new libvirt.Domain(
vcpu: vm.vcpu,
cpu: { mode: "host-passthrough" },
autostart: true,
qemuAgent: true,
// NB: do NOT set qemuAgent:true — it makes the provider block on the guest agent
// (not up on a fresh boot) during create. We register the runner over the VM's
// STATIC IP, so we don't need agent-discovered addresses. (guest-agent is still
// installed via cloud-init for `virsh domifaddr --source agent` convenience.)
cloudinit: cloudinit.id,
disks: [{ volumeId: disk.id }],
networkInterfaces: [{ bridge: host.bridge }],
@ -232,19 +241,19 @@ const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim()));
const vmIp = vm.ipCidr ? vm.ipCidr.split("/")[0] : host.address; // static → known IP
const REGISTER = pulumi.interpolate`set -eu
IMG=code.forgejo.org/forgejo/runner:6
TOKEN='${runnerToken}'
for _ in $(seq 1 60); do [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1 && break; sleep 5; done
DGID=$(stat -c %g /var/run/docker.sock)
docker volume inspect crunchy-runner-data >/dev/null 2>&1 || docker volume create crunchy-runner-data >/dev/null
docker pull -q "$IMG" >/dev/null
if docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c '[ -s /data/.runner ]'; then
echo "already registered"
else
printf '%s' '${runnerToken}' | docker run --rm -i -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat > /tmp/t'
docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \
register --no-interactive --instance ${forge.instanceUrl} --token "$(docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat /tmp/t')" \
--name ${vm.name} --labels '${runnerLabels}' >/dev/null
echo "registered"
fi
# (Re)register this command only re-runs when the token/domain/labels change
# (triggers), so a clean re-register each time is safe. Token passed directly
# (pulumi redacts it in its own output as a secret; short-lived on the VM's argv).
docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'rm -f /data/.runner'
docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \
register --no-interactive --instance ${forge.instanceUrl} --token "$TOKEN" \
--name ${vm.name} --labels '${runnerLabels}' >/dev/null
echo "registered"
docker rm -f forgejo-runner >/dev/null 2>&1 || true
docker run -d --name forgejo-runner --restart unless-stopped --group-add "$DGID" \
-v crunchy-runner-data:/data -v /var/run/docker.sock:/var/run/docker.sock \
@ -258,6 +267,11 @@ const register = new command.remote.Command(
port: 22,
user: "root",
privateKey: sshPrivateKey,
// The VM needs ~60-90s to boot + apply the static IP before sshd answers;
// wait up to ~5min rather than the default ~10 dials so a fresh create lands
// the runner in one `up`.
dialErrorLimit: 30,
perDialTimeout: 10,
},
create: REGISTER,
triggers: [domain.id, runnerToken, runnerLabels],