fix(runners): live-validated the crunchy stack; cutover done
Fixes found running `pulumi up` live against crunchy01 (foundation-runner-02,
static .16, 8c/32G — the new default sizing):
- network-config matches the NIC by name glob (`match: {name: "e*"}`) instead of a
hardcoded enp1s0 — under the libvirt.Domain the NIC enumerated with a different
name, so the hardcoded entry never matched and the VM came up with no IP.
- drop `qemuAgent: true` — it blocks the provider on the guest agent (not up on a
fresh boot) during create; we register over the static IP instead.
- runner-register connection gets `dialErrorLimit: 30` (with `perDialTimeout: 10`)
so it waits up to ~5 min for the VM to boot + apply its IP, landing the runner in
a single `up`.
- fix the register token passing: the old /tmp/t hop wrote the token inside one
ephemeral `--rm` container and read it back from another, so the file was already
gone → empty token. Pass the token directly instead (pulumi redacts the secret).
- README: document host prep (root SSH + the `images` pool) and the exact stack
config, and mark the cutover DONE — a `runs-on: fenced` job ran green on the
Pulumi-managed runner-02, and the hand-built VM was retired.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
cfa71847ba
commit
44a96d84eb
3 changed files with 59 additions and 26 deletions
|
|
@ -64,8 +64,8 @@ const forgeConn: command.types.input.remote.ConnectionArgs = {
|
|||
// --- the VM + runner shape ---
|
||||
const vm = {
|
||||
name: cfg.get("vm.name") ?? "foundation-runner-01",
|
||||
vcpu: cfg.getNumber("vm.vcpu") ?? 4,
|
||||
memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 8192,
|
||||
vcpu: cfg.getNumber("vm.vcpu") ?? 8,
|
||||
memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 32768,
|
||||
diskGiB: cfg.getNumber("vm.diskGiB") ?? 40,
|
||||
// LAN address for the VM. Empty → DHCP (the runner polls the forge outbound, so a
|
||||
// fixed address is optional). Default matches the hand-built VM.
|
||||
|
|
@ -172,18 +172,24 @@ const userData =
|
|||
"touch /root/cloud-init-done",
|
||||
],
|
||||
});
|
||||
// network-config v2: static on enp1s0 if ipCidr set, else DHCP. Ubuntu applies both.
|
||||
// network-config v2. Match ANY ethernet by name-glob (the NIC may enumerate as
|
||||
// enp1s0/ens3/… depending on the machine type — hardcoding enp1s0 left the VM with
|
||||
// no IP). Static if ipCidr set (needed so the register command knows where to
|
||||
// connect), else DHCP. Ubuntu (netplan-native) applies this at first boot.
|
||||
const networkConfig = yaml.dump({
|
||||
version: 2,
|
||||
ethernets: {
|
||||
enp1s0: vm.ipCidr
|
||||
? {
|
||||
dhcp4: false,
|
||||
addresses: [vm.ipCidr],
|
||||
routes: [{ to: "default", via: vm.gateway }],
|
||||
nameservers: { addresses: vm.nameservers },
|
||||
}
|
||||
: { dhcp4: true },
|
||||
primary: {
|
||||
match: { name: "e*" },
|
||||
...(vm.ipCidr
|
||||
? {
|
||||
dhcp4: false,
|
||||
addresses: [vm.ipCidr],
|
||||
routes: [{ to: "default", via: vm.gateway }],
|
||||
nameservers: { addresses: vm.nameservers },
|
||||
}
|
||||
: { dhcp4: true }),
|
||||
},
|
||||
},
|
||||
});
|
||||
const cloudinit = new libvirt.CloudInitDisk(
|
||||
|
|
@ -200,7 +206,10 @@ const domain = new libvirt.Domain(
|
|||
vcpu: vm.vcpu,
|
||||
cpu: { mode: "host-passthrough" },
|
||||
autostart: true,
|
||||
qemuAgent: true,
|
||||
// NB: do NOT set qemuAgent:true — it makes the provider block on the guest agent
|
||||
// (not up on a fresh boot) during create. We register the runner over the VM's
|
||||
// STATIC IP, so we don't need agent-discovered addresses. (guest-agent is still
|
||||
// installed via cloud-init for `virsh domifaddr --source agent` convenience.)
|
||||
cloudinit: cloudinit.id,
|
||||
disks: [{ volumeId: disk.id }],
|
||||
networkInterfaces: [{ bridge: host.bridge }],
|
||||
|
|
@ -232,19 +241,19 @@ const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim()));
|
|||
const vmIp = vm.ipCidr ? vm.ipCidr.split("/")[0] : host.address; // static → known IP
|
||||
const REGISTER = pulumi.interpolate`set -eu
|
||||
IMG=code.forgejo.org/forgejo/runner:6
|
||||
TOKEN='${runnerToken}'
|
||||
for _ in $(seq 1 60); do [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1 && break; sleep 5; done
|
||||
DGID=$(stat -c %g /var/run/docker.sock)
|
||||
docker volume inspect crunchy-runner-data >/dev/null 2>&1 || docker volume create crunchy-runner-data >/dev/null
|
||||
docker pull -q "$IMG" >/dev/null
|
||||
if docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c '[ -s /data/.runner ]'; then
|
||||
echo "already registered"
|
||||
else
|
||||
printf '%s' '${runnerToken}' | docker run --rm -i -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat > /tmp/t'
|
||||
docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \
|
||||
register --no-interactive --instance ${forge.instanceUrl} --token "$(docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'cat /tmp/t')" \
|
||||
--name ${vm.name} --labels '${runnerLabels}' >/dev/null
|
||||
echo "registered"
|
||||
fi
|
||||
# (Re)register — this command only re-runs when the token/domain/labels change
|
||||
# (triggers), so a clean re-register each time is safe. Token passed directly
|
||||
# (pulumi redacts it in its own output as a secret; short-lived on the VM's argv).
|
||||
docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c 'rm -f /data/.runner'
|
||||
docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" \
|
||||
register --no-interactive --instance ${forge.instanceUrl} --token "$TOKEN" \
|
||||
--name ${vm.name} --labels '${runnerLabels}' >/dev/null
|
||||
echo "registered"
|
||||
docker rm -f forgejo-runner >/dev/null 2>&1 || true
|
||||
docker run -d --name forgejo-runner --restart unless-stopped --group-add "$DGID" \
|
||||
-v crunchy-runner-data:/data -v /var/run/docker.sock:/var/run/docker.sock \
|
||||
|
|
@ -258,6 +267,11 @@ const register = new command.remote.Command(
|
|||
port: 22,
|
||||
user: "root",
|
||||
privateKey: sshPrivateKey,
|
||||
// The VM needs ~60-90s to boot + apply the static IP before sshd answers;
|
||||
// wait up to ~5min rather than the default ~10 dials so a fresh create lands
|
||||
// the runner in one `up`.
|
||||
dialErrorLimit: 30,
|
||||
perDialTimeout: 10,
|
||||
},
|
||||
create: REGISTER,
|
||||
triggers: [domain.id, runnerToken, runnerLabels],
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue