Fixes found running `pulumi up` live against crunchy01 (foundation-runner-02,
static .16, 8c/32G — the new default sizing):
- network-config matches the NIC by glob (`match: {name: "e*"}`) instead of a
hardcoded enp1s0 — the libvirt.Domain enumerated it differently, leaving the VM
with no IP.
- drop `qemuAgent: true` — it blocks the provider on the guest agent (not up on a
fresh boot) during create; we register over the static IP instead.
- runner-register connection gets `dialErrorLimit: 30` so it waits ~5 min for the
VM to boot + apply its IP, landing the runner in a single `up`.
- fix the register token passing (the old /tmp/t hop was an ephemeral --rm
container → empty token); pass it directly (pulumi redacts the secret).
- README: host prep (root SSH + the `images` pool), the exact stack config, and
the cutover marked DONE — a `runs-on: fenced` job ran green on the Pulumi-managed
runner-02; the hand-built VM was retired.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
286 lines
12 KiB
TypeScript
286 lines
12 KiB
TypeScript
// runners/index.ts — Step-0-after-foundation: the fenced Actions runner fleet.
|
|
//
|
|
// ISOLATED STACK, decoupled from `bootstrap` on purpose (see Pulumi.yaml). A
|
|
// @pulumi/libvirt provider dials the runner HOST (e.g. crunchy01) on every
|
|
// up/refresh/preview; keeping that here — never imported by bootstrap — means
|
|
// foundation ops never require, and are never blocked by, the runner host being
|
|
// reachable. One-way dependency only: this stack mints a runner token FROM the
|
|
// forge, so it runs after the foundation stands.
|
|
//
|
|
// It codifies what SESSION_2026-07-01_003 built by hand (see runners/README.md):
|
|
// 1. an Ubuntu VM on the host's LAN bridge, with docker + qemu-guest-agent;
|
|
// 2. a kube-router-proof FORWARD accept (the host is a k3s node whose FORWARD
|
|
// policy is DROP — bridged VM traffic needs an idempotent, re-asserted rule);
|
|
// 3. a Forgejo Actions runner registered with a distinct label ("fenced") so
|
|
// ecosystem/untrusted jobs (runs-on: fenced) run OFF the forge VM (R5).
|
|
//
|
|
// PREREQUISITE (host, one-time, kept OUT of this stack so the libvirt provider
|
|
// always has something to connect to): qemu-kvm + libvirt-daemon-system +
|
|
// libvirt-clients + virtinst + cloud-image-utils installed on the host, libvirtd
|
|
// enabled, and a LAN bridge (br0). See runners/README.md §Host prep.
|
|
import * as pulumi from "@pulumi/pulumi";
import * as libvirt from "@pulumi/libvirt";
import * as command from "@pulumi/command";
import * as fs from "fs";
import * as os from "os";
import * as yaml from "js-yaml";
|
|
|
|
/** Stack configuration for this project (values set with `pulumi config`). */
const cfg = new pulumi.Config();

// --- runner host (libvirt over qemu+ssh) ---
// Every setting is optional in stack config; the literal after `??` is the fallback.
const hostAddress = cfg.get("host.address") ?? "192.168.1.2"; // crunchy01
const hostUser = cfg.get("host.user") ?? "root";
const hostBridge = cfg.get("host.bridge") ?? "br0"; // LAN bridge the VM joins
const hostPool = cfg.get("host.pool") ?? "default"; // libvirt storage pool (nvme)

/** Where the runner VM lives: SSH endpoint, LAN bridge and storage pool of the host. */
const host = {
  address: hostAddress,
  user: hostUser,
  bridge: hostBridge,
  pool: hostPool,
};
|
|
// SSH identity that reaches the runner HOST and the created VM (root on both).
// The key PATH comes from the environment, never from stack config. An unset OR
// empty RUNNER_SSH_KEY_PATH selects the default key under the user's home directory.
// os.homedir() is used instead of interpolating process.env.HOME so that a missing
// HOME cannot silently produce the literal path "undefined/.ssh/…".
const sshKeyPath =
  process.env.RUNNER_SSH_KEY_PATH ||
  `${os.homedir()}/.ssh/foundation-test_ed25519`;

// The private half is wrapped as a Pulumi secret before it is handed to any resource.
const sshPrivateKey = pulumi.secret(fs.readFileSync(sshKeyPath, "utf8"));
// The public half (`<key>.pub`) is trimmed so it can be embedded as a single
// authorized_keys entry.
const sshPublicKey = fs.readFileSync(`${sshKeyPath}.pub`, "utf8").trim();

/** SSH connection to the runner host, used by the host-side remote commands. */
const hostConn: command.types.input.remote.ConnectionArgs = {
  host: host.address,
  port: cfg.getNumber("host.sshPort") ?? 22,
  user: host.user,
  privateKey: sshPrivateKey,
};
|
|
|
|
// --- the forge (mint a runner registration token; reached over SSH via docker) ---
// Fallbacks used when the corresponding `forge.*` key is absent from stack config.
const forgeDefaults = {
  address: "204.168.234.72",
  sshPort: 222,
  user: "root",
  instanceUrl: "https://forge.olsitec.net",
  container: "foundation-forgejo",
};

/** Forge endpoint: SSH coordinates, public instance URL and the Forgejo container name. */
const forge = {
  address: cfg.get("forge.address") ?? forgeDefaults.address,
  sshPort: cfg.getNumber("forge.sshPort") ?? forgeDefaults.sshPort,
  user: cfg.get("forge.user") ?? forgeDefaults.user,
  instanceUrl: cfg.get("forge.instanceUrl") ?? forgeDefaults.instanceUrl,
  container: cfg.get("forge.container") ?? forgeDefaults.container,
};

/** SSH connection to the forge machine; authenticates with the same key as the host. */
const { address: forgeSshHost, sshPort: forgeSshPort, user: forgeSshUser } = forge;
const forgeConn: command.types.input.remote.ConnectionArgs = {
  host: forgeSshHost,
  port: forgeSshPort,
  user: forgeSshUser,
  privateKey: sshPrivateKey,
};
|
|
|
|
// --- the VM + runner shape ---
// Fallbacks used when the corresponding `vm.*` key is absent from stack config.
const vmDefaults = {
  name: "foundation-runner-01",
  vcpu: 8,
  memoryMiB: 32768,
  diskGiB: 40,
  ipCidr: "192.168.1.15/24", // matches the hand-built VM
  gateway: "192.168.1.251",
  nameservers: ["192.168.1.251", "1.1.1.1"],
  ubuntuImageUrl:
    "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img",
};

/** Sizing, addressing and base image of the runner VM. */
const vm = {
  name: cfg.get("vm.name") ?? vmDefaults.name,
  vcpu: cfg.getNumber("vm.vcpu") ?? vmDefaults.vcpu,
  memoryMiB: cfg.getNumber("vm.memoryMiB") ?? vmDefaults.memoryMiB,
  diskGiB: cfg.getNumber("vm.diskGiB") ?? vmDefaults.diskGiB,
  // LAN address (CIDR) for the VM. An empty string selects DHCP in the generated
  // network-config. Note that the register step derives its SSH target from this
  // value, so a static address is what lets the runner register in one `up`.
  ipCidr: cfg.get("vm.ipCidr") ?? vmDefaults.ipCidr,
  gateway: cfg.get("vm.gateway") ?? vmDefaults.gateway,
  nameservers: cfg.getObject<string[]>("vm.nameservers") ?? vmDefaults.nameservers,
  ubuntuImageUrl: cfg.get("vm.ubuntuImageUrl") ?? vmDefaults.ubuntuImageUrl,
};

// The runner label(s). `fenced` routes `runs-on: fenced` jobs here; the part after
// the colon maps the label to a default job image on the VM's docker.
const defaultRunnerLabels = "fenced:docker://node:20-bookworm";
const runnerLabels = cfg.get("runner.labels") ?? defaultRunnerLabels;
|
|
|
|
// =============================================================================
// libvirt provider — qemu+ssh to the host. Lazy-connects when a libvirt resource
// is created; the host prerequisite (libvirtd) must already be satisfied.
// =============================================================================
// Connection URI: system libvirtd on the host, authenticated with the private key at
// sshKeyPath. `known_hosts_verify=ignore` turns off SSH host-key verification.
const libvirtUri =
  `qemu+ssh://${host.user}@${host.address}/system` +
  `?sshauth=privkey&keyfile=${sshKeyPath}&known_hosts_verify=ignore`;

const provider = new libvirt.Provider("runner-host", { uri: libvirtUri });
|
|
|
|
// --- host prep: the kube-router-proof bridged-FORWARD accept (idempotent timer) ---
|
|
// The host is a k3s node; kube-router sets FORWARD policy DROP and re-syncs iptables,
|
|
// which drops bridged VM↔LAN traffic and can flush a hand-added rule. A 60s systemd
|
|
// timer re-asserts it. This is a control-plane op on the HOST (not a libvirt resource).
|
|
/** Body of libvirt-bridge-forward.service: a oneshot that inserts the bridged-FORWARD ACCEPT rule unless `iptables -C` finds it already present. */
const bridgeForwardServiceUnit = [
  "[Unit]",
  "Description=Ensure bridged VM traffic passes iptables FORWARD (libvirt on a kube-router host)",
  "After=network-online.target",
  "[Service]",
  "Type=oneshot",
  "ExecStart=/bin/sh -c 'iptables -C FORWARD -m physdev --physdev-is-bridged -j ACCEPT 2>/dev/null || iptables -I FORWARD 1 -m physdev --physdev-is-bridged -j ACCEPT'",
];

/** Body of libvirt-bridge-forward.timer: first run 30s after boot, then every 60s. */
const bridgeForwardTimerUnit = [
  "[Unit]",
  "Description=Re-assert the bridged-FORWARD accept rule (kube-router flushes iptables on resync)",
  "[Timer]",
  "OnBootSec=30s",
  "OnUnitActiveSec=60s",
  "AccuracySec=5s",
  "[Install]",
  "WantedBy=timers.target",
];

/**
 * Shell script run on the host: writes both unit files through quoted heredocs
 * (so nothing inside them is expanded by the shell), reloads systemd, and enables
 * + starts the timer. Rewriting the files on every run keeps the script idempotent.
 */
const FIREWALL = [
  "set -eu",
  "cat > /etc/systemd/system/libvirt-bridge-forward.service <<'U'",
  ...bridgeForwardServiceUnit,
  "U",
  "cat > /etc/systemd/system/libvirt-bridge-forward.timer <<'U'",
  ...bridgeForwardTimerUnit,
  "U",
  "systemctl daemon-reload",
  "systemctl enable --now libvirt-bridge-forward.timer >/dev/null",
  'echo "bridged-FORWARD timer active"',
].join("\n");
|
|
// The same script serves as both `create` and `update`, so a change to its text is
// re-applied on the host instead of being ignored.
const firewallArgs: command.remote.CommandArgs = {
  connection: hostConn,
  create: FIREWALL,
  update: FIREWALL,
};

/** Installs and enables the bridged-FORWARD timer on the runner host over SSH. */
const firewall = new command.remote.Command("runner-host-firewall", firewallArgs);
|
|
|
|
// =============================================================================
// The VM: Ubuntu base volume → backed domain disk → cloud-init → domain.
// Ubuntu (not the Debian genericcloud image) because Debian's cloud-init wrote
// netplan the image never applied (no IPv4); Ubuntu renders + applies it cleanly.
// =============================================================================
const bytesPerGiB = 1024 ** 3;

// Base image: fetched from vm.ubuntuImageUrl into the host's storage pool.
const baseVolumeArgs = {
  name: `${vm.name}-ubuntu-base.img`,
  source: vm.ubuntuImageUrl,
  pool: host.pool,
  format: "qcow2",
};
const base = new libvirt.Volume(`${vm.name}-base`, baseVolumeArgs, { provider });

// Domain disk: a qcow2 volume backed by the base image, sized to vm.diskGiB.
const diskVolumeArgs = {
  name: `${vm.name}.qcow2`,
  pool: host.pool,
  format: "qcow2",
  baseVolumeId: base.id,
  size: vm.diskGiB * bytesPerGiB,
};
const disk = new libvirt.Volume(`${vm.name}-disk`, diskVolumeArgs, { provider });
|
|
|
|
// cloud-init user-data: docker + qemu-guest-agent + our SSH key + a marker file.
// Key order is kept stable because yaml.dump emits keys in insertion order.
const rootUser = {
  name: "root",
  lock_passwd: false,
  ssh_authorized_keys: [sshPublicKey],
};

const firstBootCommands = [
  // Docker comes from the upstream convenience script.
  ["sh", "-c", "curl -fsSL https://get.docker.com | sh"],
  "systemctl enable --now docker qemu-guest-agent",
  // Marker file the register script polls for before it touches docker.
  "touch /root/cloud-init-done",
];

const cloudConfig = {
  hostname: vm.name,
  manage_etc_hosts: true,
  ssh_pwauth: false, // key-only SSH
  users: [rootUser],
  packages: ["ca-certificates", "curl", "jq", "qemu-guest-agent"],
  runcmd: firstBootCommands,
};

// The first line must be the `#cloud-config` header; the YAML document follows it.
const userData = `#cloud-config\n${yaml.dump(cloudConfig)}`;
|
|
// network-config v2. The NIC is matched by name-glob ("e*") because it may
// enumerate as enp1s0/ens3/… depending on the machine type — hardcoding enp1s0 left
// the VM with no IP. A non-empty vm.ipCidr yields a static setup (address, default
// route, nameservers); an empty one yields DHCP. Ubuntu (netplan-native) applies
// this at first boot.
let primaryAddressing: Record<string, unknown>;
if (vm.ipCidr) {
  primaryAddressing = {
    dhcp4: false,
    addresses: [vm.ipCidr],
    routes: [{ to: "default", via: vm.gateway }],
    nameservers: { addresses: vm.nameservers },
  };
} else {
  primaryAddressing = { dhcp4: true };
}

const networkConfig = yaml.dump({
  version: 2,
  ethernets: {
    primary: { match: { name: "e*" }, ...primaryAddressing },
  },
});
|
|
// cloud-init ISO carrying the user-data and network-config documents; it lives in
// the same storage pool as the VM's disks.
const cloudinitArgs = {
  name: `${vm.name}-cloudinit.iso`,
  pool: host.pool,
  userData,
  networkConfig,
};
const cloudinit = new libvirt.CloudInitDisk(`${vm.name}-cloudinit`, cloudinitArgs, {
  provider,
});
|
|
|
|
// A real PTY console pair so `virsh console <vm>` works (learned the hard way).
const domainConsoles = [
  { type: "pty", targetPort: "0", targetType: "serial" },
  { type: "pty", targetPort: "1", targetType: "virtio" },
];

const domainArgs: libvirt.DomainArgs = {
  name: vm.name,
  memory: vm.memoryMiB,
  vcpu: vm.vcpu,
  cpu: { mode: "host-passthrough" },
  autostart: true,
  // NB: qemuAgent is deliberately NOT set to true — it makes the provider block on
  // the guest agent (not up on a fresh boot) during create. The runner is registered
  // over the VM's STATIC IP, so agent-discovered addresses are not needed. (The
  // guest agent is still installed via cloud-init for
  // `virsh domifaddr --source agent` convenience.)
  cloudinit: cloudinit.id,
  disks: [{ volumeId: disk.id }],
  networkInterfaces: [{ bridge: host.bridge }],
  consoles: domainConsoles,
};

// The VM itself. It is created only after the host firewall command has run, so the
// bridged-FORWARD rule is in place before the VM needs the LAN.
const domain = new libvirt.Domain(vm.name, domainArgs, {
  provider,
  dependsOn: [firewall],
});
|
|
|
|
// =============================================================================
// Register the Forgejo runner: mint a token on the forge, then register + run
// act_runner inside the VM (docker), reachable once cloud-init has installed docker.
// =============================================================================
// 1) token — instance-scoped registration token, minted over SSH via docker exec.
const tokenCmd = new command.remote.Command(
  "runner-token",
  {
    connection: forgeConn,
    create: `docker exec -u git ${forge.container} forgejo actions generate-runner-token`,
    // Re-mint when the label set changes. (That is the only trigger: a change of
    // forge container does not, by itself, mint a new token.)
    triggers: [runnerLabels],
  },
  // The command's raw stdout IS the registration token. Mark that output secret so
  // it is encrypted in the stack state and masked in CLI output; wrapping only the
  // derived value below would leave the resource's own `stdout` in plaintext.
  { additionalSecretOutputs: ["stdout"] },
);

/** The minted token with surrounding whitespace removed, carried as a secret. */
const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim()));
|
|
|
|
// 2) register + run — connect to the VM over SSH. The script waits for cloud-init
// (docker) to be ready, registers idempotently, and runs the daemon with the host
// docker gid so uid-1000 act_runner can reach the socket.
//
// SSH target for that step. The only VM address known to this program is the static
// one from vm.ipCidr, so DHCP (an empty vm.ipCidr) cannot be supported here. The
// previous fallback to host.address would have run the registration script as root
// on the libvirt HOST instead of inside the VM, so fail loudly instead.
if (!vm.ipCidr) {
  throw new Error(
    "vm.ipCidr is empty (DHCP): the runner-register step needs a static VM address " +
      "to SSH to. Set `vm.ipCidr` (e.g. 192.168.1.15/24) in the stack config.",
  );
}
const vmIp = vm.ipCidr.split("/")[0]; // address part of the CIDR, prefix length dropped
|
|
// Names the registration script refers to more than once.
const runnerImage = "code.forgejo.org/forgejo/runner:6";
const runnerDataVolume = "crunchy-runner-data"; // holds /data/.runner (registration state)
const runnerContainer = "forgejo-runner"; // the long-running daemon container

/**
 * Shell script executed on the VM. In order it:
 *  - polls (60 tries, 5s apart) until the cloud-init marker exists and `docker info`
 *    succeeds;
 *  - reads the docker socket's group id and ensures the data volume exists;
 *  - removes any previous /data/.runner and registers against the forge instance;
 *  - replaces the daemon container, adding the socket's gid via --group-add.
 * A trailing backslash-newline inside a template literal is a JS line continuation,
 * so each multi-line `docker run` below reaches the shell as one physical line.
 */
const REGISTER = pulumi.interpolate`set -eu
IMG=${runnerImage}
TOKEN='${runnerToken}'
for _ in $(seq 1 60); do [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1 && break; sleep 5; done
DGID=$(stat -c %g /var/run/docker.sock)
docker volume inspect ${runnerDataVolume} >/dev/null 2>&1 || docker volume create ${runnerDataVolume} >/dev/null
docker pull -q "$IMG" >/dev/null
# (Re)register — this command only re-runs when the token/domain/labels change
# (triggers), so a clean re-register each time is safe. Token passed directly
# (pulumi redacts it in its own output as a secret; short-lived on the VM's argv).
docker run --rm -v ${runnerDataVolume}:/data --entrypoint sh "$IMG" -c 'rm -f /data/.runner'
docker run --rm -v ${runnerDataVolume}:/data --entrypoint /bin/forgejo-runner "$IMG" \
register --no-interactive --instance ${forge.instanceUrl} --token "$TOKEN" \
--name ${vm.name} --labels '${runnerLabels}' >/dev/null
echo "registered"
docker rm -f ${runnerContainer} >/dev/null 2>&1 || true
docker run -d --name ${runnerContainer} --restart unless-stopped --group-add "$DGID" \
-v ${runnerDataVolume}:/data -v /var/run/docker.sock:/var/run/docker.sock \
--entrypoint /bin/forgejo-runner "$IMG" daemon >/dev/null
echo "runner daemon up"`;
|
|
// SSH connection to the VM as root. The VM needs ~60-90s to boot + apply its static
// IP before sshd answers; 30 dial attempts at 10s each wait up to ~5min (rather than
// the default ~10 dials) so a fresh create lands the runner in one `up`.
const vmConn: command.types.input.remote.ConnectionArgs = {
  host: vmIp,
  port: 22,
  user: "root",
  privateKey: sshPrivateKey,
  dialErrorLimit: 30,
  perDialTimeout: 10,
};

// Runs the registration script on the VM. It runs again whenever the domain id, the
// token or the label set changes, and only after both the domain and the token
// command exist.
const register = new command.remote.Command(
  "runner-register",
  {
    connection: vmConn,
    create: REGISTER,
    triggers: [domain.id, runnerToken, runnerLabels],
  },
  { dependsOn: [domain, tokenCmd] },
);
|
|
|
|
// `register` exists only for its side effect; referencing it here marks it as
// intentionally unused.
void register;

// --- stack outputs ---
/** Address of the libvirt host the runner VM lives on. */
export const runnerHost = host.address;
/** Address the register step connects to. */
export const runnerVmIp = vmIp;
/** Label set the runner was registered with. */
export const runnerLabelsOut = runnerLabels;
/** URL of the forge instance the runner is registered against. */
export const forgeInstance = forge.instanceUrl;
|