foundation/runners/index.ts
Andreas Niemann cfa71847ba
All checks were successful
CI / preflight (push) Successful in 4s
CI / typecheck (push) Successful in 13s
pulumi-preview / preview (push) Successful in 17s
feat(runners): decoupled Pulumi stack for the fenced runner fleet (R5)
A separate, isolated Pulumi project (peer to bootstrap/provision/offsite-backup)
that provisions runner VM(s) on a libvirt host and registers Forgejo Actions
runners with a distinct `fenced` label — so ecosystem/untrusted jobs run OFF the
forge VM.

Decoupled ON PURPOSE: a @pulumi/libvirt provider dials the runner host on every
up/refresh, so keeping it in `bootstrap` would make the foundation undeployable/
unrefreshable whenever the host (crunchy01) is down or unreachable (the Terraform
coupling trap). As its own stack, bootstrap never imports it — foundation ops
never touch crunchy01, and this stack's health is independent. One-way dependency:
it mints a runner token FROM the forge, i.e. runs after the foundation stands.

Codifies what was built + hardened by hand this session (runners/README.md):
Ubuntu VM on the LAN bridge (docker + qemu-guest-agent via cloud-init), the
kube-router-proof FORWARD timer, and runner registration. Typechecked; the live
`pulumi up` cutover from the hand-built VM is the remaining validation step.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-01 03:15:39 +02:00

272 lines
11 KiB
TypeScript

// runners/index.ts — Step-0-after-foundation: the fenced Actions runner fleet.
//
// ISOLATED STACK, decoupled from `bootstrap` on purpose (see Pulumi.yaml). A
// @pulumi/libvirt provider dials the runner HOST (e.g. crunchy01) on every
// up/refresh/preview; keeping that here — never imported by bootstrap — means
// foundation ops never require, and are never blocked by, the runner host being
// reachable. One-way dependency only: this stack mints a runner token FROM the
// forge, so it runs after the foundation stands.
//
// It codifies what SESSION_2026-07-01_003 built by hand (see runners/README.md):
// 1. an Ubuntu VM on the host's LAN bridge, with docker + qemu-guest-agent;
// 2. a kube-router-proof FORWARD accept (the host is a k3s node whose FORWARD
// policy is DROP — bridged VM traffic needs an idempotent, re-asserted rule);
// 3. a Forgejo Actions runner registered with a distinct label ("fenced") so
// ecosystem/untrusted jobs (runs-on: fenced) run OFF the forge VM (R5).
//
// PREREQUISITE (host, one-time, kept OUT of this stack so the libvirt provider
// always has something to connect to): qemu-kvm + libvirt-daemon-system +
// libvirt-clients + virtinst + cloud-image-utils installed on the host, libvirtd
// enabled, and a LAN bridge (br0). See runners/README.md §Host prep.
import * as pulumi from "@pulumi/pulumi";
import * as libvirt from "@pulumi/libvirt";
import * as command from "@pulumi/command";
import * as fs from "fs";
import * as yaml from "js-yaml";
// Stack configuration handle used for every setting read in this program.
const cfg = new pulumi.Config();

// --- runner host (the machine libvirt reaches over qemu+ssh) ---
// Each setting falls back to the literal shown when the config key is absent.
const host = (() => {
  const address = cfg.get("host.address") ?? "192.168.1.2"; // crunchy01
  const user = cfg.get("host.user") ?? "root";
  const bridge = cfg.get("host.bridge") ?? "br0"; // LAN bridge the VM joins
  const pool = cfg.get("host.pool") ?? "default"; // libvirt storage pool (nvme)
  return { address, user, bridge, pool };
})();
// SSH key reaching the host AND the created VM (root). Path via ENV, never config.
// Resolution order: RUNNER_SSH_KEY_PATH (when non-empty), else
// $HOME/.ssh/foundation-test_ed25519. If neither can be resolved we fail fast with
// a clear message instead of trying to read the literal path "undefined/.ssh/…".
const sshKeyPath = (() => {
  const explicitPath = process.env.RUNNER_SSH_KEY_PATH;
  if (explicitPath) {
    return explicitPath;
  }
  const home = process.env.HOME;
  if (!home) {
    throw new Error(
      "Cannot locate the runner SSH key: set RUNNER_SSH_KEY_PATH (HOME is not defined).",
    );
  }
  return `${home}/.ssh/foundation-test_ed25519`;
})();

// The private key is wrapped as a Pulumi secret; the public half is read from the
// sibling `<path>.pub` file and trimmed so it can be embedded as one line.
const sshPrivateKey = pulumi.secret(fs.readFileSync(sshKeyPath, "utf8"));
const sshPublicKey = fs.readFileSync(`${sshKeyPath}.pub`, "utf8").trim();

// SSH connection to the runner host, used by the remote commands that target it.
const hostConn: command.types.input.remote.ConnectionArgs = {
  host: host.address,
  port: cfg.getNumber("host.sshPort") ?? 22,
  user: host.user,
  privateKey: sshPrivateKey,
};
// --- the forge (where the runner registration token is minted; reached over SSH,
// the command itself runs through docker) ---
const forge = (() => {
  const address = cfg.get("forge.address") ?? "204.168.234.72";
  const sshPort = cfg.getNumber("forge.sshPort") ?? 222;
  const user = cfg.get("forge.user") ?? "root";
  const instanceUrl = cfg.get("forge.instanceUrl") ?? "https://forge.olsitec.net";
  const container = cfg.get("forge.container") ?? "foundation-forgejo";
  return { address, sshPort, user, instanceUrl, container };
})();

// SSH connection to the forge machine; it reuses the same private key as the host.
const forgeConn: command.types.input.remote.ConnectionArgs = {
  privateKey: sshPrivateKey,
  user: forge.user,
  host: forge.address,
  port: forge.sshPort,
};
// --- the VM + runner shape ---
// Fallbacks used when the corresponding config keys are absent.
const DEFAULT_VM_NAMESERVERS = ["192.168.1.251", "1.1.1.1"];
const DEFAULT_UBUNTU_IMAGE_URL =
  "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img";
const DEFAULT_RUNNER_LABELS = "fenced:docker://node:20-bookworm";

const vm = {
  name: cfg.get("vm.name") ?? "foundation-runner-01",
  vcpu: cfg.getNumber("vm.vcpu") ?? 4,
  memoryMiB: cfg.getNumber("vm.memoryMiB") ?? 8192,
  diskGiB: cfg.getNumber("vm.diskGiB") ?? 40,
  // LAN address for the VM in CIDR form. An empty string selects DHCP (the runner
  // polls the forge outbound, so a fixed address is optional). The default matches
  // the hand-built VM.
  ipCidr: cfg.get("vm.ipCidr") ?? "192.168.1.15/24",
  gateway: cfg.get("vm.gateway") ?? "192.168.1.251",
  nameservers: cfg.getObject<string[]>("vm.nameservers") ?? DEFAULT_VM_NAMESERVERS,
  ubuntuImageUrl: cfg.get("vm.ubuntuImageUrl") ?? DEFAULT_UBUNTU_IMAGE_URL,
};

// The runner label(s). `fenced` routes `runs-on: fenced` jobs here; the part after
// the colon maps the label to a default job image on the VM's docker.
const runnerLabels = cfg.get("runner.labels") ?? DEFAULT_RUNNER_LABELS;
// =============================================================================
// libvirt provider — talks qemu+ssh to the host. It lazy-connects when a libvirt
// resource is created, so the host prerequisite (libvirtd) must already be met.
// =============================================================================
// Query string: authenticate with the private key file and skip known_hosts checks.
const libvirtUriQuery = [
  "sshauth=privkey",
  `keyfile=${sshKeyPath}`,
  "known_hosts_verify=ignore",
].join("&");
const libvirtUri = `qemu+ssh://${host.user}@${host.address}/system?${libvirtUriQuery}`;
const provider = new libvirt.Provider("runner-host", { uri: libvirtUri });
// --- host prep: the kube-router-proof bridged-FORWARD accept (idempotent timer) ---
// The host is a k3s node; kube-router sets FORWARD policy DROP and re-syncs iptables,
// which drops bridged VM↔LAN traffic and can flush a hand-added rule. A 60s systemd
// timer re-asserts it. This is a control-plane op on the HOST (not a libvirt resource).

// Oneshot service: insert the accept rule at the top of FORWARD unless `iptables -C`
// reports it is already present.
const BRIDGE_FORWARD_SERVICE_UNIT = `[Unit]
Description=Ensure bridged VM traffic passes iptables FORWARD (libvirt on a kube-router host)
After=network-online.target
[Service]
Type=oneshot
ExecStart=/bin/sh -c 'iptables -C FORWARD -m physdev --physdev-is-bridged -j ACCEPT 2>/dev/null || iptables -I FORWARD 1 -m physdev --physdev-is-bridged -j ACCEPT'`;

// Timer: first run 30s after boot, then every 60s after the last activation.
const BRIDGE_FORWARD_TIMER_UNIT = `[Unit]
Description=Re-assert the bridged-FORWARD accept rule (kube-router flushes iptables on resync)
[Timer]
OnBootSec=30s
OnUnitActiveSec=60s
AccuracySec=5s
[Install]
WantedBy=timers.target`;

// Shell script run on the host: write both unit files, reload systemd, start the timer.
const FIREWALL = `set -eu
cat > /etc/systemd/system/libvirt-bridge-forward.service <<'U'
${BRIDGE_FORWARD_SERVICE_UNIT}
U
cat > /etc/systemd/system/libvirt-bridge-forward.timer <<'U'
${BRIDGE_FORWARD_TIMER_UNIT}
U
systemctl daemon-reload
systemctl enable --now libvirt-bridge-forward.timer >/dev/null
echo "bridged-FORWARD timer active"`;

// The same script serves as both the create and the update command.
const firewall = new command.remote.Command("runner-host-firewall", {
  connection: hostConn,
  create: FIREWALL,
  update: FIREWALL,
});
// =============================================================================
// The VM: Ubuntu base volume → backed domain disk → cloud-init → domain.
// Ubuntu (not the Debian genericcloud image) because Debian's cloud-init wrote
// netplan the image never applied (no IPv4); Ubuntu renders + applies it cleanly.
// =============================================================================
const BYTES_PER_GIB = 1024 ** 3;
const libvirtOpts = { provider };

// Base image volume, sourced from the configured Ubuntu cloud image URL.
const base = new libvirt.Volume(
  `${vm.name}-base`,
  {
    pool: host.pool,
    name: `${vm.name}-ubuntu-base.img`,
    format: "qcow2",
    source: vm.ubuntuImageUrl,
  },
  libvirtOpts,
);

// The domain's disk: a qcow2 volume backed by the base image, sized from vm.diskGiB.
const disk = new libvirt.Volume(
  `${vm.name}-disk`,
  {
    pool: host.pool,
    name: `${vm.name}.qcow2`,
    format: "qcow2",
    size: vm.diskGiB * BYTES_PER_GIB,
    baseVolumeId: base.id,
  },
  libvirtOpts,
);
// cloud-init user-data: docker + qemu-guest-agent + our SSH key + a marker file.
// Key order inside the dumped object is kept stable so the rendered YAML is stable.
const cloudInitPackages = ["ca-certificates", "curl", "jq", "qemu-guest-agent"];
const cloudInitRunCmd = [
  // Install docker via the upstream convenience script.
  ["sh", "-c", "curl -fsSL https://get.docker.com | sh"],
  "systemctl enable --now docker qemu-guest-agent",
  // Marker file the registration script polls for before it touches docker.
  "touch /root/cloud-init-done",
];
const rootUser = {
  name: "root",
  lock_passwd: false,
  ssh_authorized_keys: [sshPublicKey],
};
const userDataDocument = yaml.dump({
  hostname: vm.name,
  manage_etc_hosts: true,
  ssh_pwauth: false,
  users: [rootUser],
  packages: cloudInitPackages,
  runcmd: cloudInitRunCmd,
});
const userData = `#cloud-config\n${userDataDocument}`;
// network-config v2 for enp1s0: static addressing when vm.ipCidr is set, DHCP
// otherwise. Ubuntu applies both.
const staticNicConfig = {
  dhcp4: false,
  addresses: [vm.ipCidr],
  routes: [{ to: "default", via: vm.gateway }],
  nameservers: { addresses: vm.nameservers },
};
const dhcpNicConfig = { dhcp4: true };
const networkConfig = yaml.dump({
  version: 2,
  ethernets: {
    enp1s0: vm.ipCidr ? staticNicConfig : dhcpNicConfig,
  },
});

// The cloud-init ISO carrying both the user-data and the network-config.
const cloudinit = new libvirt.CloudInitDisk(
  `${vm.name}-cloudinit`,
  {
    name: `${vm.name}-cloudinit.iso`,
    pool: host.pool,
    userData,
    networkConfig,
  },
  { provider },
);
// A real PTY console so `virsh console <vm>` works (learned the hard way):
// one serial console on port 0 plus one virtio console on port 1.
const domainConsoles = [
  { type: "pty", targetPort: "0", targetType: "serial" },
  { type: "pty", targetPort: "1", targetType: "virtio" },
];

// The runner VM itself: boots from the backed disk with the cloud-init ISO attached
// and joins the host's LAN bridge. It is only created after the host firewall
// command has run.
const domain = new libvirt.Domain(
  vm.name,
  {
    name: vm.name,
    vcpu: vm.vcpu,
    memory: vm.memoryMiB,
    cpu: { mode: "host-passthrough" },
    autostart: true,
    qemuAgent: true,
    cloudinit: cloudinit.id,
    disks: [{ volumeId: disk.id }],
    networkInterfaces: [{ bridge: host.bridge }],
    consoles: domainConsoles,
  },
  { provider, dependsOn: [firewall] },
);
// =============================================================================
// Register the Forgejo runner: mint a token on the forge, then register + run
// act_runner inside the VM (docker), reachable once cloud-init has installed docker.
// =============================================================================
// 1) token — instance-scoped registration token, minted over SSH via docker exec.
const tokenCmd = new command.remote.Command(
  "runner-token",
  {
    connection: forgeConn,
    create: `docker exec -u git ${forge.container} forgejo actions generate-runner-token`,
    // Re-mint when the label set or the forge container name changes.
    triggers: [runnerLabels, forge.container],
  },
  // The command's raw stdout IS the token. Without this option it would be stored
  // in the stack state in plaintext, even though the derived value below is secret.
  // NOTE(review): the provider may still echo stdout in the live update log —
  // confirm whether your pulumi-command version offers a logging switch for that.
  { additionalSecretOutputs: ["stdout"] },
);

// Trimmed token, explicitly marked secret so everything interpolating it stays secret.
const runnerToken = pulumi.secret(tokenCmd.stdout.apply((s) => s.trim()));
// 2) register + run — connect to the VM (its static/DHCP IP). The script waits for
// cloud-init (docker) to be ready, registers idempotently, and runs the daemon with
// the host docker gid so uid-1000 act_runner can reach the socket.
//
// Address used to SSH into the VM:
//   - static mode (vm.ipCidr set): the address part of the CIDR, known up front;
//   - DHCP mode (vm.ipCidr empty): the first address the libvirt domain reports for
//     its first interface. It must NEVER fall back to the hypervisor host's address:
//     that would run the registration script as root on the host, not in the VM.
// NOTE(review): for a bridged interface the reported address presumably comes from
// the qemu guest agent (the domain sets qemuAgent: true) and may only be populated
// when the interface waits for a lease — confirm against the libvirt provider docs.
const vmIp: pulumi.Input<string> = vm.ipCidr
  ? vm.ipCidr.split("/")[0]
  : domain.networkInterfaces.apply((nics) => {
      const reported = nics?.[0]?.addresses?.[0];
      if (!reported) {
        throw new Error(
          "vm.ipCidr is empty (DHCP) and the libvirt domain reported no address; " +
            "set vm.ipCidr or refresh once the guest agent is up.",
        );
      }
      return reported;
    });
// Shell script executed inside the VM to register and start the Forgejo runner.
//
// Steps:
//   1. Poll (60 × 5s) for the cloud-init marker file and a responsive docker daemon;
//      abort with a clear message if that never happens instead of falling through.
//   2. Read the gid owning the docker socket so the runner container can join it.
//   3. Ensure the `crunchy-runner-data` volume exists and pull the runner image.
//   4. If /data/.runner is non-empty the runner is already registered; otherwise
//      register it against the forge with the minted token, VM name and labels.
//   5. Replace any existing `forgejo-runner` container and start the daemon.
//
// The token is passed to `register` directly. It must not be staged in a file under
// /tmp of a throwaway container: every `docker run --rm` starts a fresh container
// and only /data is on the shared volume, so such a file is gone by the time a
// second container tries to read it and the token would always be empty.
//
// Each command is kept on a single line on purpose: a backslash-newline inside a
// JS template literal is a line continuation and never reaches the shell.
const REGISTER = pulumi.interpolate`set -eu
IMG=code.forgejo.org/forgejo/runner:6
ready=0
for i in $(seq 1 60); do
  if [ -f /root/cloud-init-done ] && docker info >/dev/null 2>&1; then ready=1; break; fi
  sleep 5
done
if [ "$ready" -ne 1 ]; then echo "cloud-init/docker not ready after 5 minutes" >&2; exit 1; fi
DGID=$(stat -c %g /var/run/docker.sock)
docker volume inspect crunchy-runner-data >/dev/null 2>&1 || docker volume create crunchy-runner-data >/dev/null
docker pull -q "$IMG" >/dev/null
if docker run --rm -v crunchy-runner-data:/data --entrypoint sh "$IMG" -c '[ -s /data/.runner ]'; then
  echo "already registered"
else
  docker run --rm -v crunchy-runner-data:/data --entrypoint /bin/forgejo-runner "$IMG" register --no-interactive --instance '${forge.instanceUrl}' --token '${runnerToken}' --name '${vm.name}' --labels '${runnerLabels}' >/dev/null
  echo "registered"
fi
docker rm -f forgejo-runner >/dev/null 2>&1 || true
docker run -d --name forgejo-runner --restart unless-stopped --group-add "$DGID" -v crunchy-runner-data:/data -v /var/run/docker.sock:/var/run/docker.sock --entrypoint /bin/forgejo-runner "$IMG" daemon >/dev/null
echo "runner daemon up"`;
// SSH connection into the runner VM itself (root, port 22, same key as the host).
const vmConn: command.types.input.remote.ConnectionArgs = {
  host: vmIp,
  port: 22,
  user: "root",
  privateKey: sshPrivateKey,
};

// Runs the registration script inside the VM. It re-runs whenever the domain, the
// minted token or the label set changes, and only after both the domain and the
// token command exist.
const register = new command.remote.Command(
  "runner-register",
  {
    connection: vmConn,
    create: REGISTER,
    triggers: [domain.id, runnerToken, runnerLabels],
  },
  { dependsOn: [domain, tokenCmd] },
);
// Stack outputs: where the fleet lives and how jobs are routed to it.
const runnerHost = host.address; // address of the libvirt host
const runnerVmIp = vmIp; // address used to reach the runner VM
const runnerLabelsOut = runnerLabels; // label set the runner registered with
const forgeInstance = forge.instanceUrl; // forge the runner is registered against
export { runnerHost, runnerVmIp, runnerLabelsOut, forgeInstance };

// `register` is created for its side effects only; reference it so it is not
// flagged as unused.
void register;