From a9fd211b11cd7e9f719a000901f4e8e2c99a27b5 Mon Sep 17 00:00:00 2001 From: gabriel engvall Date: Fri, 5 Jun 2026 20:36:37 +0200 Subject: [PATCH 1/2] fix(build): keep SSH sessions alive so a dropped connection cannot hang the build cf SSH invocations had no keepalive, so a long quiet remote step (e.g. a large artifact upload in CF_UPLOAD_CMD) could leave packer streaming SSH waiting forever on a half-open connection, hanging the build with nothing running on the node. Add ServerAliveInterval=15 / ServerAliveCountMax=6 to every ssh call so idle sessions stay alive and a dead peer is detected within ~90s. --- src/build/remote.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/build/remote.ts b/src/build/remote.ts index 368b826..dd3c256 100644 --- a/src/build/remote.ts +++ b/src/build/remote.ts @@ -1,6 +1,11 @@ import { execa, ExecaError } from 'execa' import { redactSensitive } from '../util.ts' +// Keep idle SSH sessions alive — and detect a dead peer — so a long quiet remote +// step (e.g. a multi-hundred-MB artifact upload in CF_UPLOAD_CMD) can't leave the +// build hanging forever on a half-open connection. +const SSH_OPTS = ['-o', 'ServerAliveInterval=15', '-o', 'ServerAliveCountMax=6'] + // ── SIGINT cleanup ──────────────────────────────────────────────────────────── type KillableProc = { kill: (signal?: string) => boolean } @@ -25,7 +30,7 @@ export const captureRemote = async ( cmd: string ): Promise => { try { - const { stdout } = await execa('ssh', [target, cmd], { + const { stdout } = await execa('ssh', [...SSH_OPTS, target, cmd], { stdin: 'inherit', stderr: 'inherit', }) @@ -45,14 +50,14 @@ export const remoteStreaming = ( target: string, cmd: string, onLine?: (line: string) => void -): Promise => streaming('ssh', [target, cmd], onLine) +): Promise => streaming('ssh', [...SSH_OPTS, target, cmd], onLine) // Allocates a PTY so remote programs (e.g. 
wget) detect a terminal and show // their native progress bar rather than falling back to dot-style output. export const remoteStreamingPty = ( target: string, cmd: string -): Promise => streaming('ssh', ['-t', '-t', target, cmd]) +): Promise => streaming('ssh', [...SSH_OPTS, '-t', '-t', target, cmd]) // Wget exit codes worth surfacing. See man wget(1) EXIT STATUS. const WGET_EXIT: Record = { @@ -75,7 +80,7 @@ export const remoteWgetCapture = async ( onLine: (line: string) => void, context?: { url?: string; what?: string } ): Promise => { - const proc = execa('ssh', ['-t', '-t', target, `{ ${cmd}; } 2>&1`], { + const proc = execa('ssh', [...SSH_OPTS, '-t', '-t', target, `{ ${cmd}; } 2>&1`], { stdin: 'pipe', stdout: 'pipe', stderr: 'ignore', From 86beefd79afcf40f9a9e38e8681d761170ca22b5 Mon Sep 17 00:00:00 2001 From: gabriel engvall Date: Fri, 5 Jun 2026 20:36:37 +0200 Subject: [PATCH 2/2] feat(bootstrap): open the build network through the Proxmox firewall when enabled On nodes with the datacenter or host firewall enabled, PVEFW-INPUT drops the build VM connection to the packer HTTP server (preseed/kickstart) on vmbr1, so installer builds hang at Waiting for SSH. Add a bootstrap step that detects an enabled pve-firewall and adds a host rule allowing the build subnet in. A pve-firewall host rule (not a post-up iptables rule) survives reboots and firewall reloads. No-op when the firewall is disabled. 
---
 src/bootstrap.ts | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/src/bootstrap.ts b/src/bootstrap.ts
index 80b4d2c..52eb22a 100644
--- a/src/bootstrap.ts
+++ b/src/bootstrap.ts
@@ -232,6 +232,53 @@ const stepVmbr1: Step = {
   },
 }
 
+const BUILD_NET_SUBNET = '10.0.0.0/24'
+const BUILD_NET_FW_COMMENT = 'cofoundry build network (packer HTTP)'
+
+// When the Proxmox firewall is enabled, the host's PVEFW-INPUT chain drops the
+// build VM's connection to packer's HTTP server (preseed/kickstart fetch) on
+// vmbr1 — the build then hangs at "Waiting for SSH". A pve-firewall host rule is
+// the correct fix: it survives reboots AND firewall reloads, unlike a post-up
+// iptables rule which pve-firewall flushes whenever it recompiles. No-op when the
+// firewall is disabled (nothing to open).
+const stepBuildNetFirewall: Step = {
+  id: 'build-net-firewall',
+  label: 'allow build network through Proxmox firewall',
+  inScope: plan => plan.needBuildNet,
+  // Done when the firewall is off, or when a host rule carrying our comment
+  // already exists; otherwise the rule still has to be created.
+  probe: async plan => {
+    const status = await sshCapture(
+      plan.target,
+      'pve-firewall status 2>/dev/null'
+    )
+    if (!/enabled/i.test(status.stdout)) {
+      return {
+        done: true,
+        note: 'Proxmox firewall disabled — no rule needed',
+      }
+    }
+    const rules = await sshCapture(
+      plan.target,
+      `pvesh get /nodes/$(hostname)/firewall/rules --output-format json 2>/dev/null`
+    )
+    // Match on the exact comment `apply` writes, so probe and apply cannot
+    // drift apart if the comment text is ever changed.
+    return rules.stdout.includes(BUILD_NET_FW_COMMENT)
+      ? { done: true, note: 'host firewall rule already present' }
+      : { done: false, note: 'Proxmox firewall on — opening build net' }
+  },
+  // Adds an inbound ACCEPT host rule for the build subnet, tagged with
+  // BUILD_NET_FW_COMMENT so `probe` can recognise it on later runs.
+  apply: async plan => {
+    await remoteStreaming(
+      plan.target,
+      `pvesh create /nodes/$(hostname)/firewall/rules --action ACCEPT --type in --source ${BUILD_NET_SUBNET} --enable 1 --comment ${shellQuote(BUILD_NET_FW_COMMENT)}`
+    )
+    return { note: `allowed ${BUILD_NET_SUBNET} in (host firewall rule)` }
+  },
+}
+
 const stepDnsmasq: Step = {
   id: 'dnsmasq',
   label: 'install dnsmasq',
@@ -347,6 +394,7 @@ const ALL_STEPS: Step[] = [
   stepAwscli,
   stepIsoCache,
   stepVmbr1,
+  stepBuildNetFirewall,
   stepDnsmasq,
   stepDnsmasqConf,
   stepNetslotDir,