From 6a069bf963aaf53b5b614dd9152b0a6b4d540de0 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 13:11:25 -0400 Subject: [PATCH 1/7] daemon: skip reconciliation after reboot is issued After the daemon issues a reboot, it now returns early from Reconcile() on subsequent invocations. This prevents the daemon from running bootc status on a node that is mid-shutdown, which could return garbage data. This also makes the Rebooting state durable (it persists until the node actually goes down), so the e2e test can now assert on it as an intermediate lifecycle step. This also matches what the MCO does. Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- internal/daemon/reconciler.go | 19 +++++++------------ internal/daemon/reconciler_test.go | 7 +++++++ internal/daemon/suite_test.go | 12 +++++++----- test/e2e/bootcnode_test.go | 17 ++++++++++++++--- 4 files changed, 35 insertions(+), 20 deletions(-) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index d86e697..13a13c3 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -56,8 +56,8 @@ type BootcNodeReconciler struct { inflight stageOp stageDone chan event.GenericEvent - // rebootIssued tracks whether a reboot has been issued so classifyAction - // can distinguish the Staged→Rebooting. + // rebootIssued tracks whether a reboot has been issued so Reconcile + // can skip reconciliation while the node is shutting down. rebootIssued bool } @@ -78,6 +78,11 @@ func (r *BootcNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, nil } + if r.rebootIssued { + log.Info("Reboot already issued, skipping reconcile") + return ctrl.Result{}, nil + } + var bn bootcv1alpha1.BootcNode if err := r.Get(ctx, req.NamespacedName, &bn); err != nil { if apierrors.IsNotFound(err) { @@ -211,9 +216,6 @@ func (r *BootcNodeReconciler) reconcileBootcNode(ctx context.Context, bn *bootcv reason = bootcv1alpha1.NodeReasonRebooting res.needsReboot = true - case actionAwaitReboot: - reason = bootcv1alpha1.NodeReasonRebooting - case actionAwaitBooted: reason = bootcv1alpha1.NodeReasonStaged log.Info("Image staged", "image", desiredImage) @@ -331,7 +333,6 @@ const ( actionAwaitStage // stage in-flight, waiting for completion actionAwaitBooted // staged, waiting for reboot approval actionReboot // staged + approved, issue reboot - actionAwaitReboot // reboot issued, waiting for completion ) func (r *BootcNodeReconciler) classifyAction(bn *bootcv1alpha1.BootcNode, digested reference.Digested, desiredImage string) updateAction { @@ -348,12 +349,6 @@ func (r *BootcNodeReconciler) classifyAction(bn *bootcv1alpha1.BootcNode, digest return actionAwaitBooted } - // rebootIssued is volatile: if the daemon restarts it resets to false. - // That is safe because either the reboot already landed (booted digest - // matches and we return idle earlier) or it hasn't and we re-issue it. - if r.rebootIssued { - return actionAwaitReboot - } return actionReboot } diff --git a/internal/daemon/reconciler_test.go b/internal/daemon/reconciler_test.go index e5a5c8d..bd9f3d1 100644 --- a/internal/daemon/reconciler_test.go +++ b/internal/daemon/reconciler_test.go @@ -100,6 +100,7 @@ func TestReconcileBootcStatusError(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.setStatusErr(errors.New(bootcStatusErrMsg)) bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefA) @@ -127,6 +128,7 @@ func TestStagingTriggered(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) @@ -164,6 +166,7 @@ func TestStagingError(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) fake.setStageErr(errors.New(stageErrMsg)) @@ -197,6 +200,7 @@ func TestAlreadyStaged(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -226,6 +230,7 @@ func TestRebootingSet(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -255,6 +260,7 @@ func TestRollback(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -286,6 +292,7 @@ func TestCancelInflightStage(t *testing.T) { ctx := context.Background() fake.reset() + reconciler.reset() fake.status = newBootcStatus(testutil.DigestA) firstBlock := make(chan struct{}) diff --git a/internal/daemon/suite_test.go b/internal/daemon/suite_test.go index f2cf140..a695879 100644 --- a/internal/daemon/suite_test.go +++ b/internal/daemon/suite_test.go @@ -22,9 +22,10 @@ import ( const testNodeName = "test-node" var ( - testEnv *envtest.Environment - k8sClient client.Client - fake *fakeExecutor + testEnv *envtest.Environment + k8sClient client.Client + fake *fakeExecutor + reconciler *BootcNodeReconciler ) func TestMain(m *testing.M) { @@ -65,12 +66,13 @@ func TestMain(m *testing.M) { os.Exit(1) } - if err := (&BootcNodeReconciler{ + reconciler = &BootcNodeReconciler{ Client: mgr.GetClient(), Scheme: mgr.GetScheme(), NodeName: testNodeName, Executor: fake, - }).SetupWithManager(mgr); err != nil { + } + if err := reconciler.SetupWithManager(mgr); err != nil { fmt.Fprintf(os.Stderr, "Failed to setup reconciler: %v\n", err) os.Exit(1) } diff --git a/test/e2e/bootcnode_test.go b/test/e2e/bootcnode_test.go index 1177bfa..2b57b22 100644 --- a/test/e2e/bootcnode_test.go +++ b/test/e2e/bootcnode_test.go @@ -131,10 +131,21 @@ func TestUpdateReboot(t *testing.T) { t.Logf("Patched pool to update image %s", updateRef) - // Phase 3: Wait for Idle with the update digest — proves the full + // Phase 3: Wait for Rebooting — the daemon skips reconciliation after + // issuing a reboot, so this state is durable until the node goes down. + g.Eventually(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &bn)).To(Succeed()) + g.Expect(bn.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonRebooting), + ))) + }).WithTimeout(5*time.Minute).Should(Succeed(), "expected node to reach Rebooting state") + + t.Logf("Node %q is Rebooting", nodeName) + + // Phase 4: Wait for Idle with the update digest — proves the full // update lifecycle completed (staging, reboot, boot into new image). - // We don't assert on intermediate states (Staging, Rebooting) because - // they are too transient to catch reliably with polling. g.Eventually(func(g Gomega) { g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &bn)).To(Succeed()) g.Expect(bn.Status.Booted).NotTo(BeNil()) From d7822d77a4ee9333d0d577d65d458a970d43d8d4 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 14:45:57 -0400 Subject: [PATCH 2/7] daemon: move reset() to stageOp, drop rebootIssued clearing The `reset()` method only needs to clear the backoff retry counter, so it belongs on `stageOp` rather than `BootcNodeReconciler`. Clearing `rebootIssued` is unnecessary in production; it doesn't make sense to "un-issue" a reboot. The way it gets "reset" is by the daemon naturally getting restarted as part of the reboot. That said, add a `resetDaemon()` test helper to make resetting everything easier. Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- internal/daemon/reconciler.go | 13 ++++++------- internal/daemon/reconciler_test.go | 21 +++++++-------------- internal/daemon/suite_test.go | 8 ++++++++ 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index 13a13c3..534c3a8 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -178,9 +178,9 @@ func (r *BootcNodeReconciler) reconcileBootcNode(ctx context.Context, bn *bootcv } // Nothing to do the desired image matches the booted ones. - // Rest the reconciler to start from a clean state + // Reset the stage backoff to start from a clean state. if digested.Digest().String() == bn.Status.Booted.ImageDigest { - r.reset() + r.inflight.reset() return reconcileResult{}, nil } @@ -268,11 +268,10 @@ func (s *stageOp) acquire(log logr.Logger, image string, cancel context.CancelFu s.err = nil } -func (r *BootcNodeReconciler) reset() { - r.rebootIssued = false - r.inflight.mu.Lock() - defer r.inflight.mu.Unlock() - r.inflight.retries = 0 +func (s *stageOp) reset() { + s.mu.Lock() + defer s.mu.Unlock() + s.retries = 0 } // run executes bootc stage in a goroutine. The results are delivered via the done channel. diff --git a/internal/daemon/reconciler_test.go b/internal/daemon/reconciler_test.go index bd9f3d1..cdf3abc 100644 --- a/internal/daemon/reconciler_test.go +++ b/internal/daemon/reconciler_test.go @@ -99,8 +99,7 @@ func TestReconcileBootcStatusError(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.setStatusErr(errors.New(bootcStatusErrMsg)) bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefA) @@ -127,8 +126,7 @@ func TestStagingTriggered(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) @@ -165,8 +163,7 @@ func TestStagingError(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) fake.setStageErr(errors.New(stageErrMsg)) @@ -199,8 +196,7 @@ func TestAlreadyStaged(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -229,8 +225,7 @@ func TestRebootingSet(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -259,8 +254,7 @@ func TestRollback(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) @@ -291,8 +285,7 @@ func TestCancelInflightStage(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.reset() - reconciler.reset() + resetDaemon() fake.status = newBootcStatus(testutil.DigestA) firstBlock := make(chan struct{}) diff --git a/internal/daemon/suite_test.go b/internal/daemon/suite_test.go index a695879..ab288c4 100644 --- a/internal/daemon/suite_test.go +++ b/internal/daemon/suite_test.go @@ -28,6 +28,14 @@ var ( reconciler *BootcNodeReconciler ) +// resetDaemon resets all volatile state on the shared reconciler and +// fake executor so each test starts from a clean slate. +func resetDaemon() { + fake.reset() + reconciler.rebootIssued = false + reconciler.inflight.reset() +} + func TestMain(m *testing.M) { ctrl.SetLogger(zap.New(zap.UseDevMode(true))) From a04cda323c1fc0e73f69fe120ea6d50ed173f249 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 15:34:06 -0400 Subject: [PATCH 3/7] daemon: skip stage when context is already cancelled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After acquiring runMu, check ctx.Err() before calling Stage(). This prevents stale goroutines from briefly starting a bootc switch process with the wrong image after rapid spec changes (e.g. B→C→D). Multiple goroutines may still pile up on the mutex, but each cancelled one exits immediately without spawning a process. Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- internal/daemon/reconciler.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index 534c3a8..e3aea86 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -275,12 +275,22 @@ func (s *stageOp) reset() { } // run executes bootc stage in a goroutine. The results are delivered via the done channel. +// +// Technically, multiple goroutines may pile up here waiting on runMu if +// there's multiple rapid image changes (e.g. B→C→D spawns three goroutines). +// This is harmless: each cancelled goroutine checks ctx.Err() after acquiring +// the lock and exits immediately without starting a process. func (s *stageOp) run(ctx context.Context, nodeName, image string, executor bootc.Executor, done chan<- event.GenericEvent) { s.runMu.Lock() defer s.runMu.Unlock() log := logf.FromContext(context.Background()).WithValues("node", nodeName, "image", image) + if ctx.Err() != nil { + log.Info("Stage skipped, context already cancelled") + return + } + // TODO: exec bootc switch async and select on the cancel channel to send SIGINT for graceful shutdown. err := executor.Stage(ctx, image) From 389c2b0c787d965dd0383e9de3cff945c7fc1b6b Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 15:42:24 -0400 Subject: [PATCH 4/7] daemon: remove stale TODO about SIGINT for bootc cancellation exec.CommandContext already sends SIGKILL on context cancellation, and bootc has no SIGINT handler, so graceful vs. hard termination is equivalent. Replace the TODO with a brief note about the existing behavior. Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- internal/daemon/reconciler.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index e3aea86..c3ea238 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -291,7 +291,8 @@ func (s *stageOp) run(ctx context.Context, nodeName, image string, executor boot return } - // TODO: exec bootc switch async and select on the cancel channel to send SIGINT for graceful shutdown. + // NB: this uses exec.CommandContext, which will SIGKILL bootc if the + // context is cancelled. err := executor.Stage(ctx, image) s.mu.Lock() From 78d312a36fc566ec11ad7838fc5be09f536ed582 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 22:15:39 -0400 Subject: [PATCH 5/7] daemon: run bootc switch via systemd-run on the host Run bootc switch as a transient systemd unit on the host rather than directly inside the daemon pod's cgroup. It's just cleaner to structure it this way (e.g. we don't have to account for bootc memory requirements in our own, and also the operation transparently survives daemon pod restarts), and will also allow in the future using various systemd knobs, such as those around IO scheduling (see #61). Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- internal/bootc/executor.go | 93 +++++++++++++++++++++++++++++++++++--- 1 file changed, 87 insertions(+), 6 deletions(-) diff --git a/internal/bootc/executor.go b/internal/bootc/executor.go index 83ff77f..3452930 100644 --- a/internal/bootc/executor.go +++ b/internal/bootc/executor.go @@ -7,7 +7,9 @@ import ( "fmt" "os/exec" "strings" + "time" + "github.com/go-logr/logr" logf "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -31,7 +33,7 @@ func NewHostExecutor() *HostExecutor { func (e *HostExecutor) nsenterCmd(ctx context.Context, args ...string) *exec.Cmd { base := []string{ "--target", "1", - "--mount", "--pid", + "--mount", "--pid", "--cgroup", "--setuid", "0", "--setgid", "0", "--env", "--", } @@ -47,19 +49,98 @@ func (e *HostExecutor) Status(ctx context.Context) ([]byte, error) { return out, nil } +// stageUnitName is the systemd transient unit name used for bootc switch. +const stageUnitName = "bootc-operator-switch.service" + func (e *HostExecutor) Stage(ctx context.Context, image string) error { log := logf.FromContext(ctx) - // TODO: use --download-only once available (https://github.com/bootc-dev/bootc/issues/2137) - cmd := e.nsenterCmd(ctx, "bootc", "switch", image) + // Stop any stale unit. This does also mean that if e.g. the daemon is + // restarted while a unit is outstanding, we'll still stop it even if + // it was working towards the same target image. This is fine though, + // we still eventually converge. And at least e.g. any layers already + // pulled will be skipped. Note also that `bootc switch` returns rc=0 + // if the staged image already matches, so even if the previous unit + // just completed, this new unit will essentially be a no-op. + e.stopStageUnit() + + // Ideally we'd use systemd-run's `--pipe` here, which would avoid + // having to fetch the unit journal down below, but SELinux blocks it + // (dbus-broker can't access container-labeled fds). + cmd := e.nsenterCmd(ctx, + "systemd-run", "--wait", "--collect", + "--unit", stageUnitName, + // TODO: use --download-only once available + // (https://github.com/bootc-dev/bootc/issues/2137) + "bootc", "switch", image, + ) + cmd.Cancel = func() error { + e.stopStageUnit() + return nil + } log.Info("Executing", "cmd", strings.Join(cmd.Args, " ")) - out, err := cmd.CombinedOutput() - if err != nil { - return fmt.Errorf("running bootc switch: %s: %w", out, err) + cursor := e.journalCursor() + if err := cmd.Run(); err != nil { + // Were we cancelled? + if ctx.Err() != nil { + return ctx.Err() + } + e.copyJournalUnitLogs(log, stageUnitName, cursor) + return fmt.Errorf("running bootc switch: %w", err) } return nil } +// stopStageUnit stops the bootc-operator-switch transient unit if it +// is running. Errors are ignored: the unit may not exist. +func (e *HostExecutor) stopStageUnit() { + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + _ = e.nsenterCmd(ctx, "systemctl", "stop", stageUnitName).Run() +} + +// journalCursor returns the current journal cursor position. Used to scope a +// future journal query to only entries after the cursor position. +func (e *HostExecutor) journalCursor() string { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + out, err := e.nsenterCmd(ctx, + "journalctl", "--show-cursor", "-n", "0", "-o", "cat", + ).Output() + if err != nil { + return "" + } + // Output format: "-- cursor: s=...;i=...;b=...;..." + if after, ok := strings.CutPrefix(strings.TrimSpace(string(out)), "-- cursor: "); ok { + return after + } + return "" +} + +// copyJournalUnitLogs logs recent journal output from the given systemd unit. +// If cursor is non-empty, only entries after that cursor are shown; otherwise +// shows all entries. +func (e *HostExecutor) copyJournalUnitLogs(log logr.Logger, unit string, cursor string) { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + args := []string{"journalctl", "-o", "cat", "--no-pager", + "-u", unit, + } + if cursor != "" { + args = append(args, "--after-cursor="+cursor) + } + out, err := e.nsenterCmd(ctx, args...).Output() + if err != nil { + log.Error(err, "Failed to read unit journal", "unit", unit) + return + } + for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { + if line != "" { + log.Info(line, "unit", unit) + } + } +} + func (e *HostExecutor) Reboot(ctx context.Context) error { log := logf.FromContext(ctx) From b2b98330f9cdc9e948691eeae9de1979a851b45e Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 22:16:15 -0400 Subject: [PATCH 6/7] Revert "daemon: increase memory resource" This reverts commit 7c5e31c6aa15e12cd23def80464d9183169a643a. The memory bump to 512Mi is no longer needed: bootc switch now runs as a transient systemd unit on the host (outside the pod cgroup), so its memory usage does not count against the daemon pod limit. 128Mi is sufficient for the daemon itself (Go binary, controller-runtime, API watches). Assisted-by: Pi (Claude Opus 4.6) Signed-off-by: Jonathan Lebon --- config/daemon/daemon.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/daemon/daemon.yaml b/config/daemon/daemon.yaml index a592b3f..d4a7e25 100644 --- a/config/daemon/daemon.yaml +++ b/config/daemon/daemon.yaml @@ -38,7 +38,7 @@ spec: resources: limits: cpu: 500m - memory: 512Mi + memory: 128Mi requests: cpu: 10m memory: 64Mi From 26e05cd5a941b2b9e6153dbb6cd758270756b66a Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Fri, 19 Jun 2026 22:54:36 -0400 Subject: [PATCH 7/7] docs: mark milestones 4b and 4c as complete Signed-off-by: Jonathan Lebon --- docs/IMPLEMENTATION_PLAN.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/IMPLEMENTATION_PLAN.md b/docs/IMPLEMENTATION_PLAN.md index bba08cc..d6c6bd6 100644 --- a/docs/IMPLEMENTATION_PLAN.md +++ b/docs/IMPLEMENTATION_PLAN.md @@ -207,7 +207,7 @@ the full controller+daemon loop can be tested end-to-end. DaemonSet, label a node with `bootc.dev/managed`, and verify the daemon pod starts on that node. -### 4b. Core loop +### 4b. Core loop ✅ - Binary identifies its node name (downward API env var) - Watches its single BootcNode via field selector on `metadata.name` @@ -221,7 +221,7 @@ pod starts on that node. **Validation (e2e):** enhance the existing e2e test to verify the daemon populates BootcNode status from `bootc status`. -### 4c. State machine +### 4c. State machine ✅ - Detect `spec.desiredImage != booted` → set `Idle=False reason=Staging`, run `bootc switch ` (no `--download-only` for now;