diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/audit.go b/cmd/stellar-rpc/internal/fullhistory/streaming/audit.go
new file mode 100644
index 000000000..c27b2cd31
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/audit.go
@@ -0,0 +1,227 @@
+package streaming
+
+import (
+	"fmt"
+	"strings"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/metastore"
+)
+
+// The `audit` operation — the executable form of the design's invariant audits
+// (design-docs/full-history-streaming-workflow.md "Correctness", line 1364:
+// "an `audit` admin command can implement them directly"). It composes the
+// catalog's key-walking primitives and a filesystem walk against the layout
+// bijection; it NEVER reaches into the phase scans that MAINTAIN the invariants
+// (the resolver, freeze, discard, prune), so a bug in any of those surfaces here
+// as a real violation rather than being silently judged acceptable by the same
+// code that produced it (the design's "None of the invariants reference the
+// phase scans" requirement).
+//
+// Quiescence makes the walks meaningful: between lifecycle ticks the daemon is
+// idle, so the structural invariants (INV-2 at-quiescence clauses, INV-3, INV-4)
+// hold. The audit is therefore meant to run against a daemon sitting idle
+// between ticks (or a stopped one). It does NOT itself take locks or open the
+// store — Audit operates on an already-open Catalog, and RunAudit is the
+// read-only operator entrypoint that opens the store for a stopped daemon.
+//
+// Each invariant maps to one check, exactly as the design prescribes:
+//
+//   - INV-2 (single canonical state): walk meta-store keys, cross-check the
+//     FORBIDDEN co-existences — a "freezing"/"pruning" artifact key surviving
+//     quiescence; a hot key for a chunk cold artifacts fully serve. The two
+//     transients the design explicitly TOLERATES are excluded: a hot key reading
+//     "transient" (an in-flight directory op bracket), and a "freezing" artifact
+//     key for a chunk strictly ABOVE completeThrough (the hot-volume-loss tail no
+//     source can yet repair).
+//   - INV-3 (disk matches meta-store): walk the filesystem against the meta store
+//     in BOTH directions — every artifact/hot path on disk must trace back to a
+//     key (no orphan files, no duplicate artifacts), and every key naming an
+//     expected path that is in a final/tolerated state must have its file (no
+//     dangling keys).
+//   - INV-4 (retention bound): walk meta-store keys, compare each key's ledger
+//     range to effectiveRetentionFloor; nothing strictly below the floor may
+//     persist.
+//   - INV-1 (read correctness): OPTIONAL deep mode — re-derive sampled frozen
+//     artifacts via a conformant LedgerBackend and byte-compare against the
+//     on-disk file. The heavy re-derivation is injected (DeepDeriver) rather than
+//     hardcoded, matching the design's "via a conformant LedgerBackend" framing;
+//     when no deriver is supplied the deep check is skipped.
+
+// Invariant is the design label of a checked invariant (e.g. "INV-2") in reports.
+type Invariant string
+
+const (
+	InvSingleCanonicalState Invariant = "INV-2" // single canonical state (meta-store key walk)
+	InvDiskMatchesMeta      Invariant = "INV-3" // disk matches meta store, checked in both directions
+	InvRetentionBound       Invariant = "INV-4" // retention bound (nothing wholly below the floor)
+	InvReadCorrectness      Invariant = "INV-1" // read correctness (optional deep mode)
+)
+
+// Violation is a single detected invariant breach: the invariant it breaks, the
+// offending key and/or path, and a human-readable explanation. Key or Path stays
+// empty when the breach is not tied to one (e.g. a per-window count).
+type Violation struct {
+	Invariant Invariant
+	Key       string // meta-store key; empty when not applicable
+	Path      string // on-disk path; empty when not applicable
+	Detail    string // human-readable explanation of the breach
+}
+
+// String formats the violation on one line: "<invariant>: <detail>", then
+// " [key=…]" and " [path=…]" for whichever of Key and Path is non-empty.
+func (v Violation) String() string {
+	var sb strings.Builder
+	sb.WriteString(string(v.Invariant) + ": " + v.Detail)
+	if len(v.Key) > 0 {
+		fmt.Fprintf(&sb, " [key=%s]", v.Key)
+	}
+	if len(v.Path) > 0 {
+		fmt.Fprintf(&sb, " [path=%s]", v.Path)
+	}
+	return sb.String()
+}
+
+// AuditReport is the complete result of one audit pass. The audit does not stop
+// at the first breach — an operator wants the whole picture — so all are listed.
+type AuditReport struct {
+	// CompleteThrough is the completeThrough snapshot the audit derived; the
+	// floor and the INV-2 above-completeThrough tolerance are computed from it.
+	CompleteThrough uint32
+	// Floor is the effective retention floor at CompleteThrough.
+	Floor uint32
+	// Violations are every breach found, in check order (INV-2, INV-3, INV-4,
+	// then INV-1 deep) and within a check in key/path order.
+	Violations []Violation
+	// DeepChecked counts artifacts the INV-1 deep mode byte-compared (0 with no deriver).
+	DeepChecked int
+}
+
+// Clean reports whether the audit pass recorded no violations at all.
+func (r AuditReport) Clean() bool {
+	return len(r.Violations) == 0
+}
+
+// DeepDeriver re-derives one per-chunk cold artifact from a conformant
+// LedgerBackend and returns its canonical bytes for the INV-1 deep mode's
+// byte-compare against the on-disk file. It is injected so the audit composes
+// the heavy re-derivation rather than hardcoding the cold pipeline: production
+// wires a deriver backed by the same RunColdChunk extractors. ok=false declines
+// this (chunk, kind) (e.g. an unsupported kind) — treated as "not sampled", never
+// as a violation; a non-nil err aborts the audit instead of being reported as one.
+type DeepDeriver interface {
+	DeriveArtifact(c chunk.ID, kind Kind) (data []byte, ok bool, err error)
+}
+
+// AuditOptions tunes one audit pass; with Deep left nil only INV-2/3/4 are checked.
+type AuditOptions struct {
+	// RetentionChunks is the sliding-floor width the daemon runs with — the same
+	// knob the prune scan and reader gate read — so INV-4 checks against the EXACT
+	// floor the daemon enforces. RunAudit fills a zero value from cfg when set there.
+	RetentionChunks uint32
+
+	// Deep, when non-nil, enables the INV-1 deep check: every Nth frozen cold
+	// artifact (DeepSampleEvery) is re-derived and byte-compared. nil skips INV-1.
+	Deep DeepDeriver
+
+	// DeepSampleEvery is the sampling stride for the deep check: 1 compares every
+	// frozen artifact, N compares every Nth. <=0 is treated as 1. Ignored when
+	// Deep is nil.
+	DeepSampleEvery int
+}
+
+// Audit evaluates the structural invariants (INV-2, INV-3, INV-4) against the
+// catalog in its current state — the caller is expected to run it at
+// quiescence — plus the INV-1 deep check when opts.Deep is non-nil. It is a
+// PURE READ: it opens no hot DB for writing, mutates no key, and unlinks
+// nothing. Breaches go into the report; the error is reserved for a
+// backing-store or filesystem failure that keeps the audit from completing.
+func (c *Catalog) Audit(opts AuditOptions) (AuditReport, error) {
+	// completeThrough anchors the at-quiescence clauses (the INV-2 tolerance above
+	// it and the INV-4 floor). It is derived purely from durable keys — no hot DB
+	// read — so the audit stays a read-only key/filesystem walk.
+	through, err := lastCommittedLedger(c, nil)
+	if err != nil {
+		return AuditReport{}, fmt.Errorf("streaming: audit derive completeThrough: %w", err)
+	}
+	earliest, _, err := c.EarliestLedger()
+	if err != nil {
+		return AuditReport{}, fmt.Errorf("streaming: audit read earliest_ledger: %w", err)
+	}
+	report := AuditReport{
+		CompleteThrough: through,
+		Floor:           effectiveRetentionFloor(through, opts.RetentionChunks, earliest),
+	}
+
+	// Checks run in report order: INV-2, INV-3, INV-4, then the optional INV-1.
+	checks := []func() error{
+		func() error { return c.auditSingleCanonicalState(through, &report) },
+		func() error { return c.auditDiskMatchesMeta(through, &report) },
+		func() error { return c.auditRetentionBound(report.Floor, &report) },
+	}
+	if opts.Deep != nil {
+		checks = append(checks, func() error { return c.auditReadCorrectness(opts, &report) })
+	}
+	for _, check := range checks {
+		if cerr := check(); cerr != nil {
+			return AuditReport{}, cerr
+		}
+	}
+	return report, nil
+}
+
+// ---------------------------------------------------------------------------
+// RunAudit — the read-only operator entrypoint. Opens the store for a stopped
+// (or quiescent) daemon, runs the audit, and returns the report.
+//
+// Like RunSurgicalRecovery it takes the storage-root flocks so a concurrently
+// recovering process is locked out; UNLIKE recovery it mutates nothing. Running
+// it against a live daemon (which today does not hold these flocks) is harmless
+// beyond RocksDB's metastore single-writer LOCK, which will reject the open with
+// an opaque error — run it against a stopped daemon for a clean open.
+// ---------------------------------------------------------------------------
+
+// RunAudit opens the meta store under cfg's resolved paths while holding the
+// storage-root flocks, runs Catalog.Audit with opts and returns its report. A nil
+// logger is replaced; a zero opts.RetentionChunks is filled from cfg when set.
+func RunAudit(cfg Config, opts AuditOptions, logger *supportlog.Entry) (AuditReport, error) {
+	if logger == nil {
+		logger = supportlog.New()
+	}
+	cfg = cfg.WithDefaults()
+	resolved := cfg.ResolvePaths()
+	if configured := cfg.Streaming.RetentionChunks; opts.RetentionChunks == 0 && configured != nil {
+		opts.RetentionChunks = *configured
+	}
+
+	rootLocks, err := LockRoots(resolved.LockRoots()...)
+	if err != nil {
+		return AuditReport{}, fmt.Errorf("streaming: audit lock roots: %w", err)
+	}
+	defer rootLocks.Release()
+
+	meta, err := metastore.New(resolved.Catalog, logger)
+	if err != nil {
+		return AuditReport{}, fmt.Errorf("streaming: audit open meta store: %w", err)
+	}
+	defer func() { _ = meta.Close() }() // close error deliberately ignored
+
+	catalog := NewCatalog(meta, NewLayoutFromPaths(resolved))
+	startEntry := logger.WithField("retention_chunks", opts.RetentionChunks).
+		WithField("deep", opts.Deep != nil)
+	startEntry.Info("audit: starting invariant walk")
+
+	report, err := catalog.Audit(opts)
+	if err != nil {
+		return AuditReport{}, err
+	}
+
+	logger.WithField("complete_through", report.CompleteThrough).
+		WithField("floor", report.Floor).
+		WithField("violations", len(report.Violations)).
+		WithField("deep_checked", report.DeepChecked).
+		Info("audit: complete")
+	return report, nil
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/audit_invariants.go b/cmd/stellar-rpc/internal/fullhistory/streaming/audit_invariants.go
new file mode 100644
index 000000000..7e68d3185
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/audit_invariants.go
@@ -0,0 +1,441 @@
+package streaming
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io/fs"
+	"os"
+	"path/filepath"
+)
+
+// INV-2 — single canonical state. Walk meta-store keys, cross-check forbidden
+// co-existence. Excludes exactly the two transients the design tolerates.
+// ---------------------------------------------------------------------------
+
+func (c *Catalog) auditSingleCanonicalState(through uint32, report *AuditReport) error {
+	refs, err := c.ChunkArtifactKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-2 scan chunk keys: %w", err)
+	}
+	hot, err := c.HotChunkKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-2 scan hot keys: %w", err)
+	}
+
+	// Clause 1: at quiescence no artifact key may be "freezing" or "pruning". The
+	// ONE tolerated exception is a "freezing" per-chunk key strictly ABOVE
+	// completeThrough (the hot-volume-loss tail, outside every plan range and the
+	// retention window, that no source can yet repair). "pruning" is flagged at any
+	// position. Only the upper bound is tested here; the floor is left to INV-4.
+	for _, ref := range refs {
+		switch ref.State {
+		case StateFreezing:
+			if ref.Chunk.LastLedger() <= through {
+				report.Violations = append(report.Violations, Violation{
+					Invariant: InvSingleCanonicalState,
+					Key:       ref.Key(),
+					Detail: fmt.Sprintf(
+						"artifact key is %q at quiescence within [floor, completeThrough] "+
+							"(chunk %s last ledger %d <= completeThrough %d): re-materialization was skipped",
+						StateFreezing, ref.Chunk, ref.Chunk.LastLedger(), through,
+					),
+				})
+			}
+			// else: the chunk's last ledger is strictly above completeThrough — the
+			// tolerated hot-volume-loss "freezing" tail. No violation.
+		case StatePruning:
+			report.Violations = append(report.Violations, Violation{
+				Invariant: InvSingleCanonicalState,
+				Key:       ref.Key(),
+				Detail: fmt.Sprintf(
+					"artifact key is %q at quiescence: the sweep should have finished this demotion",
+					StatePruning,
+				),
+			})
+		case StateFrozen:
+			// Expected quiescent state: nothing to report. (No default: other values pass.)
+		}
+	}
+
+	// Clause 2: no hot key may remain for a chunk whose cold artifacts fully serve
+	// it, i.e. pendingArtifacts reports nothing pending. A "transient" hot key is
+	// the tolerated in-flight bracket and is skipped; every other hot state
+	// ("ready" or any non-transient value) is subject to the orphan-hot check.
+	for _, hc := range hot {
+		hs, herr := c.HotState(hc)
+		if herr != nil {
+			return fmt.Errorf("streaming: audit INV-2 hot state %s: %w", hc, herr)
+		}
+		if hs == HotTransient {
+			// Tolerated in-flight directory-op bracket — not an orphan.
+			continue
+		}
+		pending, perr := pendingArtifacts(hc, c)
+		if perr != nil {
+			return fmt.Errorf("streaming: audit INV-2 pending artifacts %s: %w", hc, perr)
+		}
+		if pending.Empty() {
+			report.Violations = append(report.Violations, Violation{
+				Invariant: InvSingleCanonicalState,
+				Key:       hotChunkKey(hc),
+				Detail: fmt.Sprintf(
+					"hot DB key persists for chunk %s whose cold artifacts fully serve it "+
+						"(all artifacts frozen): the discard scan missed it",
+					hc,
+				),
+			})
+		}
+	}
+
+	return nil
+}
+
+// ---------------------------------------------------------------------------
+// INV-3 — disk matches meta-store, BOTH directions. Walk the filesystem against
+// meta (orphan files, duplicate artifacts) and meta against the filesystem
+// (dangling keys).
+// ---------------------------------------------------------------------------
+
+//nolint:gocognit,cyclop // walks meta→disk and disk→meta in one pass
+func (c *Catalog) auditDiskMatchesMeta(through uint32, report *AuditReport) error {
+	refs, err := c.ChunkArtifactKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-3 scan chunk keys: %w", err)
+	}
+	hot, err := c.HotChunkKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-3 scan hot keys: %w", err)
+	}
+
+	// Build the set of paths the meta store EXPECTS to exist on disk: the union of
+	// every key's bijected path(s), kept as a set so the disk->meta direction is a
+	// plain membership test. The meta->disk direction needs no second structure:
+	// the loop below stats each file as it registers the key, and skips the stat
+	// for a "pruning" key, whose unlink legitimately precedes deletion of the
+	// (not-yet-deleted) key.
+	expected := map[string]struct{}{}
+	addExpected := func(paths ...string) {
+		for _, p := range paths {
+			expected[p] = struct{}{}
+		}
+	}
+
+	// meta -> disk (dangling keys): a key in a state that mandates its file but
+	// whose file is gone. "frozen" mandates the file. "freezing" mandates it too
+	// (the mark-before-write rule keeps even a partial file reachable). "pruning"
+	// does NOT — the sweep unlinks before deleting the key, so a "pruning" key
+	// with no file is the legitimate mid-sweep window, not a dangling key. We
+	// still register its path as expected (so a file under it is not an orphan).
+	for _, ref := range refs {
+		paths := c.layout.ArtifactPaths(ref.Chunk, ref.Kind)
+		addExpected(paths...)
+		if ref.State == StatePruning {
+			continue
+		}
+		for _, p := range paths {
+			ok, ferr := fileExists(p)
+			if ferr != nil {
+				return fmt.Errorf("streaming: audit INV-3 stat %s: %w", p, ferr)
+			}
+			if !ok {
+				report.Violations = append(report.Violations, Violation{
+					Invariant: InvDiskMatchesMeta,
+					Key:       ref.Key(),
+					Path:      p,
+					Detail: fmt.Sprintf(
+						"meta key is %q but its file is missing: dangling key", ref.State,
+					),
+				})
+			}
+		}
+	}
+	// Hot DB dirs: a "ready" (or any non-transient) hot key mandates its dir; a
+	// "transient" key is the tolerated in-flight bracket where the dir may be
+	// absent. Every hot dir is registered as expected before the state is read.
+	expectedHotDir := map[string]struct{}{}
+	for _, hc := range hot {
+		dir := c.layout.HotChunkPath(hc)
+		expectedHotDir[dir] = struct{}{}
+		hs, herr := c.HotState(hc)
+		if herr != nil {
+			return fmt.Errorf("streaming: audit INV-3 hot state %s: %w", hc, herr)
+		}
+		if hs == HotTransient {
+			continue
+		}
+		ok, ferr := dirExists(dir)
+		if ferr != nil {
+			return fmt.Errorf("streaming: audit INV-3 stat hot dir %s: %w", dir, ferr)
+		}
+		if !ok {
+			report.Violations = append(report.Violations, Violation{
+				Invariant: InvDiskMatchesMeta,
+				Key:       hotChunkKey(hc),
+				Path:      dir,
+				Detail: fmt.Sprintf(
+					"hot key is %q but its hot DB directory is missing: dangling key (hot-volume loss?)", hs,
+				),
+			})
+		}
+	}
+
+	// disk -> meta (orphan files, duplicate artifacts): walk every artifact tree
+	// and flag any regular file whose path is not in the expected set. A duplicate
+	// artifact (a stray .pack) is just a path the meta store does not name, so the
+	// same membership test catches it. NOTE(review): membership is by exact string,
+	// so Layout and WalkDir must spell paths identically — confirm both are cleaned.
+	for _, root := range c.artifactFileRoots() {
+		if err := walkRegularFiles(root, func(path string) {
+			if _, ok := expected[path]; ok {
+				return
+			}
+			// The per-root single-process flock file (LockRoots) is a legitimate
+			// non-artifact file the daemon plants at the top of every storage root it
+			// locks; it names no meta key, so it is exempted rather than flagged as an
+			// orphan. NOTE: the match is by base name only, so a file named lockFileName
+			// at any depth is exempted too. Nothing else non-artifact is expected here.
+			if filepath.Base(path) == lockFileName {
+				return
+			}
+			report.Violations = append(report.Violations, Violation{
+				Invariant: InvDiskMatchesMeta,
+				Path:      path,
+				Detail:    "file on disk has no meta-store key naming it: orphan or duplicate artifact",
+			})
+		}); err != nil {
+			return fmt.Errorf("streaming: audit INV-3 walk %s: %w", root, err)
+		}
+	}
+
+	// disk -> meta for hot dirs: a hot DB directory on disk with no hot:chunk key
+	// is an orphan tier. Only the immediate children of the hot root are checked
+	// against the expected hot-dir set (each child is one chunk's hot DB dir).
+	hotRoot := c.layout.HotRoot()
+	if err := walkImmediateSubdirs(hotRoot, func(dir string) {
+		if _, ok := expectedHotDir[dir]; ok {
+			return
+		}
+		report.Violations = append(report.Violations, Violation{
+			Invariant: InvDiskMatchesMeta,
+			Path:      dir,
+			Detail:    "hot DB directory on disk has no hot:chunk key: orphan hot tier",
+		})
+	}); err != nil {
+		return fmt.Errorf("streaming: audit INV-3 walk hot root %s: %w", hotRoot, err)
+	}
+
+	_ = through // reserved: INV-3 correspondence holds at quiescence regardless of through.
+	return nil
+}
+
+// ---------------------------------------------------------------------------
+// INV-4 — retention bound. Walk meta-store keys, compare ledger ranges to the
+// floor. Nothing strictly below effectiveRetentionFloor may persist.
+// ---------------------------------------------------------------------------
+
+func (c *Catalog) auditRetentionBound(floor uint32, report *AuditReport) error {
+	// "Below the floor" means the chunk's LAST ledger is below it (the ChunkBelowFloor
+	// predicate the prune/discard scans use). A chunk straddling the floor is fine:
+	// readers mask its below-floor tail and prune only sweeps keys WHOLLY below.
+	artifacts, err := c.ChunkArtifactKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-4 scan chunk keys: %w", err)
+	}
+	for _, art := range artifacts {
+		if art.Chunk.LastLedger() >= floor {
+			continue
+		}
+		report.Violations = append(report.Violations, Violation{
+			Invariant: InvRetentionBound,
+			Key:       art.Key(),
+			Detail: fmt.Sprintf(
+				"chunk %s (last ledger %d) is wholly below the retention floor %d: pruning failed past the floor",
+				art.Chunk, art.Chunk.LastLedger(), floor,
+			),
+		})
+	}
+
+	hotChunks, err := c.HotChunkKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-4 scan hot keys: %w", err)
+	}
+	for _, hotChunk := range hotChunks {
+		if hotChunk.LastLedger() >= floor {
+			continue
+		}
+		report.Violations = append(report.Violations, Violation{
+			Invariant: InvRetentionBound,
+			Key:       hotChunkKey(hotChunk),
+			Detail: fmt.Sprintf(
+				"hot DB for chunk %s (last ledger %d) is wholly below the retention floor %d: discard failed past the floor",
+				hotChunk, hotChunk.LastLedger(), floor,
+			),
+		})
+	}
+	return nil
+}
+
+// ---------------------------------------------------------------------------
+// INV-1 — read correctness, OPTIONAL deep mode. Re-derive sampled frozen
+// artifacts via the injected conformant LedgerBackend and byte-compare.
+// ---------------------------------------------------------------------------
+
+// auditReadCorrectness is the INV-1 deep check: it re-derives every stride-th
+// frozen artifact through opts.Deep and byte-compares it with the on-disk file.
+func (c *Catalog) auditReadCorrectness(opts AuditOptions, report *AuditReport) error {
+	stride := opts.DeepSampleEvery
+	if stride <= 0 {
+		stride = 1
+	}
+	refs, err := c.ChunkArtifactKeys()
+	if err != nil {
+		return fmt.Errorf("streaming: audit INV-1 scan chunk keys: %w", err)
+	}
+	// Only FROZEN artifacts are sampled (a read resolves nothing else). The stride
+	// is deterministic as long as ChunkArtifactKeys returns its keys sorted.
+	sampled := 0
+	for _, ref := range refs {
+		if ref.State != StateFrozen {
+			continue
+		}
+		if sampled%stride != 0 {
+			sampled++
+			continue
+		}
+		sampled++
+
+		want, ok, derr := opts.Deep.DeriveArtifact(ref.Chunk, ref.Kind)
+		if derr != nil {
+			return fmt.Errorf("streaming: audit INV-1 re-derive %s: %w", ref.Key(), derr)
+		}
+		if !ok {
+			// Deriver declined to sample this (chunk, kind) — not a violation.
+			continue
+		}
+
+		// A kind may map to several files (events); the deriver returns the bytes of
+		// its PRIMARY file — the first ArtifactPaths entry (.pack / -events.pack / .bin).
+		paths := c.layout.ArtifactPaths(ref.Chunk, ref.Kind)
+		if len(paths) == 0 {
+			continue
+		}
+		got, rerr := os.ReadFile(paths[0])
+		if rerr != nil {
+			if errors.Is(rerr, fs.ErrNotExist) {
+				// A missing file under a frozen key is already an INV-3 dangling-key
+				// violation; do not double-report it as INV-1.
+				continue
+			}
+			return fmt.Errorf("streaming: audit INV-1 read %s: %w", paths[0], rerr)
+		}
+		// Count only artifacts that are actually byte-compared (the DeepChecked contract).
+		report.DeepChecked++
+		if !bytes.Equal(want, got) {
+			report.Violations = append(report.Violations, Violation{
+				Invariant: InvReadCorrectness,
+				Key:       ref.Key(),
+				Path:      paths[0],
+				Detail: fmt.Sprintf(
+					"on-disk artifact for chunk %s kind %s (%d bytes) does not match the re-derived bytes "+
+						"(%d bytes) from a conformant LedgerBackend",
+					ref.Chunk, ref.Kind, len(got), len(want),
+				),
+			})
+		}
+	}
+	return nil
+}
+
+// ---------------------------------------------------------------------------
+// Filesystem helpers — the audit's ONLY filesystem access (it otherwise walks
+// keys). Kept here so the disk<->meta walk has one source of truth, mirroring
+// how paths.go owns the durability primitives.
+// ---------------------------------------------------------------------------
+
+// artifactFileRoots returns the roots of the per-chunk cold trees — the dirs that
+// hold key-named files — for the disk->meta file walk. Today that is the ledgers
+// tree alone. The hot tree is walked separately (by directory, not by file).
+// Roots come straight off the bound Layout, so they honor any
+// [immutable_storage.*] path override exactly as the data path and the flock
+// (Paths.LockRoots) do.
+func (c *Catalog) artifactFileRoots() []string {
+	ledgersRoot := c.layout.LedgersRoot()
+	return []string{ledgersRoot}
+}
+
+// walkRegularFiles calls fn with the path of each regular file found beneath
+// root, in filepath.WalkDir order. Anything reported as not existing — an entry
+// that vanished mid-walk, or root itself (a tree may never have been created on
+// a young store) — is skipped rather than treated as a failure; every other
+// walk or stat error is returned to the caller.
+func walkRegularFiles(root string, fn func(path string)) error {
+	visit := func(path string, entry fs.DirEntry, walkErr error) error {
+		switch {
+		case walkErr != nil && errors.Is(walkErr, fs.ErrNotExist):
+			return nil
+		case walkErr != nil:
+			return walkErr
+		case entry.IsDir():
+			return nil
+		}
+		// Only regular files are artifacts; symlinks, sockets, etc. are skipped.
+		info, infoErr := entry.Info()
+		if infoErr != nil && !errors.Is(infoErr, fs.ErrNotExist) {
+			return infoErr
+		}
+		if infoErr == nil && info.Mode().IsRegular() {
+			fn(path)
+		}
+		return nil
+	}
+	// A not-exist error escaping the walk itself is tolerated like a missing root.
+	if err := filepath.WalkDir(root, visit); err != nil && !errors.Is(err, fs.ErrNotExist) {
+		return err
+	}
+	return nil
+}
+
+// walkImmediateSubdirs calls fn with the joined path of each directory directly
+// under root, without descending (hot DB dirs sit one level down). A missing root is fine.
+func walkImmediateSubdirs(root string, fn func(dir string)) error {
+	children, err := os.ReadDir(root)
+	if errors.Is(err, fs.ErrNotExist) {
+		return nil
+	}
+	if err != nil {
+		return err
+	}
+	for _, child := range children {
+		if !child.IsDir() {
+			continue
+		}
+		fn(filepath.Join(root, child.Name()))
+	}
+	return nil
+}
+
+// fileExists reports whether path names a regular file, as seen through os.Stat.
+// A path that does not exist yields (false, nil); any other stat error surfaces.
+func fileExists(path string) (bool, error) {
+	info, err := os.Stat(path)
+	if err == nil {
+		return info.Mode().IsRegular(), nil
+	}
+	if errors.Is(err, fs.ErrNotExist) {
+		return false, nil
+	}
+	return false, err
+}
+
+// dirExists reports whether path names a directory; a missing path is (false, nil).
+func dirExists(path string) (bool, error) {
+	info, err := os.Stat(path)
+	if err == nil {
+		return info.IsDir(), nil
+	}
+	if errors.Is(err, fs.ErrNotExist) {
+		return false, nil
+	}
+	return false, err
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/audit_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/audit_test.go
new file mode 100644
index 000000000..720db37b6
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/audit_test.go
@@ -0,0 +1,361 @@
+package streaming
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// freezeChunkArtifacts calls MarkChunkFreezing for c, writes a file at every
+// layout artifact path of each kind, then calls FlipChunkFrozen, so the audit's
+// INV-3 disk<->meta walk sees real files. With no kinds given it uses AllKinds().
+func freezeChunkArtifacts(t *testing.T, cat *Catalog, c chunk.ID, kinds ...Kind) {
+	t.Helper()
+	if len(kinds) == 0 {
+		kinds = AllKinds()
+	}
+	require.NoError(t, cat.MarkChunkFreezing(c, kinds...))
+	for i := range kinds {
+		for _, artifactPath := range cat.layout.ArtifactPaths(c, kinds[i]) {
+			writeArtifact(t, artifactPath)
+		}
+	}
+	require.NoError(t, cat.FlipChunkFrozen(c, kinds...))
+}
+
+// hasViolation reports whether r contains at least one violation recorded
+// against inv that also satisfies the key filter: when wantKey is empty every
+// key qualifies, otherwise the violation's Key must equal wantKey exactly.
+// It stops at the first match.
+func hasViolation(r AuditReport, inv Invariant, wantKey string) bool {
+	matchAny := wantKey == ""
+	for i := range r.Violations {
+		if v := r.Violations[i]; v.Invariant == inv && (matchAny || v.Key == wantKey) {
+			return true
+		}
+	}
+	return false
+}
+
+// countInvariant returns how many violations in r are recorded against inv.
+func countInvariant(r AuditReport, inv Invariant) (total int) {
+	for i := range r.Violations {
+		if r.Violations[i].Invariant == inv {
+			total++
+		}
+	}
+	return total
+}
+
+// ---------------------------------------------------------------------------
+// Clean store — a fully materialized, in-retention chunk set yields zero
+// violations across every invariant.
+// ---------------------------------------------------------------------------
+
+func TestAudit_CleanStoreNoViolations(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	// Two consecutive chunks, each with its ledgers artifact frozen and on disk.
+	for _, id := range []chunk.ID{0, 1} {
+		freezeChunkArtifacts(t, cat, id, KindLedgers)
+	}
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.True(t, rep.Clean(), "expected clean audit, got: %v", rep.Violations)
+}
+
+// ---------------------------------------------------------------------------
+// INV-2 — single canonical state.
+// ---------------------------------------------------------------------------
+
+func TestAudit_INV2_FreezingArtifactWithinRetentionIsViolation(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// A "freezing" ledgers key for chunk 0, and a fully-frozen chunk 5 so
+	// completeThrough advances ABOVE chunk 0 (chunk 0 is within
+	// [floor, completeThrough]). Re-materialization was skipped -> INV-2.
+	freezeChunkArtifacts(t, cat, 5, KindLedgers)
+	require.NoError(t, cat.MarkChunkFreezing(0, KindLedgers))
+	writeArtifact(t, cat.layout.LedgerPackPath(0))
+
+	report, err := cat.Audit(AuditOptions{})
+	require.NoError(t, err)
+	require.True(t, hasViolation(report, InvSingleCanonicalState, chunkKey(0, KindLedgers)),
+		"expected INV-2 within-retention freezing violation: %v", report.Violations)
+}
+
+func TestAudit_INV2_FreezingArtifactAboveCompleteThroughIsTolerated(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// No frozen chunks at all => completeThrough is pre-genesis. A "freezing" key
+	// for chunk 3 lies ABOVE completeThrough — the tolerated hot-volume-loss tail.
+	require.NoError(t, cat.MarkChunkFreezing(3, KindLedgers))
+	writeArtifact(t, cat.layout.LedgerPackPath(3))
+
+	report, err := cat.Audit(AuditOptions{})
+	require.NoError(t, err)
+	// Only the INV-2 verdict for chunk 3's ledgers key is under test here.
+	require.False(t, hasViolation(report, InvSingleCanonicalState, chunkKey(3, KindLedgers)),
+		"above-completeThrough freezing key must be tolerated: %v", report.Violations)
+}
+
+func TestAudit_INV2_PruningArtifactIsAlwaysViolation(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	pruningKey := chunkKey(7, KindLedgers)
+
+	// Plant a "pruning" value that outlived quiescence (the sweep should have
+	// finished it). Unlike "freezing", "pruning" gets no completeThrough carve-out.
+	require.NoError(t, cat.MarkChunkFreezing(7, KindLedgers))
+	require.NoError(t, cat.store.Put(pruningKey, string(StatePruning)))
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.True(t, hasViolation(rep, InvSingleCanonicalState, pruningKey),
+		"expected INV-2 pruning violation: %v", rep.Violations)
+}
+
+func TestAudit_INV2_OrphanHotForFullyServedChunk(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Cold artifacts (frozen ledgers) fully serve chunk 0, yet it still carries a
+	// "ready" hot DB: the discard scan failed to remove it.
+	for _, id := range []chunk.ID{0, 1} {
+		freezeChunkArtifacts(t, cat, id, KindLedgers)
+	}
+	readyHot(t, cat, 0)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.True(t, hasViolation(rep, InvSingleCanonicalState, hotChunkKey(0)),
+		"expected INV-2 orphan-hot violation: %v", rep.Violations)
+}
+
+func TestAudit_INV2_TransientHotIsTolerated(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	for _, id := range []chunk.ID{0, 1} {
+		freezeChunkArtifacts(t, cat, id, KindLedgers)
+	}
+	// Chunk 0 is fully served, but a "transient" hot key is the tolerated in-flight
+	// bracket: NOT an orphan (INV-2) and, with no dir, NOT a dangling key (INV-3).
+	require.NoError(t, cat.PutHotTransient(0))
+	hotKey := hotChunkKey(0)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.False(t, hasViolation(rep, InvSingleCanonicalState, hotKey),
+		"transient hot key must be tolerated by INV-2: %v", rep.Violations)
+	require.False(t, hasViolation(rep, InvDiskMatchesMeta, hotKey),
+		"transient hot key with no dir must be tolerated by INV-3: %v", rep.Violations)
+}
+
+// ---------------------------------------------------------------------------
+// INV-3 — disk matches meta-store, both directions.
+// ---------------------------------------------------------------------------
+
+func TestAudit_INV3_OrphanFileNoKey(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Drop a file at chunk 9's ledgers path while the meta store holds NO key
+	// for it: the audit must report that path as an orphan.
+	orphan := cat.layout.LedgerPackPath(9)
+	writeArtifact(t, orphan)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	flagged := 0
+	for i := range rep.Violations {
+		if v := rep.Violations[i]; v.Invariant == InvDiskMatchesMeta && v.Path == orphan {
+			flagged++
+		}
+	}
+	require.True(t, flagged > 0, "expected INV-3 orphan-file violation for %s: %v", orphan, rep.Violations)
+}
+
+func TestAudit_INV3_DuplicateArtifactIsOrphan(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Chunk 0's ledgers are frozen, so its bucket dir holds one legit .pack.
+	// Add a SECOND file there that no meta key names: it must surface as an orphan.
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	bucketDir := filepath.Dir(cat.layout.LedgerPackPath(0))
+	dupe := filepath.Join(bucketDir, "00000000.dupe")
+	writeArtifact(t, dupe)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	flagged := 0
+	for i := range rep.Violations {
+		if v := rep.Violations[i]; v.Invariant == InvDiskMatchesMeta && v.Path == dupe {
+			flagged++
+		}
+	}
+	require.True(t, flagged > 0, "expected INV-3 duplicate-artifact orphan for %s: %v", dupe, rep.Violations)
+}
+
+func TestAudit_INV3_DanglingKeyNoFile(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	danglingKey := chunkKey(2, KindLedgers)
+
+	// Flip chunk 2's ledgers key to frozen in the meta store, but never write the file.
+	require.NoError(t, cat.MarkChunkFreezing(2, KindLedgers))
+	require.NoError(t, cat.FlipChunkFrozen(2, KindLedgers))
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.True(t, hasViolation(rep, InvDiskMatchesMeta, danglingKey),
+		"expected INV-3 dangling-key violation: %v", rep.Violations)
+}
+
+func TestAudit_INV3_PruningKeyNoFileIsTolerated(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	sweptKey := chunkKey(2, KindLedgers)
+
+	// Mid-sweep window: the sweep unlinks the file BEFORE deleting the key, so a
+	// "pruning" key with no file is legitimate and must not count as dangling.
+	require.NoError(t, cat.MarkChunkFreezing(2, KindLedgers))
+	require.NoError(t, cat.store.Put(sweptKey, string(StatePruning)))
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.False(t, hasViolation(rep, InvDiskMatchesMeta, sweptKey),
+		"pruning key with no file must NOT be an INV-3 dangling key: %v", rep.Violations)
+}
+
+func TestAudit_INV3_OrphanHotDir(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	// Create chunk 4's hot DB directory on disk without any hot:chunk key.
+	hotDir := cat.layout.HotChunkPath(4)
+	require.NoError(t, os.MkdirAll(hotDir, 0o755))
+
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	flagged := 0
+	for i := range rep.Violations {
+		if v := rep.Violations[i]; v.Invariant == InvDiskMatchesMeta && v.Path == hotDir {
+			flagged++
+		}
+	}
+	require.True(t, flagged > 0, "expected INV-3 orphan-hot-dir violation: %v", rep.Violations)
+}
+
+// ---------------------------------------------------------------------------
+// INV-4 — retention bound.
+// ---------------------------------------------------------------------------
+
+func TestAudit_INV4_ChunkBelowFloor(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// With earliest_ledger pinned at chunk 5's first ledger, the floor is that
+	// ledger and chunks 0..4 lie wholly below it.
+	floor := chunk.ID(5).FirstLedger()
+	require.NoError(t, cat.PutEarliestLedger(floor))
+
+	// Chunk 1 is frozen with its files on disk (so INV-3 stays clean) yet sits
+	// below the floor, which is exactly what INV-4 must flag.
+	freezeChunkArtifacts(t, cat, 1, KindLedgers)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.True(t, hasViolation(rep, InvRetentionBound, chunkKey(1, KindLedgers)),
+		"expected INV-4 below-floor violation: %v", rep.Violations)
+}
+
+func TestAudit_INV4_StraddlingFloorNotFlagged(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// earliest_ledger lands one past chunk 0's first ledger, i.e. mid-chunk.
+	// Chunk 0's last ledger is ABOVE that floor, so the chunk straddles it and
+	// must NOT be reported.
+	midChunk := chunk.ID(0).FirstLedger() + 1
+	require.NoError(t, cat.PutEarliestLedger(midChunk))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.Equal(t, 0, countInvariant(rep, InvRetentionBound),
+		"a chunk straddling the floor must not be an INV-4 violation: %v", rep.Violations)
+}
+
+// ---------------------------------------------------------------------------
+// INV-1 — deep mode.
+// ---------------------------------------------------------------------------
+
+// fakeDeriver is a scripted DeriveArtifact stub for the deep-mode (INV-1) tests.
+type fakeDeriver struct {
+	bytesFor map[string][]byte // keyed by chunkKey(c, kind)
+	declined map[string]bool
+	err      error
+}
+
+// DeriveArtifact fails with f.err when set, reports (nil, false, nil) for keys
+// marked declined, and otherwise returns the bytesFor entry and its presence.
+func (f *fakeDeriver) DeriveArtifact(c chunk.ID, kind Kind) ([]byte, bool, error) {
+	key := chunkKey(c, kind)
+	if f.err != nil || f.declined[key] {
+		return nil, false, f.err // f.err is nil on the declined path
+	}
+	derived, known := f.bytesFor[key]
+	return derived, known, nil
+}
+
+func TestAudit_INV1_DeepByteMatchClean(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	// The deriver hands back exactly what writeArtifact put on disk ("artifact"),
+	// so the deep byte-compare must match.
+	derived := map[string][]byte{chunkKey(0, KindLedgers): []byte("artifact")}
+	rep, auditErr := cat.Audit(AuditOptions{Deep: &fakeDeriver{bytesFor: derived}})
+	require.NoError(t, auditErr)
+	require.Equal(t, 0, countInvariant(rep, InvReadCorrectness), "%v", rep.Violations)
+	require.Equal(t, 1, rep.DeepChecked)
+}
+
+func TestAudit_INV1_DeepByteMismatch(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	key := chunkKey(0, KindLedgers)
+	dv := &fakeDeriver{bytesFor: map[string][]byte{key: []byte("DIFFERENT")}}
+	rep, auditErr := cat.Audit(AuditOptions{Deep: dv})
+	require.NoError(t, auditErr)
+	require.True(t, hasViolation(rep, InvReadCorrectness, key),
+		"expected INV-1 byte-mismatch violation: %v", rep.Violations)
+}
+
+func TestAudit_INV1_DeclinedSampleNotChecked(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	// The deriver declines the only frozen artifact, so nothing is compared.
+	declined := map[string]bool{chunkKey(0, KindLedgers): true}
+	rep, auditErr := cat.Audit(AuditOptions{Deep: &fakeDeriver{declined: declined}})
+	require.NoError(t, auditErr)
+	require.Equal(t, 0, rep.DeepChecked)
+	require.Equal(t, 0, countInvariant(rep, InvReadCorrectness))
+}
+
+func TestAudit_INV1_DeriverErrorSurfaces(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	// A deriver failure must come back from Audit with its cause in the message.
+	backendDown := errors.New("backend down")
+	_, auditErr := cat.Audit(AuditOptions{Deep: &fakeDeriver{err: backendDown}})
+	require.Error(t, auditErr)
+	require.Contains(t, auditErr.Error(), "backend down")
+}
+
+func TestAudit_INV1_NoDeriverSkipsDeep(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+	freezeChunkArtifacts(t, cat, 0, KindLedgers)
+	// AuditOptions.Deep is left unset, so no artifact may be deep-checked.
+	rep, auditErr := cat.Audit(AuditOptions{})
+	require.NoError(t, auditErr)
+	require.Equal(t, 0, rep.DeepChecked)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate.go b/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate.go
new file mode 100644
index 000000000..3f7dd9ae8
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate.go
@@ -0,0 +1,195 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strconv"
+	"time"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// validateConfig is the design's config gate (the "Configuration" /
+// validateConfig pseudocode), run BEFORE startStreaming. It does three things,
+// in order:
+//
+//  1. Stateless form validation — workers >= 1, max_retries >= 0, and
+//     earliest_ledger a well-formed "genesis" | "now" | chunk-aligned numeric.
+//     Validating the full static form here keeps every later parse well-formed.
+//
+//  2. Restart vs first start — the layout pin (config:earliest_ledger) is
+//     committed on first start. Present ⟹ a prior first start completed and the
+//     layout is immutable: confirm earliest_ledger is unchanged — with the
+//     "now"-on-restart no-op rule (a frontfill deployment keeps "now" in its
+//     config across restarts and must not abort).
+//
+//  3. First start — resolve earliest_ledger (genesis needs no tip; "now" and a
+//     numeric floor each require a reachable, ready backend through the SAME
+//     injected NetworkTipBackend startStreaming uses), then commit the pin in
+//     one atomic synced batch via the Catalog.
+//
+// It returns the RESOLVED earliest ledger (chunk-aligned, >= genesis) the caller
+// threads into StartConfig — the same value startStreaming reads back from the
+// pin. Errors are plain returns (no os.Exit): the daemon's top-level loop owns
+// the fatal-and-surface decision, and tests assert the errors directly.
+func validateConfig(
+	ctx context.Context,
+	cfg Config,
+	cat *Catalog,
+	tip NetworkTipBackend,
+	tipBackoff time.Duration,
+	tipMaxAttempts int,
+) (uint32, error) {
+	if cat == nil {
+		return 0, errors.New("streaming: validateConfig requires a non-nil Catalog")
+	}
+
+	workers := derefInt(cfg.Backfill.Workers)
+	maxRetries := derefInt(cfg.Backfill.MaxRetries)
+
+	// --- 1. Stateless form validation. ---
+	if workers < 1 {
+		return 0, fmt.Errorf("streaming: workers must be >= 1 (got %d) — a zero pool deadlocks executePlan", workers)
+	}
+	if maxRetries < 0 {
+		return 0, fmt.Errorf("streaming: max_retries must be >= 0 (got %d) — 0 means run once, no retry", maxRetries)
+	}
+	// earliest_ledger must be "genesis", "now", or a chunk-aligned ledger >=
+	// genesis. Form-validating the numeric case here keeps it out of
+	// chunk.IDFromLedger's sub-genesis panic domain below.
+	if err := validateEarliestForm(cfg.Streaming.EarliestLedger); err != nil {
+		return 0, err
+	}
+
+	// --- 2/3. Pin inspection. ---
+	earliestStored, earliestPinned, err := cat.EarliestLedger()
+	if err != nil {
+		return 0, fmt.Errorf("streaming: read earliest_ledger pin: %w", err)
+	}
+
+	if earliestPinned { //nolint:nestif // first-start vs restart immutability branch
+		// --- 2. Restart: the layout is committed — confirm nothing changed. ---
+		// earliest_ledger immutability. The backend tip is NOT re-sampled — it
+		// may lag below the pinned floor and the catch-up loop's
+		// max(tip, lastCommitted) handles that. A genesis/numeric value must
+		// equal the stored pin or startup aborts; "now" is a deliberate no-op
+		// meaning "keep the pinned floor", so a frontfill deployment leaves "now"
+		// in its config across restarts without aborting.
+		if cfg.Streaming.EarliestLedger != EarliestNow {
+			want := uint32(chunk.FirstLedgerSeq)
+			if cfg.Streaming.EarliestLedger != EarliestGenesis {
+				// Already form-validated as a parseable chunk-aligned uint32.
+				want = mustParseUint32(cfg.Streaming.EarliestLedger)
+			}
+			if want != earliestStored {
+				return 0, fmt.Errorf("streaming: earliest_ledger changed: stored=%d, config=%q. "+
+					"Wipe the data directory to change earliest_ledger (or use the future "+
+					"set-earliest-ledger admin command)", earliestStored, cfg.Streaming.EarliestLedger)
+			}
+		}
+		return earliestStored, nil
+	}
+
+	// --- 3. First start (or an incomplete prior start — no artifacts yet). ---
+	// Resolve earliest_ledger, then commit the layout pin in one atomic batch.
+	earliest, err := resolveEarliestFirstStart(ctx, cfg.Streaming.EarliestLedger, tip, tipBackoff, tipMaxAttempts)
+	if err != nil {
+		return 0, err
+	}
+	if err := cat.PinLayout(earliest); err != nil {
+		return 0, fmt.Errorf("streaming: pin layout (earliest=%d): %w", earliest, err)
+	}
+	return earliest, nil
+}
+
+// validateEarliestForm accepts "genesis", "now", or a decimal chunk-aligned
+// ledger >= genesis. It never resolves "now" or compares a floor to the tip.
+func validateEarliestForm(earliest string) error {
+	switch earliest {
+	case EarliestGenesis, EarliestNow:
+		return nil
+	}
+	parsed, parseErr := strconv.ParseUint(earliest, 10, 32)
+	if parseErr != nil {
+		return fmt.Errorf("streaming: earliest_ledger must be %q, %q, or a chunk-aligned "+
+			"ledger >= %d; got %q", EarliestGenesis, EarliestNow, chunk.FirstLedgerSeq, earliest)
+	}
+	ledger := uint32(parsed) // checked >= genesis before IDFromLedger sees it
+	if ledger >= chunk.FirstLedgerSeq && ledger == chunk.IDFromLedger(ledger).FirstLedger() {
+		return nil
+	}
+	return fmt.Errorf("streaming: earliest_ledger must be %q, %q, or a chunk-aligned "+
+		"ledger >= %d; got %q (not chunk-aligned or sub-genesis)",
+		EarliestGenesis, EarliestNow, chunk.FirstLedgerSeq, earliest)
+}
+
+// resolveEarliestFirstStart maps the form-validated earliest_ledger string to
+// the chunk-aligned ledger a first start pins:
+//   - "genesis": the first ledger, with no tip lookup;
+//   - "now": the first ledger of the chunk containing the sampled tip;
+//   - numeric: the parsed value, rejected when it is past the sampled tip.
+//
+// "now" and numeric both fail when networkTip returns an error.
+func resolveEarliestFirstStart(
+	ctx context.Context, earliest string, tip NetworkTipBackend, backoff time.Duration, maxAttempts int,
+) (uint32, error) {
+	if earliest == EarliestGenesis {
+		// Genesis is always a valid lower bound, so no tip is needed.
+		return chunk.FirstLedgerSeq, nil
+	}
+
+	if earliest == EarliestNow {
+		// "now" has no local substitute: only a sampled tip can resolve it.
+		sampled, tipErr := networkTip(ctx, tip, backoff, maxAttempts)
+		if tipErr != nil {
+			return 0, fmt.Errorf("streaming: earliest_ledger=%q needs a reachable, ready backend: %w",
+				EarliestNow, tipErr)
+		}
+		// Rounding down to the chunk's first ledger can never exceed the tip.
+		return chunk.IDFromLedger(sampled).FirstLedger(), nil
+	}
+
+	// Numeric (already form-validated: parseable, >= genesis, chunk-aligned).
+	// The pin is immutable, so the floor MUST be checked against a real tip
+	// before it is committed. Skipping the check while the backend is down
+	// could make a floor AHEAD of the network permanent: the catch-up loop's
+	// max(tip, earliest-1) anchor would collapse the range to empty and resume
+	// from a future ledger with the bad floor pinned.
+	floor := mustParseUint32(earliest)
+	sampled, tipErr := networkTip(ctx, tip, backoff, maxAttempts)
+	switch {
+	case tipErr != nil:
+		return 0, fmt.Errorf("streaming: first start with a numeric earliest_ledger needs a "+
+			"reachable, ready backend to validate the floor against the network tip: %w", tipErr)
+	case floor > sampled:
+		return 0, fmt.Errorf("streaming: earliest_ledger (%d) is past the current network tip (%d); reject",
+			floor, sampled)
+	}
+	return floor, nil
+}
+
+// mustParseUint32 converts an already form-validated decimal string to uint32.
+// Reaching the panic means a caller skipped that validation — a programming
+// error — so it panics instead of returning an error nobody can handle.
+func mustParseUint32(s string) uint32 {
+	parsed, parseErr := strconv.ParseUint(s, 10, 32)
+	if parseErr == nil {
+		return uint32(parsed)
+	}
+	panic(fmt.Sprintf("streaming: mustParseUint32(%q): %v (caller must form-validate first)", s, parseErr))
+}
+
+func derefU32(p *uint32) uint32 {
+	if p != nil {
+		return *p
+	}
+	return 0 // a nil pointer reads as zero
+}
+
+func derefInt(p *int) int {
+	if p != nil {
+		return *p
+	}
+	return 0 // a nil pointer reads as zero
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate_test.go
new file mode 100644
index 000000000..e99092f25
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/config_validate_test.go
@@ -0,0 +1,272 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"strconv"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// validCfg builds a documented-valid Config with the validateConfig-relevant
+// knobs set; callers mutate one field to drive a rejection case.
+func validCfg(workers, maxRetries int, earliest string) Config {
+	return Config{
+		Service:   ServiceConfig{DefaultDataDir: "/data"},
+		Backfill:  BackfillConfig{Workers: &workers, MaxRetries: &maxRetries},
+		Streaming: StreamingConfig{EarliestLedger: earliest, CaptiveCoreConfig: "/cc"},
+	}
+}
+
+// readyTip returns a tip backend whose tips list holds just ledger, so it is meant to always report that tip.
+func readyTip(ledger uint32) *fakeTipBackend {
+	return &fakeTipBackend{tips: []uint32{ledger}}
+}
+
+// downTip returns a tip backend meant to never come up (err set, errFirst: 99 — see fakeTipBackend).
+func downTip() *fakeTipBackend {
+	return &fakeTipBackend{err: errors.New("backend unreachable"), errFirst: 99}
+}
+
+func callValidate(t *testing.T, cfg Config, cat *Catalog, tip NetworkTipBackend) (uint32, error) {
+	t.Helper()
+	return validateConfig(context.Background(), cfg, cat, tip, time.Millisecond, 3) // tipBackoff=1ms, tipMaxAttempts=3
+}
+
+// requireEarliestPin re-reads the earliest_ledger pin from the catalog and fails
+// the test immediately unless the read succeeds, the pin is present, and it
+// equals wantEarliest. Calling it right after a first start or a restart makes a
+// metastore read-visibility anomaly fail LOUDLY at the readback itself. It is
+// also the anchor for the restart-mutates-nothing checks: after a successful
+// restart the pin must still read back as the first-start value.
+func requireEarliestPin(t *testing.T, cat *Catalog, wantEarliest uint32) {
+	t.Helper()
+	got, pinned, readErr := cat.EarliestLedger()
+	require.NoError(t, readErr, "readback of earliest_ledger pin")
+	require.True(t, pinned, "earliest_ledger pin must be present after validateConfig")
+	require.Equal(t, wantEarliest, got, "earliest_ledger pin readback")
+}
+
+// ---------------------------------------------------------------------------
+// Accept the documented-valid forms.
+// ---------------------------------------------------------------------------
+
+func TestValidateConfig_AcceptsGenesisFirstStart(t *testing.T) {
+	genesis := uint32(chunk.FirstLedgerSeq)
+	cat, _ := testCatalog(t)
+	// The backend is down on purpose: a genesis floor never samples the tip.
+	resolved, validateErr := callValidate(t, validCfg(4, 3, "genesis"), cat, downTip())
+	require.NoError(t, validateErr)
+	assert.Equal(t, genesis, resolved)
+	// ...and the pin was committed with that same value.
+	pinned, present, readErr := cat.EarliestLedger()
+	require.NoError(t, readErr)
+	require.True(t, present)
+	assert.Equal(t, genesis, pinned)
+}
+
+func TestValidateConfig_AcceptsNowFirstStart(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// chunk 5 first ledger is 50002; a tip mid-chunk-5 resolves "now" to 50002.
+	tipLedger := chunk.ID(5).FirstLedger() + 1234
+	earliest, err := callValidate(t, validCfg(4, 3, "now"), cat, readyTip(tipLedger))
+	require.NoError(t, err)
+	assert.Equal(t, chunk.ID(5).FirstLedger(), earliest)
+
+	// Checked readback: a missing pin or a read error must fail the test, not be ignored.
+	requireEarliestPin(t, cat, chunk.ID(5).FirstLedger())
+}
+
+func TestValidateConfig_AcceptsNumericFirstStart(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// A chunk-aligned floor (chunk 3) well below the reported tip (chunk 10).
+	floor, tipLedger := chunk.ID(3).FirstLedger(), chunk.ID(10).FirstLedger()
+	resolved, validateErr := callValidate(t, validCfg(4, 3, itoa(floor)), cat, readyTip(tipLedger))
+	require.NoError(t, validateErr)
+	assert.Equal(t, floor, resolved)
+}
+
+func TestValidateConfig_AcceptsZeroRetries(t *testing.T) {
+	cat, _ := testCatalog(t)
+	_, err := callValidate(t, validCfg(1, 0, "genesis"), cat, downTip()) // workers=1, max_retries=0: both at their minimum
+	require.NoError(t, err)
+}
+
+// ---------------------------------------------------------------------------
+// Reject the malformed forms (stateless).
+// ---------------------------------------------------------------------------
+
+func TestValidateConfig_RejectsMalformed(t *testing.T) {
+	tests := []struct {
+		name string
+		cfg  Config
+		want string
+	}{
+		{"zero workers", validCfg(0, 3, "genesis"), "workers"},
+		{"negative workers", validCfg(-1, 3, "genesis"), "workers"},
+		{"negative max_retries", validCfg(4, -1, "genesis"), "max_retries"},
+		{"bogus earliest string", validCfg(4, 3, "yesterday"), "earliest_ledger"},
+		{"sub-genesis numeric floor", validCfg(4, 3, "1"), "earliest_ledger"},
+		{"misaligned numeric floor", validCfg(4, 3, "12345"), "earliest_ledger"},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			cat, _ := testCatalog(t)
+			_, err := callValidate(t, tc.cfg, cat, readyTip(chunk.ID(10).FirstLedger()))
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tc.want)
+
+			// A rejected config pins nothing.
+			_, ok, _ := cat.EarliestLedger()
+			assert.False(t, ok, "no earliest pin on a rejected config")
+		})
+	}
+}
+
+// ---------------------------------------------------------------------------
+// First start pins the earliest_ledger key.
+// ---------------------------------------------------------------------------
+
+func TestValidateConfig_FirstStartPinsEarliest(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// Before: not pinned — and the read itself must succeed, so a store error
+	// cannot masquerade as "no pin".
+	_, ok, err := cat.EarliestLedger()
+	require.NoError(t, err)
+	require.False(t, ok)
+
+	_, err = callValidate(t, validCfg(4, 3, "genesis"), cat, downTip())
+	require.NoError(t, err)
+
+	// After: present, equal to genesis, and read back without error.
+	requireEarliestPin(t, cat, uint32(chunk.FirstLedgerSeq))
+}
+
+// ---------------------------------------------------------------------------
+// First start with "now" / numeric requires a reachable, ready tip.
+// ---------------------------------------------------------------------------
+
+func TestValidateConfig_NowFirstStartNeedsTip(t *testing.T) {
+	cat, _ := testCatalog(t)
+	_, validateErr := callValidate(t, validCfg(4, 3, "now"), cat, downTip())
+	require.Error(t, validateErr)
+	assert.Contains(t, validateErr.Error(), "now")
+	_, pinned, _ := cat.EarliestLedger()
+	assert.False(t, pinned, "nothing pinned when the tip is unavailable")
+}
+
+func TestValidateConfig_NumericFirstStartNeedsTip(t *testing.T) {
+	cat, _ := testCatalog(t)
+	_, err := callValidate(t, validCfg(4, 3, itoa(chunk.ID(3).FirstLedger())), cat, downTip())
+	require.ErrorContains(t, err, "network tip")
+	_, ok, _ := cat.EarliestLedger()
+	assert.False(t, ok, "nothing pinned when the tip is unavailable")
+}
+
+func TestValidateConfig_NumericFloorPastTipRejected(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// Requested floor: chunk 100. Reported tip: one ledger into chunk 5.
+	futureFloor, tipLedger := chunk.ID(100).FirstLedger(), chunk.ID(5).FirstLedger()+1
+	_, validateErr := callValidate(t, validCfg(4, 3, itoa(futureFloor)), cat, readyTip(tipLedger))
+	require.Error(t, validateErr)
+	assert.Contains(t, validateErr.Error(), "past the current network tip")
+	_, pinned, _ := cat.EarliestLedger()
+	assert.False(t, pinned, "a future floor is never pinned")
+}
+
+func TestValidateConfig_SubGenesisTipRejectedAsNotReady(t *testing.T) {
+	cat, _ := testCatalog(t)
+	_, err := callValidate(t, validCfg(4, 3, "now"), cat, readyTip(chunk.FirstLedgerSeq-1))
+	require.Error(t, err) // a tip one below genesis must not resolve "now"
+	assert.Contains(t, err.Error(), "now")
+}
+
+// ---------------------------------------------------------------------------
+// Restart immutability.
+// ---------------------------------------------------------------------------
+
+func TestValidateConfig_RestartAcceptsUnchanged(t *testing.T) {
+	cat, _ := testCatalog(t)
+	genesis := uint32(chunk.FirstLedgerSeq)
+	// First start pins genesis; the immediate readback makes a metastore
+	// visibility anomaly fail right here rather than further down.
+	_, firstErr := callValidate(t, validCfg(4, 3, "genesis"), cat, downTip())
+	require.NoError(t, firstErr)
+	requireEarliestPin(t, cat, genesis)
+
+	// Restart with the same earliest_ledger (other knobs differ): accepted, and
+	// the backend may stay down because no tip is sampled.
+	resolved, restartErr := callValidate(t, validCfg(8, 1, "genesis"), cat, downTip())
+	require.NoError(t, restartErr)
+	assert.Equal(t, genesis, resolved)
+	// A successful restart mutates nothing: the pin still reads back as genesis.
+	requireEarliestPin(t, cat, genesis)
+}
+
+func TestValidateConfig_RestartAbortsOnChangedEarliest(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinnedFloor, editedFloor := chunk.ID(3).FirstLedger(), chunk.ID(7).FirstLedger()
+	tipLedger := chunk.ID(50).FirstLedger()
+
+	// First start pins chunk 3's first ledger. Reading it straight back turns a
+	// metastore visibility anomaly into a missed-pin failure here, instead of
+	// the restart branch spuriously returning nil later.
+	_, firstErr := callValidate(t, validCfg(4, 3, itoa(pinnedFloor)), cat, readyTip(tipLedger))
+	require.NoError(t, firstErr)
+	requireEarliestPin(t, cat, pinnedFloor)
+
+	// A restart configured with a different numeric floor must abort...
+	_, restartErr := callValidate(t, validCfg(4, 3, itoa(editedFloor)), cat, readyTip(tipLedger))
+	require.Error(t, restartErr)
+	assert.Contains(t, restartErr.Error(), "earliest_ledger changed")
+	// ...and leave the original pin untouched.
+	requireEarliestPin(t, cat, pinnedFloor)
+}
+
+func TestValidateConfig_RestartGenesisVsNumericAborts(t *testing.T) {
+	cat, _ := testCatalog(t)
+	genesis := uint32(chunk.FirstLedgerSeq)
+	// First start with "genesis" pins the first ledger.
+	_, firstErr := callValidate(t, validCfg(4, 3, "genesis"), cat, downTip())
+	require.NoError(t, firstErr)
+	requireEarliestPin(t, cat, genesis)
+
+	// Editing the config to a numeric floor other than genesis must abort.
+	edited := validCfg(4, 3, itoa(chunk.ID(3).FirstLedger()))
+	_, restartErr := callValidate(t, edited, cat, readyTip(chunk.ID(50).FirstLedger()))
+	require.Error(t, restartErr)
+	assert.Contains(t, restartErr.Error(), "earliest_ledger changed")
+	// The genesis pin survives the aborted restart unchanged.
+	requireEarliestPin(t, cat, genesis)
+}
+
+// On a restart, "now" means "keep whatever floor is pinned": validateConfig
+// neither aborts nor re-resolves it, so a frontfill deployment can leave "now"
+// in its config across restarts.
+func TestValidateConfig_RestartNowIsNoOp(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinned := chunk.ID(5).FirstLedger()
+	// First start: the tip sits 10 ledgers into chunk 5, so "now" pins chunk 5's
+	// first ledger.
+	_, firstErr := callValidate(t, validCfg(4, 3, "now"), cat, readyTip(pinned+10))
+	require.NoError(t, firstErr)
+	requireEarliestPin(t, cat, pinned)
+
+	// Restart with "now" against a backend that is down: no tip is sampled at
+	// all, nothing aborts, and the original floor comes back.
+	resolved, restartErr := callValidate(t, validCfg(4, 3, "now"), cat, downTip())
+	require.NoError(t, restartErr)
+	assert.Equal(t, pinned, resolved, "restart with now keeps the original pin")
+
+	// The stored pin is unchanged as well.
+	requireEarliestPin(t, cat, pinned)
+}
+
+// itoa formats n as a base-10 string; tests use it to build numeric
+// earliest_ledger config values.
+func itoa(n uint32) string { return strconv.FormatUint(uint64(n), 10) }
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/convergence_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/convergence_test.go
new file mode 100644
index 000000000..4be2a68db
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/convergence_test.go
@@ -0,0 +1,499 @@
+package streaming
+
+import (
+	"context"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+)
+
+// =============================================================================
+// Crash-injection + convergence suite — the design's strongest validation
+// (design-docs/full-history-streaming-workflow.md "Convergence", "Scenario
+// coverage", "What a bug looks like").
+//
+// Each case (1) CONSTRUCTS a durable crash / partial-completion state on a real
+// Catalog + real hotchunk DB + temp artifact dirs — by driving the REAL protocol
+// ops (MarkChunkFreezing, SurgicalRecovery, the hot-tier open/ingest) to a chunk
+// boundary and then STOPPING before the next op runs, and/or by directly
+// planting the durable keys+files a crash at that instant would leave. (2) runs
+// the REAL convergence path — a lifecycle tick (runLifecycleTick) and/or a
+// re-derivation (deriveCompleteThrough / deriveWatermark). (3) ASSERTS the
+// system converges to quiescence satisfying INV-2..4 by calling the REAL
+// Catalog.Audit and requiring report.Clean(), PLUS idempotency (re-running the
+// convergence op changes nothing) and that the derived watermark equals the
+// durable state.
+//
+// The point of using the real ops + real audit (rather than hand-rolled
+// assertions) is the design's "None of the invariants reference the phase
+// scans": a bug in freeze / discard / prune / sweep surfaces here as a genuine
+// Audit violation, not something the same code that produced it judges
+// acceptable.
+//
+// CAVEAT — INV-1's deep byte-compare (audit_test.go's DeepDeriver) is NOT wired
+// here — this suite asserts INV-1 only structurally (no orphan/dangling/
+// duplicate, single canonical state); content re-derivation is audit_test.go's
+// job.
+// =============================================================================
+
+// convergenceHarness bundles the catalog, its lifecycle config (real production
+// primitives — a real RocksHotProbe over the catalog's hot layout), a fatal
+// recorder, and a probe so a case can run real ticks and derivations.
+type convergenceHarness struct {
+	cat   *Catalog        // catalog every tick, audit and derivation runs against
+	cfg   LifecycleConfig // lifecycle config handed to ticks; supplies RetentionChunks
+	rec   *fatalRecorder  // checked after each tick: fired() must stay false
+	probe HotProbe        // hot probe passed to deriveWatermark
+}
+
+// newConvergenceHarness returns a harness whose fresh test catalog has
+// earliest_ledger pinned at genesis and whose lifecycle config uses
+// retentionChunks as its retention width.
+//
+//nolint:unparam // retentionChunks varies across slices' convergence tests
+func newConvergenceHarness(t *testing.T, retentionChunks uint32) *convergenceHarness {
+	t.Helper()
+	catalog, _ := testCatalog(t)
+	err := catalog.PutEarliestLedger(chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	lifecycle, recorder := lifecycleTestConfig(t, catalog, retentionChunks)
+	h := &convergenceHarness{cat: catalog, cfg: lifecycle, rec: recorder}
+	// The probe is the one the lifecycle config already carries.
+	h.probe = lifecycle.Process.HotProbe
+	return h
+}
+
+// tick runs one real lifecycle tick (driven as ingestion would: lastChunk is the
+// catalog's highest complete chunk) and requires that no daemon abort was recorded.
+func (h *convergenceHarness) tick(t *testing.T) {
+	t.Helper()
+	runTickForCatalog(context.Background(), t, h.cfg, h.cat)
+	aborted, cause := h.rec.fired(), h.rec.last.Load()
+	require.False(t, aborted, "convergence tick must not abort the daemon: %v", cause)
+}
+
+// auditClean runs the real Catalog.Audit at the harness's RetentionChunks (so INV-4
+// is judged against that exact floor), requires a clean report, and returns it.
+func (h *convergenceHarness) auditClean(t *testing.T) AuditReport {
+	t.Helper()
+	opts := AuditOptions{RetentionChunks: h.cfg.RetentionChunks}
+	got, err := h.cat.Audit(opts)
+	require.NoError(t, err, "audit must complete (error only for I/O)")
+	const msg = "after convergence the store must satisfy INV-2..4; violations:\n%s"
+	require.True(t, got.Clean(), msg, violationsString(got))
+	return got
+}
+
+// requireQuiescent checks convergence reached a fixed point: it derives the
+// current completeThrough and hands it to assertQuiescent (idempotency check).
+func (h *convergenceHarness) requireQuiescent(t *testing.T) {
+	t.Helper()
+	completeThrough, derr := deriveCompleteThrough(h.cat)
+	require.NoError(t, derr)
+	assertQuiescent(t, h.cfg, h.cat, completeThrough)
+}
+
+// requireWatermarkMatchesDurable derives the watermark through the harness probe
+// and requires it to equal want, the expected durable frontier — the design's
+// "the startup derivation equals exactly the durable state".
+func (h *convergenceHarness) requireWatermarkMatchesDurable(t *testing.T, want uint32) {
+	t.Helper()
+	derived, derr := deriveWatermark(h.cat, h.probe)
+	require.NoError(t, derr, "watermark derivation must succeed at quiescence")
+	require.Equal(t, want, derived, "derived watermark must equal the durable frontier")
+}
+
+// violationsString renders r.Violations as one "  - <violation>" line each, or
+// "  (none)" when there are none, for use in assertion failure messages.
+func violationsString(r AuditReport) string {
+	var b strings.Builder
+	for _, v := range r.Violations {
+		b.WriteString("  - " + v.String() + "\n")
+	}
+	if b.Len() == 0 {
+		return "  (none)"
+	}
+	return b.String()
+}
+
+// =============================================================================
+// Per-chunk artifact crash states (freezing / pruning) — the "freezing" tail
+// is re-materialized by the freeze stage from its still-present hot DB
+// (processChunk's hot branch, the design's "freeze from a live hot DB"); the
+// "pruning" demoted artifact is swept by the prune scan.
+// =============================================================================
+
+// TestConvergence_PerChunkFreezingReMaterializesFromHotDB constructs the
+// per-chunk "freezing" crash state WITHIN retention (a crashed freeze that
+// marked the key but did not finish): chunk 0's ledgers are "freezing" with a
+// complete hot DB still behind the chunk. The freeze stage re-derives the cold
+// artifact FROM that hot DB (backfillSource's hot branch), then discards the
+// now-redundant hot DB — converging to a clean, quiescent store satisfying
+// INV-2..4.
+func TestConvergence_PerChunkFreezingReMaterializesFromHotDB(t *testing.T) {
+	// Parallel-safe (isolated TempDir/catalog); the full-chunk ingest is heavy, so
+	// it overlaps the other heavy tests to fit the gate's go-test timeout.
+	t.Parallel()
+	h := newConvergenceHarness(t, 0) // retentionChunks = 0
+
+	// Chunk 0: a COMPLETE hot DB on disk (every ledger ingested, write handle
+	// closed — the just-closed-chunk shape). This is the source the freeze stage
+	// re-materializes from.
+	ingestFullHotChunk(t, h.cat, 0)
+	// The live chunk 1 above the partition (held open by "ingestion").
+	live := openLiveHotDB(t, h.cat, 1)
+	t.Cleanup(func() { _ = live.Close() })
+
+	// Now plant the crash: chunk 0's cold artifact marked "freezing" (a crashed
+	// freeze that pre-marked but did not fsync+flip). Mark via the REAL protocol.
+	require.NoError(t, h.cat.MarkChunkFreezing(0, KindLedgers))
+	require.Equal(t, StateFreezing, mustState(t, h.cat, 0, KindLedgers))
+
+	// Converge: one real tick. The freeze stage's resolver sees the non-frozen
+	// key, re-materializes chunk 0 from its hot DB, and the discard stage retires
+	// the hot DB.
+	h.tick(t)
+	h.auditClean(t)
+	h.requireQuiescent(t)
+
+	// The chunk is now frozen and its hot DB discarded.
+	require.Equal(t, StateFrozen, mustState(t, h.cat, 0, KindLedgers))
+	has, err := h.cat.Has(hotChunkKey(0))
+	require.NoError(t, err)
+	require.False(t, has, "chunk 0's hot DB was discarded after the freeze")
+
+	// Idempotency: a second tick must leave every catalog key unchanged.
+	before := snapshotAllKeys(t, h.cat)
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, h.cat), "second tick is a no-op")
+	h.auditClean(t)
+}
+
+// TestConvergence_PerChunkPruningArtifactSwept constructs the per-chunk
+// "pruning" crash state: a recovery-demoted ledger artifact whose sweep did not
+// run, sitting in-retention. The prune scan sweeps it (file + key), converging
+// to INV-2..4 clean.
+func TestConvergence_PerChunkPruningArtifactSwept(t *testing.T) {
+	h := newConvergenceHarness(t, 0)
+
+	// A "transient" hot key for chunk 1 plays the live chunk above the partition, so chunk 0 is below it.
+	require.NoError(t, h.cat.PutHotTransient(1))
+
+	// The crash leftover: a chunk:0:ledgers key demoted to "pruning" with its pack
+	// file still on disk (a demotion whose sweep did not unlink).
+	writeArtifact(t, h.cat.layout.LedgerPackPath(0))
+	require.NoError(t, h.cat.store.Put(chunkKey(0, KindLedgers), string(StatePruning)))
+
+	// Before convergence the audit FAILS (a "pruning" key surviving quiescence is
+	// an INV-2 violation) — proving the suite catches the bug class.
+	pre, err := h.cat.Audit(AuditOptions{RetentionChunks: h.cfg.RetentionChunks})
+	require.NoError(t, err)
+	require.False(t, pre.Clean(), "the unswept pruning artifact must be a detectable violation pre-convergence")
+
+	// Converge: the prune scan sweeps the "pruning" ref.
+	h.tick(t)
+	h.auditClean(t)
+	h.requireQuiescent(t)
+
+	require.Equal(t, State(""), mustState(t, h.cat, 0, KindLedgers), "the pruning key is swept")
+	require.NoFileExists(t, h.cat.layout.LedgerPackPath(0), "the pruning file is unlinked")
+
+	before := snapshotAllKeys(t, h.cat) // idempotency: a second tick must change no key
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, h.cat))
+	h.auditClean(t)
+}
+
+// =============================================================================
+// Boundary crash — recovered by the watermark refinement. A crash at a chunk
+// boundary can leave the just-completed chunk's hot key "ready" and C+1's hot
+// key "transient". deriveWatermark's ONE read of the highest *ready* chunk
+// recovers the chunk-level frontier the "transient" key no longer advertises.
+// =============================================================================
+
+// TestConvergence_BoundaryCrashWatermarkRefinement plants the boundary-crash
+// durable state the design's progress.go describes: chunk 0's hot DB complete
+// and "ready" (the just-completed chunk), chunk 1's hot key "transient" (the next
+// bracket's key was written — close-before-create-key — but the crash hit before
+// it became "ready", so no key advertises chunk 0's completion). The POSITIONAL
+// term under-counts here (highest *ready* is chunk 0, so positional = -1); the
+// design's recovery is deriveWatermark's ONE MaxCommittedSeq read of the highest
+// ready chunk, which supplies chunk 0's frontier. We assert that refinement, then
+// that ingestion resuming (chunk 1 becomes "ready") lets a tick converge.
+func TestConvergence_BoundaryCrashWatermarkRefinement(t *testing.T) {
+	// Parallel-safe (isolated TempDir/catalog); the full-chunk ingest is heavy, so
+	// it overlaps the other heavy tests to fit the gate's go-test timeout.
+	t.Parallel()
+	h := newConvergenceHarness(t, 0)
+
+	// Chunk 0: a complete, "ready" hot DB (every ledger committed). Chunk 1:
+	// "transient" only (the next bracket opened its key but crashed before "ready").
+	ingestFullHotChunk(t, h.cat, 0) // closes the write handle, leaves key "ready" + full dir
+	require.Equal(t, HotReady, mustHotState(t, h.cat, 0))
+	require.NoError(t, h.cat.PutHotTransient(1))
+	require.Equal(t, HotTransient, mustHotState(t, h.cat, 1))
+
+	// completeThrough alone under-counts (positional term sees no ready chunk above
+	// chunk 0): it lands at the genesis sentinel.
+	through, err := deriveCompleteThrough(h.cat)
+	require.NoError(t, err)
+	require.Equal(t, preGenesisLedger, through, "completeThrough under-counts at a boundary crash")
+
+	// The WATERMARK refinement recovers the real frontier: deriveWatermark's one
+	// MaxCommittedSeq read of the highest ready chunk (chunk 0) yields chunk 0's
+	// last committed seq — the design's boundary-crash recovery.
+	h.requireWatermarkMatchesDurable(t, chunk.ID(0).LastLedger())
+
+	// Pre-resume the store is already INV-2..4 clean (chunk 0's hot DB is the live
+	// tier from the lifecycle's view; nothing is orphaned or dangling).
+	h.auditClean(t)
+
+	// Ingestion resumes: chunk 1's bracket completes ("ready"), moving the partition
+	// above chunk 0. Now a tick freezes chunk 0 from its ready hot DB and discards
+	// the hot DB — converging to INV-2..4 clean and quiescent.
+	live := openLiveHotDB(t, h.cat, 1)
+	t.Cleanup(func() { _ = live.Close() })
+	h.tick(t)
+	h.auditClean(t)
+	h.requireQuiescent(t)
+	require.Equal(t, StateFrozen, mustState(t, h.cat, 0, KindLedgers))
+}
+
+// =============================================================================
+// Surgical recovery (case 3, tainted cold data) — the operator demotes the
+// tainted range to "freezing"/"transient" (one atomic batch), then the next
+// startup converges: backfill re-derives the "freezing" cold artifacts from the
+// surviving hot DB (or the bulk backend in production). We drive the demotion
+// through the REAL SurgicalRecovery and the re-derivation through a REAL tick.
+// =============================================================================
+
+// TestConvergence_SurgicalRecoveryCase3ReDerives ties case 3 end to end on real
+// state: a fully-converged chunk 0 (frozen cold) is tainted by a cold+hot
+// surgical recovery (cold -> "freezing"); the next tick re-derives the cold
+// artifact from a re-ingested hot DB, returning to INV-2..4 clean.
+func TestConvergence_SurgicalRecoveryCase3ReDerives(t *testing.T) {
+	// Parallel-safe (isolated TempDir/catalog); the full-chunk ingest is heavy, so
+	// it overlaps the other heavy tests to fit the gate's go-test timeout.
+	t.Parallel()
+	h := newConvergenceHarness(t, 0)
+
+	// Converged steady state for chunk 0: frozen cold artifact, served PURELY by
+	// cold (no hot DB — the hot tier was already discarded in steady state). A live
+	// chunk 1 sits above the partition.
+	live := openLiveHotDB(t, h.cat, 1)
+	t.Cleanup(func() { _ = live.Close() })
+	freezeChunkArtifacts(t, h.cat, 0, KindLedgers)
+	h.auditClean(t) // sanity: the pre-recovery state is already clean and quiescent
+
+	// Operator runs the case-3 recovery over chunk 0 (cold + hot). The present cold
+	// key (ledgers) drops to "freezing" — one atomic batch. There is no hot key for
+	// chunk 0 to demote (it was discarded in steady state), so the recovery's hot
+	// tier is a no-op for this chunk; the cold demotion is what regresses it.
+	plan, err := h.cat.SurgicalRecovery(RecoveryRequest{Lo: 0, Hi: 0, Tier: RecoverColdAndHot})
+	require.NoError(t, err)
+	require.False(t, plan.Empty())
+	require.Equal(t, StateFreezing, mustState(t, h.cat, 0, KindLedgers))
+
+	// Re-ingestion refills the chunk's hot tail (the design's "captive core
+	// re-ingests the un-frozen tail forward" / "openHotDB wipes and recreates one
+	// when re-ingestion re-opens that chunk") — the local source the freeze stage
+	// re-derives the cold artifact from (production uses the bulk backend).
+	ingestFullHotChunk(t, h.cat, 0)
+	require.Equal(t, HotReady, mustHotState(t, h.cat, 0))
+
+	// Converge: the tick re-materializes chunk 0's cold artifact, then discards the
+	// hot DB. Back to INV-2..4 clean and quiescent.
+	h.tick(t)
+	h.auditClean(t)
+	h.requireQuiescent(t)
+	require.Equal(t, StateFrozen, mustState(t, h.cat, 0, KindLedgers))
+
+	before := snapshotAllKeys(t, h.cat) // idempotency: a second tick must change no key
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, h.cat))
+	h.auditClean(t)
+}
+
+// =============================================================================
+// Hot-volume loss (case 4) — a "ready" hot key whose dir is gone is FATAL
+// (ErrHotVolumeLost), never silently healed; the operator demotes it hot-only
+// to "transient", the fatal stops, the watermark falls to the last frozen
+// boundary, and re-ingestion fills forward. We assert BOTH halves.
+// =============================================================================
+
+// TestConvergence_HotVolumeLossCase4 plants the case-4 state (cold survives,
+// hot dir gone), asserts the fatal fires, runs the REAL hot-only recovery, then
+// asserts the watermark heals to the last frozen boundary, a re-ingested hot DB
+// converges, and the audit is clean.
+func TestConvergence_HotVolumeLossCase4(t *testing.T) {
+	h := newConvergenceHarness(t, 0)
+
+	// Durable cold history through chunk 0 (survives on durable storage): frozen
+	// ledgers. Chunk 0's last ledger is the last frozen boundary the watermark must
+	// heal to.
+	freezeChunkArtifacts(t, h.cat, 0, KindLedgers)
+
+	// The lost live chunk 1: "ready" with its hot dir GONE (the ephemeral volume
+	// died while the meta store survived).
+	live := chunk.ID(1)
+	require.NoError(t, h.cat.PutHotTransient(live))
+	require.NoError(t, h.cat.FlipHotReady(live))
+	require.NoError(t, os.RemoveAll(h.cat.layout.HotChunkPath(live)))
+
+	// Half 1: the fatal fires (ready key + missing dir = ErrHotVolumeLost). It is
+	// NOT silently healed — derivation REFUSES rather than guessing.
+	_, err := deriveWatermark(h.cat, h.probe)
+	require.ErrorIs(t, err, ErrHotVolumeLost,
+		"a ready hot key with a missing dir must fatal as ErrHotVolumeLost")
+
+	// Half 2: the operator runs the case-4 (hot-only) recovery over the orphaned
+	// chunk. The hot key -> "transient"; the fatal stops firing.
+	_, err = h.cat.SurgicalRecovery(RecoveryRequest{Lo: live, Hi: live, Tier: RecoverHotOnly})
+	require.NoError(t, err)
+	require.Equal(t, HotTransient, mustHotState(t, h.cat, live))
+
+	// The watermark heals to chunk 0's last ledger — the last frozen boundary; no
+	// "ready" key with a missing dir remains.
+	h.requireWatermarkMatchesDurable(t, chunk.ID(0).LastLedger())
+
+	// Re-ingestion opens a fresh hot DB for the lost chunk and fills it forward.
+	db := openLiveHotDB(t, h.cat, live)
+	committed := live.FirstLedger() + 3 // seq of the single ledger written just below
+	require.NoError(t, db.Ledgers().AddLedgers(ledger.Entry{Seq: committed, Bytes: []byte("refill")}))
+	require.NoError(t, db.Close())
+
+	// The watermark now reflects the re-ingested frontier. The convergence value of
+	// this case lives in the two halves above — the ErrHotVolumeLost fatal and the
+	// watermark healing to the last frozen boundary — NOT in the tick: the cold
+	// history survived intact and the re-ingested chunk is the new live tier, so
+	// nothing is dirty for the tick to repair.
+	h.requireWatermarkMatchesDurable(t, committed)
+	h.auditClean(t) // already clean BEFORE the tick — the recovery left nothing dirty
+	before := snapshotAllKeys(t, h.cat)
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, h.cat),
+		"case 4's post-reingest tick is a no-op: nothing below the live chunk is tainted")
+	h.auditClean(t)
+	h.requireQuiescent(t)
+}
+
+// =============================================================================
+// Retention widen / shorten — the floor recomputes; convergence prunes below a
+// raised floor (shorten) and the next tick is a no-op once below-floor data is
+// gone.
+// =============================================================================
+
+// TestConvergence_RetentionShortenPrunesBelowRaisedFloor seeds several finalized
+// chunks, then SHORTENS retention so a higher floor leaves the lowest chunks
+// wholly below it. One tick prunes them (keys + files + hot DBs) and the store
+// converges to INV-2..4 clean against the NEW (shorter) retention.
+func TestConvergence_RetentionShortenPrunesBelowRaisedFloor(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Six finalized chunks (0..5) with real files, plus a live chunk 6.
+	for c := chunk.ID(0); c <= 5; c++ {
+		freezeChunkArtifacts(t, cat, c, KindLedgers)
+		writeArtifact(t, cat.layout.LedgerPackPath(c))
+	}
+	makeReadyHotDirNoData(t, cat, 1) // a below-floor hot DB too
+	live := openLiveHotDB(t, cat, 6)
+	t.Cleanup(func() { _ = live.Close() })
+
+	// Shorten retention to 2 chunks. through = chunk 5's last ledger, so floor =
+	// lastCompleteChunkAt(through)-2+1 = chunk 4's first ledger; chunks 0..3 fall
+	// wholly below it and must be pruned.
+	cfg, rec := lifecycleTestConfig(t, cat, 2)
+	h := &convergenceHarness{cat: cat, cfg: cfg, rec: rec, probe: cfg.Process.HotProbe}
+
+	h.tick(t)
+	h.auditClean(t)
+	h.requireQuiescent(t)
+
+	for c := chunk.ID(0); c <= 3; c++ {
+		require.Equal(t, State(""), mustState(t, cat, c, KindLedgers), "chunk %s pruned below the raised floor", c)
+		require.NoFileExists(t, cat.layout.LedgerPackPath(c), "chunk %s pack pruned", c)
+		has, herr := cat.Has(hotChunkKey(c))
+		require.NoError(t, herr)
+		require.False(t, has, "chunk %s hot key pruned", c)
+	}
+	for c := chunk.ID(4); c <= 5; c++ {
+		require.Equal(t, StateFrozen, mustState(t, cat, c, KindLedgers), "chunk %s in retention survives", c)
+	}
+
+	before := snapshotAllKeys(t, cat) // idempotency: a second tick must change no key
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, cat))
+	h.auditClean(t)
+}
+
+// TestConvergence_RetentionWidenIsTickNoOpAuditClean asserts the widen-side
+// claim from the tick's perspective: a lowered floor does NOT make the tick
+// prune (it never does) NOR materialize new bottom storage (that is backfill's
+// job). The tick over already-converged storage with a wider retention window is
+// a clean no-op, and the store stays INV-2..4 clean — the bottom-extension is
+// deferred to the next backfill, not the tick.
+func TestConvergence_RetentionWidenIsTickNoOpAuditClean(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Chunks 3..5 finalized (the existing bottom of storage is chunk 3), live 6.
+	for c := chunk.ID(3); c <= 5; c++ {
+		freezeChunkArtifacts(t, cat, c, KindLedgers)
+		writeArtifact(t, cat.layout.LedgerPackPath(c))
+	}
+	live := openLiveHotDB(t, cat, 6)
+	t.Cleanup(func() { _ = live.Close() })
+
+	// A WIDE retention (100 chunks) lowers the floor below chunk 3, but the tick's
+	// production range is raised to lowestMaterializedChunk (chunk 3): it must NOT
+	// try to materialize chunks 0..2 (no source) and must NOT prune anything.
+	cfg, rec := lifecycleTestConfig(t, cat, 100)
+	h := &convergenceHarness{cat: cat, cfg: cfg, rec: rec, probe: cfg.Process.HotProbe}
+
+	before := snapshotAllKeys(t, cat) // baseline: the widen tick must change no key
+	h.tick(t)
+	require.False(t, rec.fired(), "widening must not fail the tick (no source for the new bottom): %v", rec.last.Load())
+	require.Equal(t, before, snapshotAllKeys(t, cat),
+		"the tick neither prunes nor materializes on a widen — that is backfill's job")
+	h.auditClean(t)
+	h.requireQuiescent(t)
+}
+
+// =============================================================================
+// Young network — no complete chunk exists yet. The tick produces nothing (the
+// freeze stage's range is empty), and the empty store trivially satisfies
+// INV-2..4. The convergence here is "no spurious work, no fatal".
+// =============================================================================
+
+// TestConvergence_YoungNetworkNoOp seeds a network younger than one complete
+// chunk: only a live (transient/ready) hot chunk 0, no frozen artifacts, no
+// complete chunk below the live one. A tick must do nothing and the audit must
+// be clean.
+func TestConvergence_YoungNetworkNoOp(t *testing.T) {
+	h := newConvergenceHarness(t, 0)
+
+	// A live chunk 0's hot DB, mid-ingest (one ledger, not the whole chunk), so
+	// nothing below it is complete. Cleanup is registered before the fallible write.
+	db := openLiveHotDB(t, h.cat, 0)
+	t.Cleanup(func() { _ = db.Close() })
+	require.NoError(t, db.Ledgers().AddLedgers(ledger.Entry{Seq: chunk.ID(0).FirstLedger() + 2, Bytes: []byte("young")}))
+
+	// completeThrough is the genesis sentinel (no frozen, the only ready chunk is
+	// the live one whose predecessor is below genesis), so the freeze range is
+	// empty and the tick is a pure no-op.
+	through, err := deriveCompleteThrough(h.cat)
+	require.NoError(t, err)
+	require.Equal(t, preGenesisLedger, through, "no complete chunk exists on a young network")
+
+	before := snapshotAllKeys(t, h.cat)
+	h.tick(t)
+	require.Equal(t, before, snapshotAllKeys(t, h.cat), "a young-network tick is a no-op")
+	h.auditClean(t)
+	h.requireQuiescent(t)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/daemon.go b/cmd/stellar-rpc/internal/fullhistory/streaming/daemon.go
new file mode 100644
index 000000000..7df864f6d
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/daemon.go
@@ -0,0 +1,482 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+
+	"github.com/sirupsen/logrus"
+
+	"github.com/stellar/go-stellar-sdk/ingest/ledgerbackend"
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/ingest"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/metastore"
+)
+
+// RunDaemon is the full-history streaming daemon's process entrypoint — the
+// design's "Daemon flow" from a cold start. It owns everything startStreaming
+// cannot construct itself. The stages, in the design's numbering:
+//
+//  1. LOAD + form-validate the TOML config (LoadConfig).
+//  2. LOCK every configured storage root (one flock per root, design
+//     "Single-process enforcement") — fail fast if a second daemon is using one.
+//  3. OPEN the catalog store and bind the Catalog (the single durable-state view
+//     both startup and the lifecycle goroutine read).
+//  4. validateConfig — the stateful config gate: pin the two immutable layout
+//     values on first start, confirm them unchanged on restart, and resolve the
+//     earliest_ledger floor (consulting the bulk backend's tip for "now"/numeric
+//     floors). It pins config:earliest_ledger BEFORE startStreaming reads it.
+//  5. BUILD the production boundaries (captive core, the bulk ChunkSource +
+//     its tip/coverage adapter, the read server) — injectable so a test drives
+//     the whole flow with fakes. Runs BEFORE step 4, which needs NetworkTip.
+//  6. RUN the supervised startStreaming loop: startStreaming returns nil only on
+//     a clean shutdown (ctx canceled); any other return is a restartable error
+//     this loop surfaces and retries on a backoff, which is the design's
+//     "startup is the recovery path" (a fresh start re-runs catch-up + the first
+//     lifecycle tick, finishing crash debris and pruning downtime leftovers).
+//
+// The locks are held for the daemon's whole life (released on return). ctx
+// cancellation propagates cleanly through every stage: a cancel during the
+// supervised loop returns nil (clean shutdown), a cancel mid-build returns the
+// build error.
+func RunDaemon(ctx context.Context, configPath string) error {
+	return RunDaemonWith(ctx, configPath, DaemonOptions{})
+}
+
+// DaemonOptions carries the daemon's injectable seams. Production leaves every
+// field zero (RunDaemon), so the real captive core / bulk backend / RPC server
+// are wired by buildProductionBoundaries. Tests set BuildBoundaries (and,
+// optionally, RestartBackoff) to drive the whole RunDaemon flow — config load,
+// locking, validateConfig, the supervised loop — against fakes, without standing
+// up captive core or a real object store.
+type DaemonOptions struct {
+	// BuildBoundaries assembles the injected external boundaries from the loaded
+	// config, the resolved paths, the bound catalog, and the logger. nil ⇒
+	// buildProductionBoundaries (the real captive core + bulk datastore source).
+	// A test passes fakes here to exercise RunDaemon end to end.
+	BuildBoundaries func(
+		ctx context.Context, cfg Config, paths Paths, cat *Catalog, logger *supportlog.Entry,
+	) (Boundaries, error)
+
+	// RestartBackoff is the supervised loop's inter-restart sleep after a
+	// restartable startStreaming error. Zero or negative ⇒ defaultRestartBackoff.
+	// A clean shutdown (ctx canceled) never sleeps.
+	RestartBackoff time.Duration
+
+	// Logger overrides the daemon logger. nil ⇒ a logger built from
+	// [logging].level / [logging].format.
+	Logger *supportlog.Entry
+
+	// Metrics is the streaming control-plane observability sink threaded into
+	// catch-up, the ingestion loop, and the lifecycle tick. nil ⇒ nopMetrics (the
+	// daemon runs uninstrumented). Production wires a *PrometheusMetrics built from
+	// the daemon's MetricsRegistry via NewPrometheusMetrics; tests pass a recorder
+	// to assert the phase signals.
+	Metrics Metrics
+}
+
+const defaultRestartBackoff = 5 * time.Second // fallback when DaemonOptions.RestartBackoff <= 0
+
+// Boundaries bundles the external boundaries injected into startStreaming and
+// validateConfig. buildProductionBoundaries fills them from a Config;
+// startConfig threads them into the StartConfig startStreaming consumes. They
+// are gathered here (rather than passed positionally) so the production builder
+// and a test builder return the same shape and RunDaemon wires it one way.
+type Boundaries struct {
+	// NetworkTip samples the bulk backend's current network tip — consulted by
+	// validateConfig (resolving "now"/numeric floors) and by catch-up. Required.
+	NetworkTip NetworkTipBackend
+
+	// BackendWaiter bounds backfillSource's wait-for-coverage on a backend-only
+	// chunk. Required when Backend is set — validate enforces it; ProcessConfig pairs them.
+	BackendWaiter BackendWaiter
+
+	// Backend is the bulk LedgerBackend as a ChunkSource (BSB by default), the
+	// only source for a chunk with no local copy. May be nil in a frontfill-only
+	// deployment that never backfills.
+	Backend ingest.ChunkSource
+
+	// Core starts captive core at the resume ledger and yields the live getter
+	// the ingestion loop polls. Required.
+	Core CoreOpener
+
+	// ServeReads launches the RPC read server (it must return promptly, not block
+	// until shutdown). Required.
+	//
+	// TODO(#772): this is the v1-cutover seam. Today buildProductionBoundaries
+	// supplies a no-op ServeReads — the SQLite read path is still the v1 daemon's
+	// (cmd/.../internal/daemon/daemon.go), and the full SQLite→full-history
+	// cutover is issue #772. When #772 flips the read path, ServeReads wires the
+	// full-history RPC handlers here; nothing else in this entrypoint changes.
+	ServeReads func(ctx context.Context) error
+}
+
+// validate returns an error naming the first missing boundary: NetworkTip, Core
+// and ServeReads are always required; BackendWaiter only when Backend is set.
+func (b Boundaries) validate() error {
+	switch {
+	case b.NetworkTip == nil:
+		return errors.New("streaming: Boundaries.NetworkTip is nil")
+	case b.Core == nil:
+		return errors.New("streaming: Boundaries.Core is nil")
+	case b.ServeReads == nil:
+		return errors.New("streaming: Boundaries.ServeReads is nil")
+	case b.BackendWaiter == nil && b.Backend != nil:
+		return errors.New("streaming: Boundaries.BackendWaiter is required when Backend is set")
+	}
+	return nil
+}
+
+// RunDaemonWith is RunDaemon with explicit options — the seam tests drive. Stage
+// numbers follow RunDaemon's doc; 5a (build) runs before 4, which needs NetworkTip.
+func RunDaemonWith(ctx context.Context, configPath string, opts DaemonOptions) error {
+	// --- 1. Load + form-validate the config. ---
+	cfg, err := LoadConfig(configPath)
+	if err != nil {
+		return err
+	}
+	if cfg.Service.DefaultDataDir == "" {
+		return errors.New("streaming: [service].default_data_dir is required")
+	}
+
+	logger := opts.Logger
+	if logger == nil {
+		logger, err = newLogger(cfg.Logging)
+		if err != nil {
+			return err
+		}
+	}
+
+	paths := cfg.ResolvePaths()
+
+	// --- 2. Lock every configured storage root for the daemon's whole life. ---
+	locks, err := LockRoots(paths.LockRoots()...)
+	if err != nil {
+		return err
+	}
+	defer locks.Release() // deferred first, so it runs last — after the catalog store is closed
+
+	// --- 3. Open the catalog store and bind the catalog. ---
+	store, err := metastore.New(paths.Catalog, logger)
+	if err != nil {
+		return fmt.Errorf("streaming: open catalog %q: %w", paths.Catalog, err)
+	}
+	defer func() { _ = store.Close() }() // best-effort: the Close error is discarded
+
+	cat := NewCatalog(store, NewLayoutFromPaths(paths))
+
+	// --- 5a. Build the external boundaries (validateConfig needs NetworkTip). ---
+	build := opts.BuildBoundaries
+	if build == nil {
+		build = buildProductionBoundaries
+	}
+	boundaries, err := build(ctx, cfg, paths, cat, logger)
+	if err != nil {
+		return fmt.Errorf("streaming: build boundaries: %w", err)
+	}
+	if err := boundaries.validate(); err != nil {
+		return err
+	}
+
+	tipBackoff, tipMaxAttempts := defaultTipBackoff, defaultTipMaxAttempts
+
+	// --- 4. validateConfig: pin/confirm the layout, resolve the earliest floor. ---
+	if _, err := validateConfig(ctx, cfg, cat, boundaries.NetworkTip, tipBackoff, tipMaxAttempts); err != nil {
+		return err
+	}
+
+	// --- 5b/6. Assemble the StartConfig and run the supervised startStreaming loop. ---
+	start := startConfig(cfg, cat, logger, boundaries, opts.Metrics, tipBackoff, tipMaxAttempts)
+
+	backoff := opts.RestartBackoff
+	if backoff <= 0 {
+		backoff = defaultRestartBackoff
+	}
+	return superviseStreaming(ctx, start, logger, backoff)
+}
+
+// startConfig threads the loaded Config, the bound catalog/logger, and the
+// assembled boundaries into the StartConfig startStreaming consumes. Exec and
+// Lifecycle.ExecConfig are the SAME ExecConfig value (one catalog, logger,
+// metrics sink, worker/retry settings, and ProcessConfig), per the design's
+// "catch-up and the lifecycle goroutine share one set of postconditions".
+func startConfig(
+	cfg Config, cat *Catalog, logger *supportlog.Entry, b Boundaries, metrics Metrics,
+	tipBackoff time.Duration, tipMaxAttempts int,
+) StartConfig {
+	exec := ExecConfig{
+		Catalog:    cat,
+		Logger:     logger,
+		Metrics:    metricsOrNop(metrics),
+		Workers:    derefInt(cfg.Backfill.Workers),
+		MaxRetries: derefInt(cfg.Backfill.MaxRetries),
+		Process: ProcessConfig{
+			HotProbe:      NewRocksHotProbe(cat.Layout().HotChunkPath, logger),
+			Backend:       b.Backend,
+			BackendWaiter: b.BackendWaiter,
+		},
+	}
+	life := LifecycleConfig{
+		ExecConfig:      exec,
+		RetentionChunks: derefU32(cfg.Streaming.RetentionChunks),
+	}
+	return StartConfig{
+		Exec:           exec,
+		Lifecycle:      life,
+		NetworkTip:     b.NetworkTip,
+		Core:           b.Core,
+		ServeReads:     b.ServeReads,
+		TipBackoff:     tipBackoff,
+		TipMaxAttempts: tipMaxAttempts,
+	}
+}
+
+// superviseStreaming is the daemon's top-level loop: it runs startStreaming and,
+// per the design ("startup is the recovery path"), restarts it after a fixed
+// backoff whenever it returns a restartable error. It returns nil when
+// startStreaming returns nil, when ctx is already canceled by the time an error
+// comes back (that error is dropped), or when ctx is canceled during the backoff.
+//
+// It does NOT swallow the fatal sentinels (ErrHotVolumeLost, ErrFirstStartNoTip)
+// while ctx is live: those are returned UP so an operator/supervisor sees them.
+// Every other error is logged at warn level and retried, on the premise that a
+// fresh start converges; there is no retry cap and the backoff does not grow.
+func superviseStreaming(
+	ctx context.Context, start StartConfig, logger *supportlog.Entry, backoff time.Duration,
+) error {
+	for {
+		err := startStreaming(ctx, start)
+		if err == nil {
+			return nil // clean shutdown
+		}
+		if ctx.Err() != nil {
+			//nolint:nilerr // ctx canceled is a clean shutdown, not an error to surface
+			return nil
+		}
+		// Unrecoverable: surface up rather than spin restarting on a condition a
+		// fresh start cannot heal.
+		if errors.Is(err, ErrHotVolumeLost) || errors.Is(err, ErrFirstStartNoTip) {
+			return err
+		}
+		logger.WithError(err).Warnf("streaming: daemon run failed; restarting in %s", backoff)
+		timer := time.NewTimer(backoff)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return nil
+		case <-timer.C:
+		}
+	}
+}
+
+// ---------------------------------------------------------------------------
+// Production boundary construction.
+// ---------------------------------------------------------------------------
+
+// buildProductionBoundaries assembles the real external boundaries from the
+// loaded config; its ctx, Paths, and Catalog arguments are currently unused.
+// As written today it wires:
+//
+//   - Core: the captiveCoreOpener built by newCaptiveCoreOpener from
+//     [streaming].captive_core_config; a constructor error aborts the build.
+//   - ServeReads: a no-op that returns nil (see the TODO in the body).
+//   - NetworkTip: notConfiguredTip, whose every sample is a not-configured
+//     error.
+//   - Backend / BackendWaiter: left nil — no bulk datastore source is wired
+//     here yet.
+//
+// TODO(#772): the bulk-backend TIP boundary is still entangled with config that
+// does not yet exist on this branch (the datastore TYPE + schema — only
+// [backfill.bsb].bucket_path is in Config today) and with the lake
+// tip-resolution the v1 path performs differently. The captive-core constructor
+// is likewise deferred: newCaptiveCoreOpener currently ALWAYS returns an error,
+// so this builder cannot succeed yet. Until #772 lands the cutover, every
+// deployment must supply Core (and, for catch-up against a real lake,
+// NetworkTip/BackendWaiter/Backend) through DaemonOptions.BuildBoundaries
+// instead of relying on this builder.
+func buildProductionBoundaries(
+	_ context.Context, cfg Config, _ Paths, _ *Catalog, logger *supportlog.Entry,
+) (Boundaries, error) {
+	core, err := newCaptiveCoreOpener(cfg.Streaming.CaptiveCoreConfig, logger)
+	if err != nil {
+		return Boundaries{}, err
+	}
+
+	b := Boundaries{
+		Core: core,
+		// TODO(#772): wire the full-history RPC read server. The SQLite read path
+		// is still the v1 daemon's; until the #772 cutover, serving is a no-op here
+		// so the streaming daemon ingests + freezes without double-serving reads.
+		ServeReads: func(context.Context) error { return nil },
+	}
+
+	// The bulk tip/coverage/source. No bulk backend is wired in this builder
+	// yet, so NetworkTip is ALWAYS the explicit not-configured error (catch-up
+	// classifies it first-start-fatal vs degrade), and Backend/BackendWaiter stay
+	// nil (backfillSource errors loudly only if a chunk actually reaches the bulk
+	// branch).
+	tip := &notConfiguredTip{}
+	b.NetworkTip = tip
+	return b, nil
+}
+
+// captiveCoreOpener is the production CoreOpener: OpenCore prepares its backend
+// from the resume ledger and hands back a LedgerGetter the ingestion loop polls
+// by sequence (the design's core.GetLedger(ctx, seq)) plus a closer.
+type captiveCoreOpener struct {
+	backend ledgerbackend.LedgerBackend // prepared and closed via OpenCore; used without a nil check
+}
+
+//nolint:unparam // returns (nil, err) until the #772 captive-core wiring lands
+func newCaptiveCoreOpener(captiveCoreConfigPath string, _ *supportlog.Entry) (*captiveCoreOpener, error) {
+	if captiveCoreConfigPath == "" {
+		return nil, errors.New("streaming: [streaming].captive_core_config is required")
+	}
+	// TODO(#772): the captive-core CaptiveCoreConfig (binary path, network
+	// passphrase, history-archive URLs, storage path) is assembled from the v1
+	// daemon config today; threading those through the streaming Config is part
+	// of the cutover. Until then this function only checks that the path is
+	// non-empty — the file is never opened or parsed — and always returns an
+	// error. Once the fields are in Config, build a ledgerbackend.CaptiveCoreConfig
+	// from NewCaptiveCoreTomlFromFile(captiveCoreConfigPath, ...) and NewCaptive
+	// here; OpenCore already performs PrepareRange(UnboundedRange(resume)).
+	return nil, fmt.Errorf("streaming: production captive-core wiring is deferred to #772 "+
+		"(config %q is not loaded yet; pass a CoreOpener via DaemonOptions.BuildBoundaries to run today)",
+		captiveCoreConfigPath)
+}
+
+// OpenCore prepares the backend over the unbounded range starting at resumeLedger
+// and returns a backendGetter over it plus the backend's Close as the closer.
+func (c *captiveCoreOpener) OpenCore(
+	ctx context.Context, resumeLedger uint32,
+) (LedgerGetter, func() error, error) {
+	if err := c.backend.PrepareRange(ctx, ledgerbackend.UnboundedRange(resumeLedger)); err != nil {
+		return nil, nil, fmt.Errorf("streaming: captive core prepare range from %d: %w", resumeLedger, err) // backend is NOT closed here
+	}
+	return backendGetter{backend: c.backend}, c.backend.Close, nil
+}
+
+// backendGetter adapts a ledgerbackend.LedgerBackend to LedgerGetter: GetLedger
+// fetches one ledger from the backend and returns its binary-marshaled bytes.
+type backendGetter struct {
+	backend ledgerbackend.LedgerBackend // the wrapped backend; GetLedger calls it directly
+}
+
+func (g backendGetter) GetLedger(ctx context.Context, seq uint32) (xdr.LedgerCloseMetaView, error) {
+	lcm, err := g.backend.GetLedger(ctx, seq)
+	if err != nil {
+		return nil, err // backend error is passed through unwrapped
+	}
+	raw, err := lcm.MarshalBinary() // re-encode the ledger the backend returned
+	if err != nil {
+		return nil, fmt.Errorf("streaming: marshal ledger %d: %w", seq, err)
+	}
+	return xdr.LedgerCloseMetaView(raw), nil // plain type conversion of the marshaled bytes
+}
+
+// notConfiguredTip is the NetworkTipBackend placeholder used when no bulk
+// backend is wired: every sample returns a clear not-configured error. It is
+// the honest stand-in until the #772 cutover wires the real lake tip.
+//
+// It is benign for the genesis-floor steady state: validateConfig resolves a
+// genesis floor without a tip, and once there is local progress catch-up
+// degrades on a tip error rather than fatals. It DOES block the cases that
+// genuinely require a tip — a first-start "now"/numeric floor (validateConfig
+// must resolve it) and a catch-up that needs to extend storage downward — which
+// is correct: those cannot proceed against a backend that was never configured.
+// A deployment needing either must wire a real NetworkTip via
+// DaemonOptions.BuildBoundaries (or wait for #772).
+type notConfiguredTip struct{}
+
+func (notConfiguredTip) NetworkTip(context.Context) (uint32, error) {
+	return 0, errors.New("streaming: no bulk backend configured ([backfill.bsb].bucket_path empty); " +
+		"cannot sample the network tip (configure a backend, or this is a frontfill-only deployment)") // NOTE(review): bucket_path is not inspected here — confirm the message holds at every use site
+}
+
+// ---------------------------------------------------------------------------
+// Bulk-backend tip/coverage adapter. Production wires these over a real
+// ledgerbackend.LedgerBackend (a BufferedStorageBackend); they are split out so
+// the #772 cutover can hand RunDaemon a prepared backend and reuse them verbatim.
+// ---------------------------------------------------------------------------
+
+// backendTip adapts a ledgerbackend.LedgerBackend to NetworkTipBackend +
+// BackendWaiter. NetworkTip reads the backend's latest available ledger;
+// WaitForCoverage polls it until the tip covers a target ledger or ctx/deadline
+// elapses.
+type backendTip struct {
+	backend   ledgerbackend.LedgerBackend
+	pollEvery time.Duration // pause between coverage polls in WaitForCoverage
+	deadline  time.Duration // total time WaitForCoverage may wait, measured from its call
+}
+
+// newBackendTip wraps a prepared LedgerBackend. pollEvery is the coverage-poll
+// interval; deadline bounds WaitForCoverage. Non-positive values fall back to
+// the defaults: a 1s poll interval and a 10-minute deadline.
+func newBackendTip(backend ledgerbackend.LedgerBackend, pollEvery, deadline time.Duration) *backendTip {
+	if pollEvery <= 0 {
+		pollEvery = time.Second
+	}
+	if deadline <= 0 {
+		deadline = 10 * time.Minute
+	}
+	return &backendTip{backend: backend, pollEvery: pollEvery, deadline: deadline}
+}
+
+func (t *backendTip) NetworkTip(ctx context.Context) (uint32, error) {
+	return t.backend.GetLatestLedgerSequence(ctx) // the backend's latest sequence (and its error) is returned as-is
+}
+
+// WaitForCoverage polls the backend tip every pollEvery until it covers
+// chunkLastLedger, returning ErrBackendCoverageTimeout (wrapped, with the last
+// tip and sample error) past the deadline. A chunk with a local copy never
+// reaches here, so this never gates a restart whose range is entirely local.
+func (t *backendTip) WaitForCoverage(ctx context.Context, chunkLastLedger uint32) error {
+	deadline := time.Now().Add(t.deadline)
+	for {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		tip, err := t.backend.GetLatestLedgerSequence(ctx) // a sample error is retried, not returned
+		if err == nil && tip >= chunkLastLedger {
+			return nil
+		}
+		if time.Now().After(deadline) {
+			return fmt.Errorf("%w: tip never reached ledger %d within %s (last tip %d, last error: %s)",
+				ErrBackendCoverageTimeout, chunkLastLedger, t.deadline, tip, fmt.Sprint(err)) // Sprint: err may be nil
+		}
+		timer := time.NewTimer(t.pollEvery)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return ctx.Err()
+		case <-timer.C:
+		}
+	}
+}
+
+// newLogger builds a daemon logger from [logging]: a logrus level name, JSON output iff format is "json".
+func newLogger(cfg LoggingConfig) (*supportlog.Entry, error) {
+	level, err := logrus.ParseLevel(cfg.Level)
+	if err != nil {
+		return nil, fmt.Errorf("streaming: invalid logging.level %q: %w", cfg.Level, err)
+	}
+	logger := supportlog.New()
+	logger.SetLevel(level)
+	if cfg.Format == "json" { // any other format value keeps the logger's default formatter
+		logger.UseJSONFormatter()
+	}
+	return logger, nil
+}
+
+// Compile-time assertions: each production adapter (and the not-configured tip
+// placeholder) satisfies the injected interface it is assigned to below.
+var (
+	_ CoreOpener        = (*captiveCoreOpener)(nil)
+	_ LedgerGetter      = backendGetter{}
+	_ NetworkTipBackend = (*backendTip)(nil)
+	_ BackendWaiter     = (*backendTip)(nil)
+	_ NetworkTipBackend = notConfiguredTip{}
+)
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/daemon_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/daemon_test.go
new file mode 100644
index 000000000..ff384d5a6
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/daemon_test.go
@@ -0,0 +1,444 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/go-stellar-sdk/ingest/ledgerbackend"
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/metastore"
+)
+
+// openMetaAt opens a metastore.Store at path for read-back assertions; the caller closes it.
+func openMetaAt(t *testing.T, path string) (*metastore.Store, error) {
+	t.Helper()
+	return metastore.New(path, silentLogger())
+}
+
+// writeTempConfig writes a minimal-but-valid streaming-daemon TOML rooted at a
+// fresh temp data dir and returns the config path plus the data dir; extra is
+// appended verbatim after the [logging] table. A genesis earliest_ledger needs
+// no tip, so the daemon validates and starts without a reachable backend.
+//
+//nolint:nonamedreturns // named outputs label the (config path, data dir) pair
+func writeTempConfig(t *testing.T, extra string) (configPath, dataDir string) {
+	t.Helper()
+	dataDir = t.TempDir()
+	configPath = filepath.Join(t.TempDir(), "daemon.toml")
+	body := fmt.Sprintf(`
+[service]
+default_data_dir = %q
+
+[streaming]
+earliest_ledger = "genesis"
+captive_core_config = "/dev/null"
+
+[logging]
+level = "debug"
+format = "text"
+%s
+`, dataDir, extra)
+	require.NoError(t, os.WriteFile(configPath, []byte(body), 0o644))
+	return configPath, dataDir
+}
+
+// capturedBuild supplies a BuildBoundaries func (its build method) that hands
+// RunDaemon faked external boundaries: a young-network tip ⇒ no backfill, the
+// injected fake core, and a ServeReads that counts its calls. It also records
+// the config/paths the daemon passed the builder, so a test asserts the daemon
+// threaded LoadConfig+ResolvePaths through correctly.
+type capturedBuild struct {
+	called   atomic.Int32
+	gotCfg   Config
+	gotPaths Paths
+	served   atomic.Int32
+	core     *fakeCore
+}
+
+func (c *capturedBuild) build(
+	_ context.Context, cfg Config, paths Paths, _ *Catalog, _ *supportlog.Entry,
+) (Boundaries, error) {
+	c.called.Add(1) // count builder invocations
+	c.gotCfg = cfg
+	c.gotPaths = paths
+	return Boundaries{
+		// A young-network tip (inside chunk 0) ⇒ backfill is a no-op, so the
+		// daemon needs no real backend to reach serve+ingest.
+		NetworkTip: &fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq + 10}},
+		Core:       c.core,
+		ServeReads: func(context.Context) error { c.served.Add(1); return nil },
+	}, nil
+}
+
+// ---------------------------------------------------------------------------
+// RunDaemonWith — the full entrypoint flow against faked boundaries.
+// ---------------------------------------------------------------------------
+
+// The happy path: load TOML → lock → open meta store → build boundaries →
+// validateConfig (pins the genesis floor) → startStreaming → clean shutdown on
+// ctx cancel. Asserts the daemon pinned the layout, served reads, started core
+// at genesis, and threaded the resolved config/paths into the boundary builder.
+func TestRunDaemon_LoadValidateWireStartCleanShutdown(t *testing.T) {
+	configPath, dataDir := writeTempConfig(t, "")
+
+	capture := &capturedBuild{core: &fakeCore{getter: &fakeLedgerGetter{frames: map[uint32][]byte{}, blockOnCtx: true}}}
+	opts := DaemonOptions{BuildBoundaries: capture.build, Logger: silentLogger()}
+
+	ctx, cancel := context.WithCancel(context.Background())
+	errCh := make(chan error, 1) // buffered so the goroutine never blocks on send
+	go func() { errCh <- RunDaemonWith(ctx, configPath, opts) }()
+
+	// Wait until reads are served (the daemon is parked on the blocking stream).
+	require.Eventually(t, func() bool { return capture.served.Load() == 1 }, 3*time.Second, 5*time.Millisecond)
+	cancel()
+
+	select {
+	case err := <-errCh:
+		require.NoError(t, err, "ctx cancel is a clean shutdown")
+	case <-time.After(3 * time.Second):
+		t.Fatal("RunDaemonWith did not return after ctx cancel")
+	}
+
+	assert.Equal(t, int32(1), capture.called.Load(), "boundary builder invoked once")
+	assert.Equal(t, int32(1), capture.served.Load(), "reads served once")
+	assert.Equal(t, int32(1), capture.core.openedCount.Load(), "captive core started once")
+	assert.Equal(t, uint32(chunk.FirstLedgerSeq), capture.core.resumeSeen.Load(),
+		"resume ledger is genesis on a fresh start")
+
+	// The daemon threaded the loaded config + resolved paths into the builder.
+	assert.Equal(t, dataDir, capture.gotCfg.Service.DefaultDataDir)
+	assert.Equal(t, filepath.Join(dataDir, "hot"), capture.gotPaths.HotStorage)
+	assert.Equal(t, filepath.Join(dataDir, "catalog", "rocksdb"), capture.gotPaths.Catalog)
+
+	// Reopen the catalog (the daemon has returned) to check the pinned floor.
+	store, err := openMetaAt(t, capture.gotPaths.Catalog)
+	require.NoError(t, err)
+	defer func() { _ = store.Close() }()
+	cat := NewCatalog(store, NewLayout(dataDir))
+	earliest, pinned, err := cat.EarliestLedger()
+	require.NoError(t, err)
+	require.True(t, pinned, "validateConfig must pin earliest_ledger before startStreaming")
+	assert.Equal(t, uint32(chunk.FirstLedgerSeq), earliest)
+}
+
+// Storage-path overrides must be HONORED by the data path, not just locked. The
+// daemon resolves [catalog]/[immutable_storage.*]/[streaming.hot_storage]
+// overrides into Paths, flocks them, and binds the Catalog via
+// NewLayoutFromPaths(paths) — so the Layout the data path reads/writes must
+// place every artifact and the hot DB under the OVERRIDE, never under DataDir.
+// Before the fix the Layout derived all paths from DataDir alone: the lock and
+// the data location diverged silently. This test pins both halves: (1) the
+// bound Layout's paths all live under the overrides, and (2) actually opening a
+// hot DB through the data path (openHotTierForChunk) lands the dir under the hot
+// override with NOTHING under {DataDir}/hot.
+func TestRunDaemon_StoragePathOverridesHonored(t *testing.T) {
+	dataDir := t.TempDir()
+	overrideRoot := t.TempDir() // a distinct mount, e.g. /mnt/nvme
+	hotOverride := filepath.Join(overrideRoot, "hot")
+	coldOverride := filepath.Join(overrideRoot, "cold")
+	catalogOverride := filepath.Join(overrideRoot, "meta")
+
+	cfg := Config{
+		Service:          ServiceConfig{DefaultDataDir: dataDir},
+		Catalog:          CatalogConfig{Path: catalogOverride},
+		ImmutableStorage: ImmutableStorageConfig{Path: coldOverride},
+		Streaming:        StreamingConfig{HotStorage: StoragePathConfig{Path: hotOverride}},
+	}.WithDefaults()
+
+	paths := cfg.ResolvePaths()
+	layout := NewLayoutFromPaths(paths) // exactly the daemon's binding
+
+	// (1) Every path the Layout composes lives under the override, NOT DataDir.
+	const cid = chunk.ID(5350)
+	assert.Equal(t, catalogOverride, layout.CatalogPath())
+	assert.Equal(t, hotOverride, layout.HotRoot())
+	assert.Equal(t, filepath.Join(hotOverride, cid.String()), layout.HotChunkPath(cid))
+	ledgersRoot := filepath.Join(coldOverride, "ledgers") // ledgers is a fixed subdir of the cold root
+	assert.Equal(t, filepath.Join(ledgersRoot, cid.BucketID(), cid.String()+".pack"),
+		layout.LedgerPackPath(cid))
+	assert.Equal(t, ledgersRoot, layout.LedgersRoot())
+	// The hot chunk path must not resolve under {DataDir}/hot (only hot is checked here).
+	assert.NotEqual(t, filepath.Join(dataDir, "hot", cid.String()), layout.HotChunkPath(cid))
+
+	// (2) The data path actually creates the hot DB under the override. Bind a
+	// real catalog on this Layout and open a hot tier through the same call the
+	// ingestion loop uses.
+	store, err := metastore.New(paths.Catalog, silentLogger())
+	require.NoError(t, err)
+	defer func() { _ = store.Close() }()
+	cat := NewCatalog(store, layout)
+
+	db, err := openHotTierForChunk(cat, cid, silentLogger())
+	require.NoError(t, err)
+	require.NoError(t, db.Close())
+
+	// The hot DB dir exists under the override...
+	hotDir := filepath.Join(hotOverride, cid.String())
+	info, err := os.Stat(hotDir)
+	require.NoError(t, err, "hot DB must be created under the hot_storage override")
+	assert.True(t, info.IsDir())
+	// ...and NOTHING was written under {DataDir}/hot (the old, buggy location).
+	_, err = os.Stat(filepath.Join(dataDir, "hot"))
+	assert.True(t, os.IsNotExist(err), "no hot data may land under DataDir when an override is set")
+}
+
+// A second daemon on the same data dir fails fast on the storage-root flock — the
+// single-process invariant the entrypoint must enforce before building boundaries.
+func TestRunDaemon_LockContentionFailsFast(t *testing.T) {
+	configPath, dataDir := writeTempConfig(t, "")
+
+	// Hold the hot-root lock as a "first daemon" for the test's duration.
+	paths := Paths{HotStorage: filepath.Join(dataDir, "hot")}
+	locks, err := LockRoots(paths.HotStorage)
+	require.NoError(t, err)
+	defer locks.Release()
+
+	capture := &capturedBuild{core: &fakeCore{}}
+	err = RunDaemonWith(context.Background(), configPath,
+		DaemonOptions{BuildBoundaries: capture.build, Logger: silentLogger()})
+	require.ErrorIs(t, err, ErrRootLocked)
+	assert.Zero(t, capture.called.Load(), "boundary build never reached when a root is locked")
+}
+
+// A first start with a missing tip and a "now" floor is fatal at validateConfig:
+// "now" cannot resolve without a reachable backend, and the daemon must surface
+// it rather than start serving an empty history.
+func TestRunDaemon_NowFloorRequiresTip(t *testing.T) {
+	configPath, _ := writeTempConfigNow(t)
+
+	capture := &capturedBuild{core: &fakeCore{}}
+	// Reuse capture.build's boundaries, then swap in a tip that always errors.
+	build := func(_ context.Context, cfg Config, paths Paths, c *Catalog, l *supportlog.Entry) (Boundaries, error) {
+		b, _ := capture.build(context.Background(), cfg, paths, c, l) //nolint:contextcheck // fresh ctx is intentional (test)
+		b.NetworkTip = &fakeTipBackend{err: errors.New("unreachable"), errFirst: 99}
+		return b, nil
+	}
+	err := RunDaemonWith(context.Background(), configPath,
+		DaemonOptions{BuildBoundaries: build, Logger: silentLogger(), RestartBackoff: time.Millisecond})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "now") // loose match: only requires the text "now" in the error
+}
+
+//nolint:nonamedreturns // named outputs label the (config path, data dir) pair
+func writeTempConfigNow(t *testing.T) (configPath, dataDir string) {
+	t.Helper()
+	dataDir = t.TempDir() // the data dir and the config file live in separate temp dirs
+	configPath = filepath.Join(t.TempDir(), "daemon.toml")
+	body := fmt.Sprintf(`
+[service]
+default_data_dir = %q
+[streaming]
+earliest_ledger = "now"
+captive_core_config = "/dev/null"
+`, dataDir)
+	require.NoError(t, os.WriteFile(configPath, []byte(body), 0o644)) // no [logging] table is written here
+	return configPath, dataDir
+}
+
+// A boundary-build failure surfaces wrapped (errors.Is still matches the cause):
+// the daemon cannot start without its external boundaries.
+func TestRunDaemon_BuildBoundariesError(t *testing.T) {
+	configPath, _ := writeTempConfig(t, "")
+	wantErr := errors.New("captive core binary missing")
+	build := func(context.Context, Config, Paths, *Catalog, *supportlog.Entry) (Boundaries, error) {
+		return Boundaries{}, wantErr
+	}
+	err := RunDaemonWith(context.Background(), configPath,
+		DaemonOptions{BuildBoundaries: build, Logger: silentLogger()})
+	require.ErrorIs(t, err, wantErr)
+}
+
+// A missing default_data_dir is rejected before any store opens.
+func TestRunDaemon_RequiresDataDir(t *testing.T) {
+	configPath := filepath.Join(t.TempDir(), "daemon.toml")
+	require.NoError(t, os.WriteFile(configPath, []byte(`
+[streaming]
+earliest_ledger = "genesis"
+captive_core_config = "/dev/null"
+`), 0o644)) // deliberately omits the [service] table
+	err := RunDaemonWith(context.Background(), configPath, DaemonOptions{Logger: silentLogger()})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "default_data_dir")
+}
+
+// A nonexistent config path errors at load.
+func TestRunDaemon_MissingConfigFile(t *testing.T) {
+	err := RunDaemonWith(context.Background(), "/no/such/config.toml", DaemonOptions{Logger: silentLogger()})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "read config") // presumably LoadConfig's wrap text — confirm against LoadConfig
+}
+
+// ---------------------------------------------------------------------------
+// superviseStreaming — the top-level restart loop.
+// ---------------------------------------------------------------------------
+
+// A restartable error retries on a backoff, then a ctx cancel (during the
+// backoff or mid-attempt) returns nil — no restart after a shutdown request.
+func TestSuperviseStreaming_RetriesThenCleanShutdown(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+
+	var attempts atomic.Int32
+	core := &fakeCore{openErr: errors.New("transient core open failure")}
+	tip := &fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq + 10}} // young: no backfill
+	start := startTestConfig(t, cat, tip, core, nil)
+	// Attempts are counted below via core.openedCount (one open per attempt past
+	// backfill); openErr makes each attempt a restartable failure. Serving is a no-op.
+	start.ServeReads = func(context.Context) error { return nil }
+
+	ctx, cancel := context.WithCancel(context.Background())
+	errCh := make(chan error, 1)
+	go func() { errCh <- superviseStreaming(ctx, start, silentLogger(), 5*time.Millisecond) }()
+
+	// Let a few restarts happen, then cancel.
+	require.Eventually(t, func() bool {
+		attempts.Store(core.openedCount.Load()) // mirror the fake core's open counter
+		return attempts.Load() >= 2
+	}, 3*time.Second, 5*time.Millisecond)
+	cancel()
+
+	select {
+	case err := <-errCh:
+		require.NoError(t, err, "ctx cancel during backoff returns nil")
+	case <-time.After(3 * time.Second):
+		t.Fatal("superviseStreaming did not return after cancel")
+	}
+	assert.GreaterOrEqual(t, core.openedCount.Load(), int32(2), "restarted on the transient failure")
+}
+
+// The fatal sentinels are surfaced UP, not retried (a fresh start cannot heal
+// them).
+func TestSuperviseStreaming_FatalSentinelSurfaces(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// Unreachable tip + no local progress ⇒ ErrFirstStartNoTip, a fatal that must
+	// surface rather than spin.
+	tip := &fakeTipBackend{err: errors.New("unreachable"), errFirst: 99}
+	start := startTestConfig(t, cat, tip, &fakeCore{}, nil)
+
+	err := superviseStreaming(context.Background(), start, silentLogger(), time.Hour) // hour backoff: a retry would hang the test
+	require.ErrorIs(t, err, ErrFirstStartNoTip, "fatal sentinel surfaces immediately, no retry")
+}
+
+// ---------------------------------------------------------------------------
+// backendTip — the production tip/coverage adapter over a LedgerBackend.
+// ---------------------------------------------------------------------------
+
+// fakeLedgerBackend is a minimal ledgerbackend.LedgerBackend whose latest ledger
+// is programmable; its err field, when set, fails GetLatestLedgerSequence instead.
+type fakeLedgerBackend struct {
+	latest atomic.Uint32
+	err    error
+}
+
+func (b *fakeLedgerBackend) GetLatestLedgerSequence(context.Context) (uint32, error) {
+	if b.err != nil {
+		return 0, b.err // a configured error takes precedence over latest
+	}
+	return b.latest.Load(), nil
+}
+
+func (b *fakeLedgerBackend) GetLedger(context.Context, uint32) (xdr.LedgerCloseMeta, error) {
+	return xdr.LedgerCloseMeta{}, errors.New("not implemented")
+}
+func (b *fakeLedgerBackend) PrepareRange(context.Context, ledgerbackend.Range) error { return nil }
+func (b *fakeLedgerBackend) IsPrepared(context.Context, ledgerbackend.Range) (bool, error) {
+	return true, nil
+}
+func (b *fakeLedgerBackend) Close() error { return nil }
+
+func TestBackendTip_NetworkTip(t *testing.T) {
+	be := &fakeLedgerBackend{}
+	be.latest.Store(123_456) // the value NetworkTip must report unchanged
+	adapter := newBackendTip(be, time.Millisecond, time.Second)
+	tip, err := adapter.NetworkTip(context.Background())
+	require.NoError(t, err)
+	assert.Equal(t, uint32(123_456), tip)
+}
+
+func TestBackendTip_WaitForCoverageReady(t *testing.T) {
+	be := &fakeLedgerBackend{}
+	be.latest.Store(500) // already past the 400 target, so the first sample succeeds
+	adapter := newBackendTip(be, time.Millisecond, time.Second)
+	require.NoError(t, adapter.WaitForCoverage(context.Background(), 400), "tip already covers target")
+}
+
+func TestBackendTip_WaitForCoverageAdvances(t *testing.T) {
+	be := &fakeLedgerBackend{}
+	be.latest.Store(100) // starts below the 900 target
+	adapter := newBackendTip(be, time.Millisecond, 2*time.Second)
+	// Advance the tip past the target after a few polls.
+	go func() {
+		time.Sleep(20 * time.Millisecond)
+		be.latest.Store(1000)
+	}()
+	require.NoError(t, adapter.WaitForCoverage(context.Background(), 900))
+}
+
+func TestBackendTip_WaitForCoverageTimeout(t *testing.T) {
+	be := &fakeLedgerBackend{}
+	be.latest.Store(10) // never reaches the target
+	adapter := newBackendTip(be, time.Millisecond, 20*time.Millisecond)
+	err := adapter.WaitForCoverage(context.Background(), 1_000_000)
+	require.ErrorIs(t, err, ErrBackendCoverageTimeout) // matched via errors.Is, so wrapping is allowed
+}
+
+func TestBackendTip_WaitForCoverageCtxCancel(t *testing.T) {
+	be := &fakeLedgerBackend{}
+	be.latest.Store(10)
+	adapter := newBackendTip(be, 10*time.Millisecond, time.Hour)
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // canceled before the call, so the wait must not run to the hour-long deadline
+	err := adapter.WaitForCoverage(ctx, 1_000_000)
+	require.ErrorIs(t, err, context.Canceled)
+}
+
+// ---------------------------------------------------------------------------
+// notConfiguredTip — frontfill-only deployment behavior.
+// ---------------------------------------------------------------------------
+
+func TestNotConfiguredTip_ErrorsClearly(t *testing.T) {
+	_, err := notConfiguredTip{}.NetworkTip(context.Background())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "no bulk backend configured") // pins the operator-facing wording
+}
+
+// ---------------------------------------------------------------------------
+// buildProductionBoundaries — captive-core wiring is deferred to #772.
+// ---------------------------------------------------------------------------
+
+func TestBuildProductionBoundaries_CaptiveCoreDeferred(t *testing.T) {
+	cfg := Config{}.WithDefaults()
+	cfg.Streaming.CaptiveCoreConfig = "/some/core.toml" // any non-empty path: the file is never opened
+	_, err := buildProductionBoundaries(context.Background(), cfg, Paths{}, nil, silentLogger())
+	require.Error(t, err, "captive-core production wiring is deferred to #772")
+	assert.Contains(t, err.Error(), "#772")
+}
+
+func TestBuildProductionBoundaries_RequiresCaptiveCoreConfig(t *testing.T) {
+	cfg := Config{}.WithDefaults() // no captive_core_config
+	_, err := buildProductionBoundaries(context.Background(), cfg, Paths{}, nil, silentLogger())
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "captive_core_config") // the error names the missing key
+}
+
+func TestNewLogger(t *testing.T) {
+	l, err := newLogger(LoggingConfig{Level: "warn", Format: "json"}) // valid level + JSON formatter branch
+	require.NoError(t, err)
+	require.NotNil(t, l)
+
+	_, err = newLogger(LoggingConfig{Level: "bogus", Format: "text"}) // unparseable level must error
+	require.Error(t, err)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
index ec278846c..06a0a7f05 100644
--- a/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/doc.go
@@ -4,9 +4,9 @@
 // (fullhistory/pkg/...). It is built ON that layer — the catalog WRAPS
 // metastore.Store rather than reinventing a RocksDB wrapper.
 //
-// This file map covers Slice 1 · Layers 1–3 (foundations + storage +
-// orchestration). Daemon assembly stacks on top in Layer 4 (see "Later layers"
-// below).
+// This file map covers all of Slice 1 (Layers 1–4) — the assembled,
+// ledgers-only daemon. Slices 2 and 3 then weave in the events and tx-hash data
+// types (see "Later slices" below).
 //
 // # Data model (keys-first)
 //
@@ -33,9 +33,10 @@
 //	                 the catalog (a metastore.Store wrapper), the one-write
 //	                 protocol (mark "freezing" → fsync file+dirent → flip
 //	                 "frozen"), and the key-driven sweep (the only deletion body).
-//	Config         config.go, config_lock.go
-//	                 the TOML schema/loader/defaults and the single-process flock
-//	                 over the catalog + storage roots.
+//	Config         config.go, config_lock.go, config_validate.go
+//	                 the TOML schema/loader/defaults, the single-process flock,
+//	                 and validateConfig (the network-dependent earliest-ledger
+//	                 resolution + the two-pin first-start commit).
 //	Cross-cutting  artifacts.go
 //	                 the ArtifactSet/Kind abstraction the later layers subset.
 //	Storage        process.go, hotsource.go
@@ -53,15 +54,21 @@
 //	                 derived progress (the resume point), the lifecycle tick
 //	                 (plan → discard → prune), and retention-floor arithmetic +
 //	                 the reader-retention gate.
+//	Daemon         startup.go, daemon.go
+//	                 startStreaming (catalog → validate → catch-up → serve+ingest
+//	                 handoff) and the daemon/CLI wiring.
+//	Operability    recovery.go, audit.go, audit_invariants.go
+//	                 surgical recovery (atomic key-demotion), the audit command,
+//	                 and the INV-1..4 invariant walks.
 //	Observability  observability.go
 //	                 the metrics sink interface and the signals it emits.
 //	Test seam      hooks.go
 //	                 test-only crash-injection points fired from inside the real
 //	                 protocol/sweep methods (every field nil in production).
 //
-// # Later layers
+// # Later slices
 //
-// Layer 4 adds startStreaming, validateConfig, surgical recovery, and the audit
-// command (daemon assembly). Slices 2 and 3 then weave in the events and
-// tx-hash data types.
+// Slice 2 weaves in the events data type (a second per-chunk artifact) and
+// Slice 3 the tx-hash data type with its per-window rolling index — both
+// additive on this ledgers-only skeleton.
 package streaming
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/e2e_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/e2e_test.go
new file mode 100644
index 000000000..322cf3fd0
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/e2e_test.go
@@ -0,0 +1,436 @@
+package streaming
+
+// =============================================================================
+// Issue 19 — in-process end-to-end integration of the streaming daemon
+// (ledgers-only slice).
+//
+// WHAT IS REAL HERE
+//   Everything inside the process is the real production code path:
+//     - RunDaemonWith (the true daemon entrypoint): TOML load + form-validate,
+//       per-root flock, meta-store open + Catalog bind, the stateful
+//       validateConfig gate (pins the immutable layout + resolves the floor),
+//       and the supervised startStreaming loop.
+//     - startStreaming → catchUp → openHotTierForChunk → runIngestionLoop (the
+//       real atomic per-ledger WriteBatch over the real per-chunk hotchunk
+//       RocksDB), the real boundary handoff, the real doorbell.
+//     - lifecycleLoop / runLifecycleTick: the real resolve + executePlan freeze
+//       (the ledger cold artifact derived FROM the live hot DB via processChunk's
+//       hot branch), the real discard + prune scans.
+//     - Catalog.Audit (INV-2..4) over the real durable keys + files.
+//
+// WHAT IS FAKED (and why that is the right boundary)
+//   Only the two EXTERNAL boundaries the daemon injects on purpose:
+//     - The ledger SOURCE (CoreStreamOpener / NetworkTipBackend), fed
+//       SYNTHETIC-BUT-WELL-FORMED zero-tx LedgerCloseMeta. No captive core, no
+//       object store, no network.
+//     - ServeReads is a no-op recorder (#772).
+//
+// This in-process test is a LIFECYCLE + STORAGE-STATE test: it drives the whole
+// freeze→discard→restart-resume→prune sequence and audits the result. It does
+// not exercise a read PATH (the tx-hash lookups were removed with the tx-hash
+// subsystem in this slice).
+// =============================================================================
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+	"github.com/stellar/go-stellar-sdk/xdr"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// e2eGetter is the FAKE captive-core ledger getter: a LedgerGetter the ingestion
+// loop polls by sequence. GetLedger returns the frame stored for the requested
+// seq; for a seq with no frame (past the synthetic backlog) it blocks until ctx
+// is canceled, standing in for a live tip that only ends on shutdown. It records
+// the FIRST seq it was asked for so the restart step can assert the daemon
+// re-derived the watermark and resumed with no gap, and the most recent seq it
+// yielded so the test can tell how far ingestion has progressed.
+type e2eGetter struct {
+	frames    map[uint32][]byte // seq → raw frame bytes served by GetLedger
+	maxSeq    uint32            // highest seq in frames; set by OpenCore, not read by GetLedger
+	fromSeen  *atomic.Uint32    // first GetLedger seq (for the restart assertion)
+	delivered *atomic.Uint32    // most recent seq yielded — a plain Store, not a running max (test sync)
+	sawFrom   atomic.Bool       // flipped by the first GetLedger call so fromSeen is written once
+}
+
+type e2eFrame struct {
+	seq uint32 // ledger sequence the frame is indexed under in OpenCore
+	raw []byte // bytes GetLedger hands back as an xdr.LedgerCloseMetaView
+}
+
+var _ LedgerGetter = (*e2eGetter)(nil) // compile-time check: *e2eGetter satisfies LedgerGetter
+
+func (s *e2eGetter) GetLedger(ctx context.Context, seq uint32) (xdr.LedgerCloseMetaView, error) {
+	if s.sawFrom.CompareAndSwap(false, true) { // true only for this getter's first poll
+		s.fromSeen.Store(seq)
+	}
+	if err := ctx.Err(); err != nil {
+		return nil, err
+	}
+	raw, known := s.frames[seq]
+	if !known {
+		// Past the synthetic backlog: block until shutdown rather than return a crash-like error.
+		<-ctx.Done()
+		return nil, ctx.Err()
+	}
+	s.delivered.Store(seq)
+	return xdr.LedgerCloseMetaView(raw), nil
+}
+
+// e2eCore is the CoreOpener handing back a fresh e2eGetter per daemon run (a
+// restart opens core anew). It records the resume ledger of the latest open and
+// counts opens; fromSeen/delivered are shared with every getter it hands out.
+type e2eCore struct {
+	frames     []e2eFrame    // synthetic backlog, re-indexed by seq on every OpenCore
+	resumeSeen atomic.Uint32 // resume argument of the most recent OpenCore
+	fromSeen   atomic.Uint32 // written by each getter on its first poll: the seq asked for
+	delivered  atomic.Uint32 // written by getters: most recent seq yielded
+	opens      atomic.Int32  // number of OpenCore calls
+}
+
+func (c *e2eCore) OpenCore(_ context.Context, resume uint32) (LedgerGetter, func() error, error) {
+	c.opens.Add(1)
+	c.resumeSeen.Store(resume)
+	var highest uint32
+	index := make(map[uint32][]byte, len(c.frames)) // seq → raw, rebuilt for every open
+	for _, fr := range c.frames {
+		index[fr.seq] = fr.raw
+		if highest < fr.seq {
+			highest = fr.seq
+		}
+	}
+	fresh := &e2eGetter{frames: index, maxSeq: highest, fromSeen: &c.fromSeen, delivered: &c.delivered}
+	return fresh, func() error { return nil }, nil
+}
+
+// e2eConfigPath writes a daemon TOML for an in-process E2E: genesis floor (no
+// tip needed to validate/start) and the given retention width.
+// captive_core_config is a stub path the test's BuildBoundaries replaces with a
+// fake stream, never opening a real core.
+func e2eConfigPath(t *testing.T, dataDir string, retentionChunks uint32) string {
+	t.Helper()
+	cfgPath := filepath.Join(t.TempDir(), "daemon.toml")
+	body := fmt.Sprintf(`
+[service]
+default_data_dir = %q
+
+[streaming]
+earliest_ledger = "genesis"
+captive_core_config = "/dev/null"
+retention_chunks = %d
+
+[logging]
+level = "error"
+format = "text"
+`, dataDir, retentionChunks)
+	require.NoError(t, os.WriteFile(cfgPath, []byte(body), 0o644))
+	return cfgPath
+}
+
+// runDaemonInBackground starts RunDaemonWith on a cancellable ctx and returns a
+// cancel func, a channel carrying its (clean-shutdown) return, and a channel
+// delivering the daemon's OWN bound *Catalog (captured from BuildBoundaries). The
+// metastore is RocksDB-primary (exclusive LOCK), so a test cannot open a second
+// handle while the daemon runs; it reads through the daemon's catalog instead.
+// The ctx is also canceled via t.Cleanup so a failed assertion cannot leak it.
+//
+//nolint:nonamedreturns // named outputs label the (cancel, done, catalog) handles
+func runDaemonInBackground(
+	t *testing.T, cfgPath string, core *e2eCore, served *atomic.Int32, metrics Metrics,
+) (cancel context.CancelFunc, done <-chan error, catCh <-chan *Catalog) {
+	t.Helper()
+	ctx, cancelFn := context.WithCancel(context.Background())
+	t.Cleanup(cancelFn) // CancelFunc is idempotent: a no-op when the test already canceled
+	errCh := make(chan error, 1)
+	catChan := make(chan *Catalog, 1)
+	build := func(_ context.Context, _ Config, _ Paths, cat *Catalog, _ *supportlog.Entry) (Boundaries, error) {
+		select {
+		case catChan <- cat: // hand the daemon's bound catalog to the test
+		default: // buffer already full (a supervised rebuild): never block the daemon
+		}
+		return Boundaries{
+			NetworkTip: &fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq + 5}},
+			Core:       core,
+			ServeReads: func(context.Context) error { served.Add(1); return nil },
+		}, nil
+	}
+	opts := DaemonOptions{
+		BuildBoundaries: build,
+		Logger:          silentLogger(),
+		Metrics:         metrics,
+		RestartBackoff:  10 * time.Millisecond,
+	}
+	go func() { errCh <- RunDaemonWith(ctx, cfgPath, opts) }() // errCh is buffered: the send never blocks
+	return cancelFn, errCh, catChan
+}
+
+// awaitCatalog returns the catalog delivered on catCh, failing the test after 10s.
+func awaitCatalog(t *testing.T, catCh <-chan *Catalog) *Catalog {
+	t.Helper()
+	select {
+	case <-time.After(10 * time.Second):
+		t.Fatal("daemon did not bind a catalog")
+	case bound := <-catCh:
+		return bound
+	}
+	return nil // only here to satisfy the compiler: t.Fatal does not return
+}
+
+// waitClean cancels the daemon and requires a clean (nil) shutdown.
+func waitClean(t *testing.T, cancel context.CancelFunc, done <-chan error) {
+	t.Helper()
+	cancel()
+	select {
+	case err := <-done:
+		require.NoError(t, err, "ctx cancel is a clean daemon shutdown")
+	case <-time.After(60 * time.Second):
+		// Post-cancel shutdown joins one in-flight lifecycle unit; a mid-flight
+		// freeze's Finalize fsync is unpreemptible and slow under -race +
+		// contention — the same reason the boundary-cross budget is 600s.
+		t.Fatal("daemon did not shut down cleanly after ctx cancel")
+	}
+}
+
+// ============================================================================
+// The end-to-end walk.
+// ============================================================================
+
+// TestE2E_DaemonLifecycle_FirstStartIngestFreezeRestartPrune drives the whole
+// daemon lifecycle in one process against the real stores and the fake ledger
+// source:
+//
+//	first start (genesis, young-network tip ⇒ direct ingest) →
+//	ingest TWO full chunks + cross into chunk 2 (real boundary handoffs) →
+//	lifecycle ticks freeze chunks 0+1's ledger artifacts + discard their hot tiers →
+//	clean shutdown →
+//	RESTART: re-derive the watermark, resume at exactly watermark+1 (no gap) →
+//	re-run with retention_chunks = 1 to prune chunk 0; confirm its key/file go →
+//	finish with Catalog.Audit → Clean.
+//
+// Correctness is asserted at every step.
+//
+//nolint:funlen // full lifecycle E2E with assertions at every step
+func TestE2E_DaemonLifecycle_FirstStartIngestFreezeRestartPrune(t *testing.T) {
+	if testing.Short() {
+		t.Skip("e2e ingests a full 10k-ledger chunk; skipped in -short")
+	}
+
+	dataDir := t.TempDir()
+
+	const c0 = chunk.ID(0)
+	const c1 = chunk.ID(1)
+	const c2 = chunk.ID(2)
+
+	// --- Synthetic ledgers. We cross TWO chunk boundaries so chunks 0 AND 1 both
+	// freeze (completeThrough reaches chunk 1's last ledger), leaving chunk 2 as
+	// the live (un-frozen) chunk. That layout lets a later retention_chunks=1 run
+	// prune chunk 0 (wholly below the floor) while chunk 1 survives. Every ledger
+	// is zero-tx for speed.
+	c0First := c0.FirstLedger()
+	c2First := c2.FirstLedger()
+
+	frames := make([]e2eFrame, 0, 2*int(chunk.LedgersPerChunk)+2)
+	appendLedger := func(seq uint32) {
+		frames = append(frames, e2eFrame{seq: seq, raw: zeroTxLCMBytes(t, seq)})
+	}
+	// Chunks 0 and 1 in full (both freeze), then chunk 2's first two ledgers (the
+	// live chunk; boundary 1→2 fired, chunk 2 opened, its first ledger committed).
+	for seq := c0First; seq <= c1.LastLedger(); seq++ {
+		appendLedger(seq)
+	}
+	appendLedger(c2First)
+	appendLedger(c2First + 1)
+
+	core := &e2eCore{frames: frames}
+	var served atomic.Int32
+	metrics := newRecordingMetrics()
+
+	// =====================================================================
+	// STEP 1 — first start: config → lock → validate (pin genesis) → start →
+	// direct ingest across the chunk-0 AND chunk-1 boundaries, with the lifecycle
+	// freezing and discarding each just-closed chunk off the doorbell.
+	// =====================================================================
+	cfgPath := e2eConfigPath(t, dataDir, 0) // retention 0 (full history) for now
+	cancel, done, catCh := runDaemonInBackground(t, cfgPath, core, &served, metrics)
+
+	cat := awaitCatalog(t, catCh)
+
+	// First wait until ingestion crosses BOTH boundaries and commits into chunk 2
+	// (the new live chunk). Delivering c2First proves both boundary handoffs fired
+	// (chunks 0 and 1 closed, chunk 2 opened).
+	require.Eventually(t, func() bool {
+		return core.delivered.Load() >= c2First
+	}, 600*time.Second, 200*time.Millisecond, "ingestion must cross both boundaries into chunk 2")
+
+	// The boundary doorbells have rung. A lifecycle tick freezes each just-closed
+	// chunk's cold ledger artifact (from its closed hot DB), then discards its hot
+	// tier. The durable completion signal per chunk: the ledgers key is FROZEN AND
+	// the chunk's hot key is gone (discarded).
+	require.Eventually(t, func() bool {
+		for _, c := range []chunk.ID{c0, c1} {
+			st, err := cat.State(c, KindLedgers)
+			if err != nil || st != StateFrozen {
+				return false
+			}
+			has, err := cat.Has(hotChunkKey(c))
+			if err != nil || has {
+				return false
+			}
+		}
+		return true
+	}, 60*time.Second, 50*time.Millisecond, "the boundary ticks must freeze+discard chunks 0 and 1")
+
+	require.GreaterOrEqual(t, served.Load(), int32(1), "reads were served")
+	require.Equal(t, c0First, core.resumeSeen.Load(),
+		"first start resumes captive core at genesis (watermark+1)")
+
+	// --- Correctness: chunks 0 and 1 ledger cold artifacts froze and exist on disk. ---
+	for _, c := range []chunk.ID{c0, c1} {
+		st, err := cat.State(c, KindLedgers)
+		require.NoError(t, err)
+		assert.Equal(t, StateFrozen, st, "chunk %s ledgers is frozen", c)
+		require.FileExists(t, cat.layout.LedgerPackPath(c), "chunk %s pack exists on disk", c)
+	}
+
+	// Observability: the daemon emitted the boundary + freeze phase signals (the
+	// control-plane health gauges).
+	assert.GreaterOrEqual(t, len(metrics.snapshotBoundaries()), 1, "at least one chunk boundary was signaled")
+	assert.GreaterOrEqual(t, metrics.snapshotFreezeCount(), 1, "at least one freeze stage ran")
+
+	// =====================================================================
+	// STEP 2 — clean shutdown. The supervised loop returns nil on ctx cancel.
+	// =====================================================================
+	waitClean(t, cancel, done)
+
+	// The daemon's catalog rode its now-closed metastore handle; bind a fresh
+	// inspection catalog on the (now lock-free) data dir for the post-shutdown
+	// reads. It MUST be closed before the restart reopens the metastore.
+	postCat, closePost := e2eReadCatalog(t, dataDir)
+
+	// The durable watermark, re-derived from the post-shutdown state (the basis
+	// for the restart's resume-with-no-gap assertion).
+	wmBeforeRestart := mustDeriveWatermark(t, postCat)
+	require.GreaterOrEqual(t, wmBeforeRestart, c2First, "watermark advanced into chunk 2")
+
+	// Chunk 2 is the un-frozen live chunk: its hot key is "ready", no cold artifacts.
+	hotState, err := postCat.HotState(c2)
+	require.NoError(t, err)
+	require.Equal(t, HotReady, hotState, "chunk 2 is the un-frozen live chunk")
+	c2lfs, err := postCat.State(c2, KindLedgers)
+	require.NoError(t, err)
+	require.Equal(t, State(""), c2lfs, "the live chunk has no cold artifacts yet")
+
+	// =====================================================================
+	// STEP 3 — RESTART. A fresh RunDaemonWith re-opens everything, re-derives the
+	// watermark from durable state, and resumes captive core at watermark+1 with
+	// no gap. (The shared e2eCore records the new resume + the getter's first polled seq.)
+	// =====================================================================
+	closePost() // release the inspection metastore handle before the daemon reopens it
+	core.opens.Store(0)
+	core.resumeSeen.Store(0)
+	core.fromSeen.Store(0)
+	cancel2, done2, _ := runDaemonInBackground(t, cfgPath, core, &served, newRecordingMetrics())
+
+	require.Eventually(t, func() bool { return core.opens.Load() >= 1 }, 30*time.Second, 20*time.Millisecond,
+		"the restarted daemon re-opened captive core")
+	require.Eventually(t, func() bool { return core.fromSeen.Load() != 0 }, 30*time.Second, 20*time.Millisecond,
+		"the restarted ingestion loop requested a resume range")
+
+	wantResume := wmBeforeRestart + 1
+	assert.Equal(t, wantResume, core.resumeSeen.Load(),
+		"restart resumes captive core at the re-derived watermark+1 (no gap, no re-fetch of the bottom)")
+	assert.Equal(t, wantResume, core.fromSeen.Load(),
+		"the ingestion loop streamed from watermark+1 — the durable frontier, re-derived not stored")
+
+	waitClean(t, cancel2, done2)
+
+	// =====================================================================
+	// STEP 4 — retention prune. Re-run the daemon with retention_chunks = 1: the
+	// effective floor anchors at chunk 1, so chunk 0 (frozen) falls WHOLLY below
+	// the floor and the prune scan sweeps its files + keys, while chunk 1 (the
+	// floor chunk) survives.
+	// =====================================================================
+	prunedCfg := e2eConfigPath(t, dataDir, 1) // retain ~1 chunk
+	// Capture chunk 0's frozen pack path BEFORE the prune so we can confirm the
+	// file itself is gone afterward. (cat's layout is path-only and stays valid
+	// even though its metastore handle closed at the Step-2 shutdown.)
+	prunedPackPath := cat.layout.LedgerPackPath(c0)
+	require.FileExists(t, prunedPackPath, "chunk 0's cold pack exists before the prune")
+
+	cancel3, done3, catCh3 := runDaemonInBackground(t, prunedCfg, core, &served, newRecordingMetrics())
+	pruneCat := awaitCatalog(t, catCh3) // the pruning daemon's own catalog
+
+	// The prune scan runs on the first lifecycle tick (the at-start doorbell ring,
+	// which is startup convergence). Poll for chunk 0's per-chunk artifact key
+	// (the frozen cold ledger) to vanish.
+	require.Eventually(t, func() bool {
+		ledgers, err := pruneCat.State(c0, KindLedgers)
+		return err == nil && ledgers == State("")
+	}, 60*time.Second, 50*time.Millisecond, "retention must prune chunk 0's artifact keys")
+
+	// Chunk 1 (the floor chunk) is WITHIN retention and survives the prune.
+	c1lfs, err := pruneCat.State(c1, KindLedgers)
+	require.NoError(t, err)
+	assert.Equal(t, StateFrozen, c1lfs, "chunk 1 is at the retention floor and survives")
+
+	// The on-disk cold pack file is gone too (prune unlinks the files, not just
+	// the keys).
+	require.Eventually(t, func() bool {
+		_, statErr := os.Stat(prunedPackPath)
+		return os.IsNotExist(statErr)
+	}, 10*time.Second, 50*time.Millisecond, "the pruned cold pack file is unlinked")
+
+	waitClean(t, cancel3, done3)
+
+	// =====================================================================
+	// STEP 5 — Catalog.Audit (INV-2..4) → Clean. The store must be at a single
+	// canonical state with no orphans/dangling/duplicates and nothing below the
+	// retention floor. RetentionChunks matches the daemon's last config so INV-4
+	// checks against the EXACT floor it enforced.
+	// =====================================================================
+	auditCat, closeAudit := e2eReadCatalog(t, dataDir)
+	defer closeAudit()
+	report, err := auditCat.Audit(AuditOptions{RetentionChunks: 1})
+	require.NoError(t, err, "audit completes (error only for I/O)")
+	require.True(t, report.Clean(),
+		"after the full lifecycle the store satisfies INV-2..4; violations:\n%s", violationsString(report))
+}
+
+// ============================================================================
+// helpers
+// ============================================================================
+
+// e2eReadCatalog opens a SEPARATE metastore handle on the daemon's data dir and
+// wraps it in a Catalog for inspection BETWEEN daemon runs. The store takes an
+// exclusive LOCK, so call the returned func before the next daemon run reopens it.
+func e2eReadCatalog(t *testing.T, dataDir string) (*Catalog, func()) {
+	t.Helper()
+	resolved := Config{Service: ServiceConfig{DefaultDataDir: dataDir}}.WithDefaults().ResolvePaths()
+	meta, openErr := openMetaAt(t, resolved.Catalog)
+	require.NoError(t, openErr)
+	release := func() { _ = meta.Close() } // the Close error is discarded, as before
+	return NewCatalog(meta, NewLayoutFromPaths(resolved)), release
+}
+
+// mustDeriveWatermark runs deriveWatermark with a RocksHotProbe; an error fails the test.
+func mustDeriveWatermark(t *testing.T, cat *Catalog) uint32 {
+	t.Helper()
+	watermark, deriveErr := deriveWatermark(cat, NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger()))
+	require.NoError(t, deriveErr)
+	return watermark
+}
+
+// The E2E reuses observability_test.go's recordingMetrics (a full Metrics sink)
+// and its snapshotBoundaries; snapshotFreezeCount (added there) reports the
+// number of freeze-stage signals.
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/observability_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/observability_test.go
new file mode 100644
index 000000000..1897f43c6
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/observability_test.go
@@ -0,0 +1,596 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/sirupsen/logrus"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/hotchunk"
+)
+
+// findLog scans entries in order and returns the first whose Message is exactly msg; no match fails the test.
+func findLog(t *testing.T, entries []logrus.Entry, msg string) logrus.Entry {
+	t.Helper()
+	for i := range entries {
+		if entries[i].Message == msg {
+			return entries[i]
+		}
+	}
+	t.Fatalf("no log entry with message %q; got %d entries", msg, len(entries))
+	return logrus.Entry{}
+}
+
+// recordingMetrics is a Metrics sink that records every signal so a test can
+// assert the daemon drove the expected phase signals at the right points. Every
+// method takes mu, so it is safe for concurrent use (the ingestion loop,
+// lifecycle goroutine, and worker pool all report concurrently).
+type recordingMetrics struct {
+	mu sync.Mutex // taken by every method on this type
+
+	// last-write gauges: each setter overwrites the previous value
+	lagTip, lagCommitted     uint32
+	lastCommitted            uint32
+	wmCommitted, wmFloor     uint32
+	catchupDone, catchupGoal uint32
+	liveHot                  int
+	coldBytes                int64
+	gaugesSet                map[string]int // how many times each gauge was set
+
+	// counters / per-call records: one element appended per call, in call order
+	boundaries  []uint32
+	catchupPass []passRec
+	freeze      []freezeRec
+	discard     []countDur
+	prune       []countDur
+	recovery    []recoveryRec
+}
+
+type passRec struct { // arguments of one CatchupPass call
+	lo, hi uint32
+	d      time.Duration
+}
+type freezeRec struct { // arguments of one Freeze call
+	chunkBuilds int
+	d           time.Duration
+}
+type countDur struct { // arguments of one Discard or Prune call
+	count int
+	d     time.Duration
+}
+type recoveryRec struct { // arguments of one Recovery call
+	cold, hot int
+	d         time.Duration
+}
+
+func newRecordingMetrics() *recordingMetrics {
+	return &recordingMetrics{gaugesSet: make(map[string]int)} // the map must exist before any gauge setter writes to it
+}
+
+func (r *recordingMetrics) IngestionLag(tip, committed uint32) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["lag"]++ // tally the refresh; the values below are last-write-wins
+	r.lagCommitted, r.lagTip = committed, tip
+}
+
+func (r *recordingMetrics) LastCommitted(seq uint32) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["last_committed"]++ // tally the refresh; the value below is last-write-wins
+	r.lastCommitted = seq
+}
+
+func (r *recordingMetrics) Watermark(committed, floor uint32) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["watermark"]++ // tally the refresh; the values below are last-write-wins
+	r.wmFloor, r.wmCommitted = floor, committed
+}
+
+func (r *recordingMetrics) CatchupProgress(done, goal uint32) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["catchup_progress"]++ // tally the refresh; the values below are last-write-wins
+	r.catchupGoal, r.catchupDone = goal, done
+}
+
+func (r *recordingMetrics) LiveHotChunks(n int) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["live_hot"]++ // tally the refresh; the value below is last-write-wins
+	r.liveHot = n
+}
+
+func (r *recordingMetrics) ColdTierBytes(b int64) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.gaugesSet["cold_bytes"]++ // tally the refresh; the value below is last-write-wins
+	r.coldBytes = b
+}
+
+func (r *recordingMetrics) ChunkBoundary(closed uint32) {
+	r.mu.Lock()
+	r.boundaries = append(r.boundaries, closed) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) CatchupPass(lo, hi uint32, d time.Duration) {
+	r.mu.Lock()
+	r.catchupPass = append(r.catchupPass, passRec{lo: lo, hi: hi, d: d}) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) Freeze(chunkBuilds int, d time.Duration) {
+	r.mu.Lock()
+	r.freeze = append(r.freeze, freezeRec{chunkBuilds: chunkBuilds, d: d}) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) Discard(count int, d time.Duration) {
+	r.mu.Lock()
+	r.discard = append(r.discard, countDur{count: count, d: d}) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) Prune(count int, d time.Duration) {
+	r.mu.Lock()
+	r.prune = append(r.prune, countDur{count: count, d: d}) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) Recovery(cold, hot int, d time.Duration) {
+	r.mu.Lock()
+	r.recovery = append(r.recovery, recoveryRec{cold: cold, hot: hot, d: d}) // kept in call order
+	r.mu.Unlock()
+}
+
+func (r *recordingMetrics) snapshotBoundaries() []uint32 {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	snap := make([]uint32, len(r.boundaries)) // a detached copy; non-nil even when empty
+	copy(snap, r.boundaries)
+	return snap
+}
+
+// snapshotFreezeCount returns the number of Freeze calls recorded so far; the
+// end-to-end daemon test uses it to assert that the freeze stage ran.
+func (r *recordingMetrics) snapshotFreezeCount() int {
+	r.mu.Lock()
+	count := len(r.freeze)
+	r.mu.Unlock()
+	return count
+}
+
+func (r *recordingMetrics) snapshotLastCommitted() (uint32, int) {
+	r.mu.Lock()
+	seq, sets := r.lastCommitted, r.gaugesSet["last_committed"] // latest value, times set
+	r.mu.Unlock()
+	return seq, sets
+}
+
+func (r *recordingMetrics) snapshotLag() (uint32, uint32, int) {
+	r.mu.Lock()
+	tip, committed, sets := r.lagTip, r.lagCommitted, r.gaugesSet["lag"] // latest pair, times set
+	r.mu.Unlock()
+	return tip, committed, sets
+}
+
+var _ Metrics = (*recordingMetrics)(nil) // compile-time check: *recordingMetrics satisfies Metrics
+
+// ---------------------------------------------------------------------------
+// nopMetrics / metricsOrNop
+// ---------------------------------------------------------------------------
+
+// A nil Metrics resolves to a no-op that never panics on any signal — the
+// safety net every phase relies on (WithDefaults fills the daemon path; a
+// primitive driven directly may not have).
+func TestMetricsOrNop_NilNeverPanics(t *testing.T) {
+	m := metricsOrNop(nil)
+	require.NotNil(t, m)
+	m.IngestionLag(10, 5)
+	m.LastCommitted(5)
+	m.Watermark(5, 2)
+	m.CatchupProgress(1, 9)
+	m.LiveHotChunks(3)
+	m.ColdTierBytes(1024)
+	m.ChunkBoundary(0)
+	m.CatchupPass(0, 4, time.Second)
+	m.Freeze(2, time.Second)
+	m.Discard(1, time.Second)
+	m.Prune(2, time.Second)
+	m.Recovery(1, 1, time.Second)
+}
+
+// ---------------------------------------------------------------------------
+// Ingestion loop — ChunkBoundary signal at each handoff.
+// ---------------------------------------------------------------------------
+
+// One boundary crossing must produce exactly one ChunkBoundary signal, and that
+// signal must carry the JUST-CLOSED chunk, never the chunk that opens next. The
+// watermark is seeded one ledger short of chunk 0's end, so the indexed poll
+// resumes there, crosses 0->1 on its first ledger, ingests one interior ledger of
+// chunk 1 (no boundary), and then the poll errs.
+//
+// NOTE (pull seam): the push-model predecessor of this test covered TWO
+// consecutive handoffs ([]uint32{0,1}), which also pinned the ORDER of multiple
+// boundaries. That was cheap only because the stream could SKIP from chunk 0's
+// last ledger straight to chunk 1's last ledger. The indexed-poll loop
+// (for seq := resume; ; seq++) cannot skip: the second real boundary is 10,000
+// ledgers away, so two-handoff ordering needs a full chunk ingested (~85s),
+// which alone pushes the package past the fixed 600s `go test` timeout the gate
+// runs under. What is kept here are the per-handoff properties — exactly one
+// boundary, naming the just-closed (not the next) chunk, and the last-committed
+// gauge set once per ingested ledger. Multi-boundary ordering is deliberately
+// NOT asserted in this test; the full-chunk E2E daemon test is the place where
+// ingestion really crosses two boundaries.
+func TestRunIngestionLoop_ReportsChunkBoundaries(t *testing.T) {
+	cat, _ := testCatalog(t)
+	closed := chunk.ID(0)
+	opened := closed + 1
+	hotDB := seedWatermark(t, cat, closed, closed.LastLedger()-1)
+
+	// Two frames: chunk 0's final ledger (crossing 0->1) and chunk 1's first
+	// ledger (interior, no boundary); after those the poll errs.
+	boundarySeq, lastSeq := closed.LastLedger(), opened.FirstLedger()
+	src := &fakeLedgerGetter{endErr: errors.New("end"), frames: map[uint32][]byte{
+		boundarySeq: zeroTxLCMBytes(t, boundarySeq), // boundary 0->1
+		lastSeq:     zeroTxLCMBytes(t, lastSeq),     // no boundary
+	}}
+	sink := newRecordingMetrics()
+	doorbell := make(chan chunk.ID, lifecycleQueueDepth)
+	kinds := hotchunk.Ingest{Ledgers: true}
+
+	loopDone := make(chan error, 1)
+	go func() {
+		loopDone <- runIngestionLoop(context.Background(), src, hotDB, cat, doorbell, kinds, silentLogger(), sink)
+	}()
+
+	select {
+	case <-loopDone: // the poll ran dry and errored; the boundary already fired
+	case <-time.After(10 * time.Second):
+		t.Fatal("ingestion loop did not finish")
+	}
+
+	// A single recorded boundary whose value is the closed chunk, NOT the chunk
+	// that was just opened — the "names the closed chunk" half of the property.
+	assert.Equal(t, []uint32{uint32(closed)}, sink.snapshotBoundaries(),
+		"one boundary at the handoff, naming the just-closed chunk")
+
+	// Per-ledger liveness gauge: it is expected to be refreshed for every ledger
+	// the loop commits, making it the moving health signal between boundaries.
+	// Here that means: last value == the final ledger ingested, and exactly two
+	// refreshes for the two-ledger run.
+	gotSeq, refreshes := sink.snapshotLastCommitted()
+	assert.Equal(t, lastSeq, gotSeq, "last-committed gauge tracks the highest synced ledger")
+	assert.Equal(t, 2, refreshes, "last-committed refreshed once per ledger")
+
+	// The ingestion loop has no network tip, so IngestionLag must stay untouched —
+	// per the corrected contract it is a backfill-only signal. This assertion
+	// guards against re-introducing the stale steady-state lag an older doc
+	// comment wrongly promised the loop would refresh.
+	_, _, lagRefreshes := sink.snapshotLag()
+	assert.Zero(t, lagRefreshes, "ingestion loop must not touch IngestionLag (backfill-only signal)")
+}
+
+// ---------------------------------------------------------------------------
+// Structured logging — keys, values, and level at the phase log points.
+// ---------------------------------------------------------------------------
+
+// The ingestion loop's chunk-boundary log line must carry closed_chunk,
+// next_chunk and last_ledger at Info level. The metrics tests cannot see a
+// dropped field, a mislabeled key, or a wrong level, so they are pinned here.
+func TestRunIngestionLoop_BoundaryLogFields(t *testing.T) {
+	cat, _ := testCatalog(t)
+	closed := chunk.ID(0)
+	opened := closed + 1
+	boundary := closed.LastLedger()
+	// Watermark one ledger short of the boundary: a single poll step crosses it.
+	db := seedWatermark(t, cat, closed, boundary-1)
+
+	getter := &fakeLedgerGetter{frames: map[uint32][]byte{
+		boundary:             zeroTxLCMBytes(t, boundary),             // boundary 0->1
+		opened.FirstLedger(): zeroTxLCMBytes(t, opened.FirstLedger()), // no boundary
+	}, endErr: errors.New("end")}
+	logger := silentLogger()
+	stop := logger.StartTest(logrus.DebugLevel)
+
+	handoff := make(chan chunk.ID, lifecycleQueueDepth)
+	finished := make(chan error, 1)
+	go func() {
+		finished <- runIngestionLoop(context.Background(), getter, db, cat, handoff,
+			hotchunk.Ingest{Ledgers: true}, logger, newRecordingMetrics())
+	}()
+	select {
+	case <-finished:
+	case <-time.After(10 * time.Second):
+		t.Fatal("ingestion loop did not finish")
+	}
+	entries := stop()
+
+	line := findLog(t, entries, "streaming: ingestion chunk boundary — handed off to lifecycle")
+	assert.Equal(t, logrus.InfoLevel, line.Level, "boundary handoff is an Info-level event")
+	assert.Equal(t, closed.String(), line.Data["closed_chunk"], "closed_chunk names the just-filled chunk")
+	assert.Equal(t, opened.String(), line.Data["next_chunk"], "next_chunk names the newly-opened chunk")
+	assert.Equal(t, boundary, line.Data["last_ledger"], "last_ledger is the boundary ledger")
+}
+
+// A healthy lifecycle tick logs the Debug-level derived snapshot (through/floor)
+// and the Info-level freeze-stage summary (chunk_builds). Keys and levels are
+// checked together so a renamed key or a changed level fails here.
+func TestRunLifecycleTick_LogFields(t *testing.T) {
+	// Heavy (ingests a full chunk) but fully isolated — own TempDir/catalog and
+	// own logger — so it runs in parallel to fit the gate's go-test timeout.
+	t.Parallel()
+	cat, _ := testCatalog(t)
+	cfg, _ := lifecycleTestConfig(t, cat, 0)
+	cfg.Metrics = newRecordingMetrics()
+
+	ingestFullHotChunk(t, cat, 0)
+	liveDB := openLiveHotDB(t, cat, 1)
+	t.Cleanup(func() { _ = liveDB.Close() })
+
+	capture := supportlog.New()
+	capture.SetLevel(logrus.DebugLevel)
+	cfg.Logger = capture
+	stopCapture := capture.StartTest(logrus.DebugLevel)
+
+	runTickForCatalog(context.Background(), t, cfg, cat)
+	logged := stopCapture()
+
+	snapshot := findLog(t, logged, "streaming: lifecycle tick — derived snapshot")
+	assert.Equal(t, logrus.DebugLevel, snapshot.Level, "the per-tick snapshot is Debug (high-frequency)")
+	for _, key := range []string{"through", "floor"} {
+		assert.Contains(t, snapshot.Data, key)
+	}
+
+	freezeLine := findLog(t, logged, "streaming: lifecycle freeze stage complete")
+	assert.Equal(t, logrus.InfoLevel, freezeLine.Level, "a non-empty freeze is Info")
+	assert.Positive(t, freezeLine.Data["chunk_builds"], "chunk 0 was built")
+}
+
+// ---------------------------------------------------------------------------
+// Lifecycle tick — Freeze / Discard / Prune + gauges.
+// ---------------------------------------------------------------------------
+
+// One tick over a just-closed chunk must report all three stages — freeze (with
+// a non-zero build count), discard (count 1) and prune — and must set the
+// watermark, live-hot-chunk and cold-bytes gauges.
+func TestRunLifecycleTick_ReportsPhaseSignals(t *testing.T) {
+	// Heavy (ingests a full chunk) but isolated to its own TempDir/catalog, so it
+	// runs in parallel with the other heavy tests to fit the go-test timeout.
+	t.Parallel()
+	cat, _ := testCatalog(t) // one-chunk window finalizes immediately
+	cfg, abort := lifecycleTestConfig(t, cat, 0)
+	sink := newRecordingMetrics()
+	cfg.Metrics = sink
+
+	// Entry state: chunk 0 just closed (full hot DB on disk); chunk 1 is live.
+	ingestFullHotChunk(t, cat, 0)
+	liveDB := openLiveHotDB(t, cat, 1)
+	t.Cleanup(func() { _ = liveDB.Close() })
+
+	runTickForCatalog(context.Background(), t, cfg, cat)
+	require.False(t, abort.fired(), "a healthy tick never aborts: %v", abort.last.Load())
+
+	// Freeze: exactly one report, and its plan actually built chunk 0.
+	require.Len(t, sink.freeze, 1, "freeze stage reported once")
+	assert.Positive(t, sink.freeze[0].chunkBuilds, "chunk 0 was built")
+
+	// Discard: exactly one report, retiring chunk 0's hot DB.
+	require.Len(t, sink.discard, 1, "discard stage reported once")
+	assert.Equal(t, 1, sink.discard[0].count, "chunk 0's hot DB was discarded")
+
+	// Prune: only the report itself is asserted; its op count may be zero.
+	require.Len(t, sink.prune, 1, "prune stage reported once")
+
+	// Gauges: the watermark and cold-bytes gauges were set, the cold footprint is
+	// non-zero, and only live chunk 1 remains hot once chunk 0 is discarded.
+	assert.Positive(t, sink.gaugesSet["watermark"], "watermark gauge set")
+	assert.Equal(t, 1, sink.liveHot, "only the live chunk remains after discard")
+	assert.Positive(t, sink.gaugesSet["cold_bytes"], "cold footprint gauge set")
+	assert.Positive(t, sink.coldBytes, "chunk 0's frozen artifacts have non-zero size")
+}
+
+// Even a tick with no work — nothing to build, no hot DB to discard, nothing to
+// prune — reports the freeze/discard/prune stages, keeping the empty-tick rate
+// observable. Chunk 0 is seeded as already frozen with no hot key, so the tick
+// over [0,0] is expected to build nothing and discard nothing.
+func TestRunLifecycleTick_EmptyTickStillReportsStages(t *testing.T) {
+	cat, _ := testCatalog(t)
+	cfg, _ := lifecycleTestConfig(t, cat, 0)
+	sink := newRecordingMetrics()
+	cfg.Metrics = sink
+
+	freezeKinds(t, cat, 0, KindLedgers)
+
+	// Tick on chunk 0 as the just-completed chunk. Its range [0,0] is already
+	// materialized, so every stage runs but finds nothing to do.
+	runLifecycleTick(context.Background(), cfg, cat, 0)
+
+	require.Len(t, sink.freeze, 1)
+	assert.Equal(t, 0, sink.freeze[0].chunkBuilds, "no producible range — all frozen")
+	require.Len(t, sink.discard, 1)
+	assert.Equal(t, 0, sink.discard[0].count)
+	require.Len(t, sink.prune, 1)
+	assert.Positive(t, sink.gaugesSet["watermark"], "watermark gauge set even on an empty tick")
+}
+
+// ---------------------------------------------------------------------------
+// Catch-up — CatchupPass + progress/lag gauges.
+// ---------------------------------------------------------------------------
+
+// Backfilling a multi-chunk range reports a CatchupPass over the resolved
+// [lo, hi] and updates the progress and lag gauges. It reuses the startup
+// tests' startTestConfig with a recordingPlan seam, which is intended to keep
+// real cold I/O out of the run.
+func TestBackfill_ReportsPassAndProgress(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+
+	plan := &recordingPlan{}
+	// The tip sits 5 ledgers past the end of chunk 3, so chunk 3 is the last
+	// complete chunk at tip and the pass is expected to cover chunks [0, 3].
+	rangeEnd := chunk.ID(3).LastLedger()
+	tip := &fakeTipBackend{tips: []uint32{rangeEnd + 5}}
+	start := startTestConfig(t, cat, tip, &fakeCore{}, plan)
+	sink := newRecordingMetrics()
+	start.Exec.Metrics = sink
+
+	got, err := catchUp(context.Background(), start, preGenesisLedger, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+
+	require.NotEmpty(t, sink.catchupPass, "at least one backfill pass reported")
+	pass := sink.catchupPass[0]
+	assert.Equal(t, uint32(0), pass.lo, "backfill starts at the genesis chunk")
+	assert.Equal(t, uint32(3), pass.hi, "backfills through the last complete chunk at tip")
+
+	// Both gauges were touched, and the returned watermark is the range end.
+	assert.Positive(t, sink.gaugesSet["catchup_progress"], "backfill progress gauge set")
+	assert.Positive(t, sink.gaugesSet["lag"], "ingestion lag gauge set during backfill")
+	assert.Equal(t, rangeEnd, got, "watermark advanced to the backfilled range end")
+}
+
+// ---------------------------------------------------------------------------
+// Recovery — Recovery signal with the per-tier key counts.
+// ---------------------------------------------------------------------------
+
+// RunSurgicalRecovery reports exactly one Recovery signal whose per-tier key
+// counts equal the committed plan's (one cold key and one hot key for chunk 5).
+func TestRunSurgicalRecovery_ReportsRecoveryMetric(t *testing.T) {
+	cfg := recoveryConfig(t)
+	paths := cfg.WithDefaults().ResolvePaths()
+
+	// Seed durable state, then close (RocksDB single-writer; the entrypoint reopens).
+	seedStore, err := openMetaAt(t, paths.Catalog)
+	require.NoError(t, err)
+	seedCat := NewCatalog(seedStore, NewLayout(paths.DataDir))
+	require.NoError(t, seedCat.MarkChunkFreezing(5, KindLedgers))
+	require.NoError(t, seedCat.FlipChunkFrozen(5, KindLedgers))
+	require.NoError(t, seedCat.PutHotTransient(5))
+	require.NoError(t, seedCat.FlipHotReady(5))
+	require.NoError(t, seedStore.Close())
+
+	sink := newRecordingMetrics()
+	req := RecoveryRequest{Lo: 5, Hi: 5, Tier: RecoverColdAndHot}
+	plan, err := RunSurgicalRecovery(cfg, req, silentLogger(), sink)
+	require.NoError(t, err)
+
+	require.Len(t, sink.recovery, 1, "one recovery apply reported")
+	reported := sink.recovery[0]
+	assert.Equal(t, len(plan.ColdKeys), reported.cold, "cold key count matches the plan")
+	assert.Equal(t, len(plan.HotKeys), reported.hot, "hot key count matches the plan")
+	assert.Equal(t, 1, reported.hot, "chunk 5's hot key demoted")
+	assert.Equal(t, 1, reported.cold, "chunk 5's ledger cold key demoted")
+}
+
+// ---------------------------------------------------------------------------
+// coldTierBytes — the disk-footprint helper.
+// ---------------------------------------------------------------------------
+
+// coldTierBytes returns zero for a tree that does not exist yet, and otherwise
+// the summed size of the files under the ledgers tree; hot-tree files are skipped.
+func TestColdTierBytes(t *testing.T) {
+	layout := NewLayout(t.TempDir())
+
+	// Before anything is materialized the footprint is zero, with no error.
+	got, err := coldTierBytes(layout)
+	require.NoError(t, err)
+	assert.Zero(t, got, "an un-materialized cold tier is zero bytes")
+
+	put := func(dir, name string, size int) {
+		require.NoError(t, os.MkdirAll(dir, 0o755))
+		require.NoError(t, os.WriteFile(filepath.Join(dir, name), make([]byte, size), 0o644))
+	}
+	// Two files (100 + 50 bytes) in one directory of the ledgers tree.
+	ledgerDir := filepath.Join(layout.LedgersRoot(), "00000")
+	put(ledgerDir, "x.pack", 100)
+	put(ledgerDir, "y.pack", 50)
+	// A file under the HOT tree must NOT be counted.
+	put(layout.HotRoot(), "ignored.sst", 9999)
+
+	got, err = coldTierBytes(layout)
+	require.NoError(t, err)
+	assert.Equal(t, int64(150), got, "only the cold tree is summed; the hot tree is excluded")
+}
+
+// ---------------------------------------------------------------------------
+// PrometheusMetrics — registration + signal recording into the registry.
+// ---------------------------------------------------------------------------
+
+// NewPrometheusMetrics registers on a fresh registry without panicking, and each
+// signal call is visible in the collectors when the registry is gathered.
+func TestPrometheusMetrics_RegistersAndRecords(t *testing.T) {
+	reg := prometheus.NewRegistry()
+	m := NewPrometheusMetrics(reg, "test_ns")
+
+	m.IngestionLag(100, 60) // lag 40
+	m.LastCommitted(58)
+	m.Watermark(60, 12)
+	m.CatchupProgress(40, 100)
+	m.LiveHotChunks(7)
+	m.ColdTierBytes(2048)
+	m.ChunkBoundary(3)
+	m.CatchupPass(0, 3, 250*time.Millisecond)
+	m.Freeze(2, 100*time.Millisecond)
+	m.Discard(1, 10*time.Millisecond)
+	m.Prune(2, 5*time.Millisecond)
+	m.Recovery(3, 1, time.Millisecond)
+
+	families, err := reg.Gather()
+	require.NoError(t, err)
+
+	// Flatten by family name: gauges overwrite, counters sum, histograms add sample counts.
+	scalars := map[string]float64{}
+	samples := map[string]uint64{}
+	for _, family := range families {
+		name := family.GetName()
+		for _, series := range family.GetMetric() {
+			switch {
+			case series.GetGauge() != nil:
+				scalars[name] = series.GetGauge().GetValue()
+			case series.GetCounter() != nil:
+				scalars[name] += series.GetCounter().GetValue()
+			case series.GetHistogram() != nil:
+				samples[name] += series.GetHistogram().GetSampleCount()
+			}
+		}
+	}
+
+	assert.InDelta(t, 40.0, scalars["test_ns_fullhistory_streaming_ingestion_lag_ledgers"], 0)
+	assert.InDelta(t, 58.0, scalars["test_ns_fullhistory_streaming_last_committed_ledger"], 0)
+	assert.InDelta(t, 60.0, scalars["test_ns_fullhistory_streaming_watermark_ledger"], 0)
+	assert.InDelta(t, 12.0, scalars["test_ns_fullhistory_streaming_retention_floor_ledger"], 0)
+	assert.InDelta(t, 100.0, scalars["test_ns_fullhistory_streaming_catchup_target_ledger"], 0)
+	assert.InDelta(t, 7.0, scalars["test_ns_fullhistory_streaming_live_hot_chunks"], 0)
+	assert.InDelta(t, 2048.0, scalars["test_ns_fullhistory_streaming_cold_tier_bytes"], 0)
+	assert.InDelta(t, 1.0, scalars["test_ns_fullhistory_streaming_chunk_boundaries_total"], 0)
+	assert.InDelta(t, 1.0, scalars["test_ns_fullhistory_streaming_catchup_passes_total"], 0)
+	assert.InDelta(t, 2.0, scalars["test_ns_fullhistory_streaming_freeze_chunks_total"], 0)
+	assert.InDelta(t, 1.0, scalars["test_ns_fullhistory_streaming_discarded_hot_chunks_total"], 0)
+	assert.InDelta(t, 2.0, scalars["test_ns_fullhistory_streaming_pruned_ops_total"], 0)
+	assert.InDelta(t, 1.0, scalars["test_ns_fullhistory_streaming_recoveries_total"], 0)
+	// recovered_keys_total sums the two Recovery arguments (3+1 = 4) across its series.
+	assert.InDelta(t, 4.0, scalars["test_ns_fullhistory_streaming_recovered_keys_total"], 0)
+
+	// Phase durations: catchup_pass + freeze + discard + prune + recovery = 5 observations.
+	assert.Equal(t, uint64(5), samples["test_ns_fullhistory_streaming_phase_duration_seconds"])
+}
+
+// Registering a second sink on the same registry panics. This pins the daemon
+// convention of one sink per registry (presumably via MustRegister — confirm).
+func TestPrometheusMetrics_DoubleRegisterPanics(t *testing.T) {
+	reg := prometheus.NewRegistry()
+	NewPrometheusMetrics(reg, "test_ns")
+	registerAgain := func() { NewPrometheusMetrics(reg, "test_ns") }
+	assert.Panics(t, registerAgain, "re-registering the same collectors must panic (one sink per registry)")
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/recovery.go b/cmd/stellar-rpc/internal/fullhistory/streaming/recovery.go
new file mode 100644
index 000000000..e491ac388
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/recovery.go
@@ -0,0 +1,370 @@
+package streaming
+
+import (
+	"errors"
+	"fmt"
+	"time"
+
+	supportlog "github.com/stellar/go-stellar-sdk/support/log"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/metastore"
+)
+
+// errCommitBatchFaultInjected is the error ApplySurgicalRecovery's batch
+// callback returns when c.hooks.commitBatchShouldFail() reports true — a
+// test-only fault-injection path used to force a recovery batch to be dropped.
+var errCommitBatchFaultInjected = errors.New("streaming: commit batch fault-injected (test only)")
+
+// Surgical recovery — design "Scenario coverage" cases 3 (tainted data) and 4
+// (hot-volume loss). The operator NEVER touches the filesystem. Recovery is ONE
+// atomic meta-store batch that DEMOTES the affected keys — never removes them —
+// split by tier:
+//
+//   - Tainted COLD artifacts (chunk:{c}:* keys) -> "freezing", the state that
+//     already means "this file is not to be trusted: re-derive or delete".
+//     Catch-up's per-chunk re-materialization (rule 1) overwrites the .pack in
+//     place.
+//   - Tainted or LOST HOT DBs (hot:chunk, the live chunk's included) ->
+//     "transient", instantly ineligible as a source (backfillSource reads only
+//     "ready") and ignored by the watermark (deriveWatermark counts only
+//     "ready" keys). openHotTierForChunk wipes and recreates one when
+//     re-ingestion re-opens that chunk; the discard scan retires any sitting
+//     below the live chunk.
+//
+// The batch commits atomically or not at all, so there is no interruption
+// analysis and re-running it is a no-op (every demote is an idempotent overwrite
+// to a fixed value, and a key already at the target value re-writes the same
+// value).
+//
+// STOPPED-DAEMON-ONLY — what enforces it TODAY vs once the daemon-side wiring
+// lands. RunSurgicalRecovery takes every storage root's flock before opening the
+// store, so it is BUILT to fail fast with ErrRootLocked against a running
+// daemon. That guard is only fully live once the daemon-side flock is wired: the
+// top-level daemon entry (the cmd glue that owns Config + process lifetime) must
+// call LockRoots(paths.LockRoots()...) once at startup and hold the locks for
+// the process's whole life, before opening the meta store and calling
+// startStreaming. Until that wiring exists, a live daemon does NOT hold these
+// flocks, so ErrRootLocked does not fire against it. The hard safety floor that
+// is already real is RocksDB's own metastore single-writer LOCK: it rejects
+// RunSurgicalRecovery's metastore.New open while a daemon holds the store open,
+// so recovery cannot corrupt a live daemon's metastore — it just fails with an
+// opaque RocksDB "lock hold" IO error instead of the clean ErrRootLocked, and
+// that LOCK does not cover the immutable/hot trees the flock guard targets for
+// the genuinely dangerous two-distinct-metastores-sharing-a-hot-tree case.
+// OPERATOR DISCIPLINE remains required: stop the daemon before recovering.
+//
+// =========================================================================
+// RUNBOOK — surgical recovery (tainted data / hot-volume loss)
+// =========================================================================
+//
+// WHEN: an operator has determined a contiguous range of chunks holds tainted
+// cold artifacts (a bad LedgerBackend run, a detected byte mismatch against a
+// re-derive) and/or lost-or-suspect hot DBs (case 4: ephemeral hot volume died
+// while the meta store survived, so its hot:chunk keys read "ready" with missing
+// dirs and the daemon fatals with ErrHotVolumeLost on start).
+//
+// STEPS:
+//  1. STOP the daemon — this is operator discipline, not yet a hard machine
+//     guard. The recovery acquires the same per-root flocks the daemon is meant
+//     to hold for its whole life; once the daemon-side flock wiring lands (see
+//     the STOPPED-DAEMON-ONLY note above), a recovery against a running daemon
+//     fails fast with ErrRootLocked. Until then, RocksDB's metastore
+//     single-writer LOCK still prevents recovery from opening a live daemon's
+//     meta store (it fails with an opaque RocksDB lock error), so a running
+//     daemon's metastore cannot be corrupted — but stop the daemon anyway: that
+//     LOCK does not cover a hot tree shared by two distinct metastores. Do not
+//     delete or move any file or directory — the recovery is pure key demotion;
+//     the daemon's own sweeps and openHotTierForChunk handle the dirs in their
+//     existing crash-safe order on the next start.
+//  2. RUN the recovery against the SAME config the daemon uses, naming the chunk
+//     range [Lo, Hi] (inclusive) to recover and which tiers to touch:
+//       - Tiers: ColdAndHot (the general case-3 batch — re-derive cold AND
+//         re-ingest hot), or HotOnly (the case-4 batch — the hot volume is gone
+//         but the cold artifacts survive on durable storage; demote only the
+//         orphaned hot:chunk keys).
+//       - Hi MUST reach the live chunk (the highest hot:chunk) whenever you want
+//         a tainted HOT chunk RE-INGESTED. The watermark is the max over "ready"
+//         hot chunks, so it regresses below the taint only once every ready hot
+//         chunk above it — up to the live chunk — is demoted. A sub-range whose
+//         Hi stops below the live chunk leaves those higher chunks ready and the
+//         watermark pinned, so the taint is NOT replayed (intended only when you
+//         do not want re-ingestion). RunSurgicalRecovery logs a note when a
+//         demotion stops below the live chunk.
+//  3. START the daemon. On restart the case-4 fatal no longer fires (it checks
+//     "ready" keys, and the demoted ones now read "transient"); the watermark
+//     falls to the last frozen boundary below the demoted range; catch-up
+//     re-derives the "freezing" cold artifacts and rebuilds overlapped indexes;
+//     captive core re-ingests the un-frozen tail FORWARD. There is no watermark
+//     to edit and no manual rewind — the derived watermark self-corrects.
+//
+// IDEMPOTENT: re-running the exact same recovery is a no-op. Running it again
+// after a partial start (the daemon already re-froze some artifacts) re-demotes
+// only what is still present, which catch-up repairs again — safe but rarely
+// needed.
+// =========================================================================
+
+// RecoveryTier selects which storage tier(s) a surgical recovery touches.
+type RecoveryTier int
+
+const (
+	// RecoverColdAndHot is the general case-3 recovery and the zero value of
+	// RecoveryTier. Planning it selects the range's cold artifact keys (to be
+	// demoted to "freezing") AND its hot:chunk keys (to "transient"). Use it
+	// when the cold artifacts themselves are suspect — a bad backend run or a
+	// detected byte mismatch.
+	RecoverColdAndHot RecoveryTier = iota
+	// RecoverHotOnly is the case-4 recovery: planning it selects ONLY the
+	// range's hot:chunk keys (to "transient") and skips the cold artifact keys.
+	// Use it when the hot volume was lost (e.g. ephemeral NVMe died) but the
+	// cold artifacts survive on durable storage, so there is nothing to
+	// re-derive.
+	RecoverHotOnly
+)
+
+// String renders the tier for logs: "cold+hot", "hot-only", else the raw value.
+func (t RecoveryTier) String() string {
+	if t == RecoverColdAndHot {
+		return "cold+hot"
+	}
+	if t == RecoverHotOnly {
+		return "hot-only"
+	}
+	return fmt.Sprintf("RecoveryTier(%d)", int(t))
+}
+
+// RecoveryRequest names the contiguous chunk range [Lo, Hi] (inclusive) to
+// recover and which tier(s) to touch. The range is the OPERATOR's assessment of
+// the tainted/lost span; planning keeps exactly the keys whose chunk id falls in
+// it and nothing else — a sub-range is a supported operation. Lo > Hi is
+// rejected by PlanSurgicalRecovery.
+//
+// Hot tier, important: the last-committed-ledger derivation is the MAX over all
+// "ready" hot chunks, so it regresses below the range only when every ready hot
+// chunk at or above Lo is demoted — i.e. when Hi reaches the live chunk (the
+// highest hot:chunk key). To RE-INGEST a tainted hot chunk, set Hi to the live
+// chunk; a lower Hi leaves the higher ready chunks (and the watermark) in place.
+// That is intended when you do NOT want re-ingestion; RunSurgicalRecovery logs
+// an informational note when a demotion stops below the live chunk.
+type RecoveryRequest struct {
+	Lo, Hi chunk.ID     // inclusive bounds of the chunk range
+	Tier   RecoveryTier // which tier(s) to demote; zero value is RecoverColdAndHot
+}
+
+// RecoveryPlan is the exact set of keys a recovery will demote, as computed by
+// PlanSurgicalRecovery from the catalog's key listings. It is returned so an
+// operator (or a test) can inspect — or dry-run — the demotions before they are
+// committed. Only keys the catalog reported at plan time are listed.
+type RecoveryPlan struct {
+	Request RecoveryRequest // the request this plan was computed for
+
+	// ColdKeys are the chunk:{c}:* keys to demote to "freezing", in key order.
+	ColdKeys []ArtifactRef
+	// HotKeys are the hot:chunk:{c} chunk ids to demote to "transient",
+	// ascending (NOTE(review): order is whatever Catalog.HotChunkKeys returns).
+	HotKeys []chunk.ID
+}
+
+// Empty reports whether the plan lists no keys at all, cold or hot — e.g. for a
+// range that matched nothing in the catalog.
+func (p RecoveryPlan) Empty() bool {
+	return len(p.ColdKeys)+len(p.HotKeys) == 0
+}
+
+// PlanSurgicalRecovery computes — but does not apply — the demotion plan for req.
+// It rejects an inverted range (Lo > Hi) and a Tier other than RecoverColdAndHot
+// or RecoverHotOnly, then lists only keys the catalog reports inside [Lo, Hi],
+// so applying the plan never creates a key.
+func PlanSurgicalRecovery(cat *Catalog, req RecoveryRequest) (RecoveryPlan, error) {
+	if req.Lo > req.Hi {
+		return RecoveryPlan{}, fmt.Errorf(
+			"streaming: surgical recovery range lo %s > hi %s", req.Lo, req.Hi,
+		)
+	}
+	// An unrecognized tier must not silently degrade to a hot-only demotion.
+	if req.Tier != RecoverColdAndHot && req.Tier != RecoverHotOnly {
+		return RecoveryPlan{}, fmt.Errorf("streaming: surgical recovery unknown tier %s", req.Tier)
+	}
+	plan := RecoveryPlan{Request: req}
+
+	// Cold tier: chunk:{c}:* artifact keys in [Lo, Hi]; skipped for hot-only (case 4).
+	if req.Tier == RecoverColdAndHot {
+		coldRefs, err := cat.ChunkArtifactKeys()
+		if err != nil {
+			return RecoveryPlan{}, err
+		}
+		for _, ref := range coldRefs {
+			if req.Lo <= ref.Chunk && ref.Chunk <= req.Hi {
+				plan.ColdKeys = append(plan.ColdKeys, ref)
+			}
+		}
+	}
+
+	// Hot tier (both tiers): every hot:chunk:{c} key, any value, in [Lo, Hi]. The live
+	// chunk's key is eligible on purpose; per the design, demoting it regresses the watermark.
+	hotIDs, err := cat.HotChunkKeys()
+	if err != nil {
+		return RecoveryPlan{}, err
+	}
+	for _, id := range hotIDs {
+		if req.Lo <= id && id <= req.Hi {
+			plan.HotKeys = append(plan.HotKeys, id)
+		}
+	}
+
+	return plan, nil
+}
+
+// ApplySurgicalRecovery writes the plan's demotions inside ONE c.store.Batch
+// call: every cold artifact key is put as StateFreezing and every hot chunk key
+// as HotTransient. It only issues Puts for keys named in the plan — it deletes
+// nothing and touches no file or directory. Re-applying an already-applied plan
+// writes the same values again.
+//
+// An empty plan still runs the (empty) batch rather than returning an error, so
+// a recovery over a range with no matching keys is a clean no-op. Atomicity and
+// syncing are the metastore Batch's contract and are not visible in this method.
+func (c *Catalog) ApplySurgicalRecovery(plan RecoveryPlan) error {
+	demote := func(batch *metastore.BatchWriter) error {
+		for _, cold := range plan.ColdKeys {
+			batch.Put(cold.Key(), string(StateFreezing))
+		}
+		for _, hot := range plan.HotKeys {
+			batch.Put(hotChunkKey(hot), string(HotTransient))
+		}
+		// Test-only fault injection, checked AFTER the puts are queued. Returning
+		// an error from the callback is meant to make the store drop the whole
+		// batch, so a test can assert none of the demotions became observable.
+		if !c.hooks.commitBatchShouldFail() {
+			return nil
+		}
+		return errCommitBatchFaultInjected
+	}
+	return c.store.Batch(demote)
+}
+
+// SurgicalRecovery is the catalog-level entrypoint: plan + apply in one call,
+// returning the plan that was committed so the caller can log/report exactly
+// what changed. The daemon must be stopped; the caller is responsible for
+// holding the storage-root locks (RunSurgicalRecovery does this; a test holding
+// an exclusive store may call this directly).
+func (c *Catalog) SurgicalRecovery(req RecoveryRequest) (RecoveryPlan, error) {
+	plan, err := PlanSurgicalRecovery(c, req)
+	if err != nil {
+		return RecoveryPlan{}, err
+	}
+	if err := c.ApplySurgicalRecovery(plan); err != nil {
+		return RecoveryPlan{}, err
+	}
+	return plan, nil
+}
+
+// ErrRecoveryEmptyRange is the sentinel RunSurgicalRecovery returns, alongside
+// the (empty) plan, when the requested range matched no cold or hot keys. It is
+// informational: by then the empty batch has already been applied. It exists so
+// an operator who mistyped a range learns that nothing was touched.
+var ErrRecoveryEmptyRange = errors.New("streaming: surgical recovery matched no keys in range")
+
+// RunSurgicalRecovery is the OPERATOR ENTRYPOINT for recovering a tainted/lost
+// chunk range; it is meant to run against a STOPPED daemon. In order, it:
+//
+//  1. defaults cfg, resolves its storage paths, and locks every storage root
+//     with LockRoots, returning the wrapped error if a lock cannot be taken;
+//  2. opens the meta store at paths.Catalog with metastore.New;
+//  3. plans and commits the demotions through Catalog.SurgicalRecovery;
+//  4. reports the per-tier key counts and elapsed time to metrics and the log.
+//
+// A nil logger is replaced by supportlog.New() and metrics is passed through
+// metricsOrNop. Deferred calls release the locks and close the store on return.
+//
+// It returns the committed plan; an empty plan is still returned, paired with
+// ErrRecoveryEmptyRange. Every other error comes back with a zero-value plan.
+func RunSurgicalRecovery(
+	cfg Config, req RecoveryRequest, logger *supportlog.Entry, metrics Metrics,
+) (RecoveryPlan, error) {
+	if logger == nil {
+		logger = supportlog.New()
+	}
+	metrics = metricsOrNop(metrics)
+	paths := cfg.WithDefaults().ResolvePaths()
+
+	// Lock EVERY storage root before opening the store. Per this file's
+	// STOPPED-DAEMON-ONLY note, the daemon does not yet take these flocks itself,
+	// so for now only another flock holder makes this step fail; a live daemon is
+	// instead expected to be rejected by the metastore.New open below.
+	rootLocks, lockErr := LockRoots(paths.LockRoots()...)
+	if lockErr != nil {
+		return RecoveryPlan{}, fmt.Errorf("streaming: surgical recovery lock roots: %w", lockErr)
+	}
+	defer rootLocks.Release()
+
+	store, openErr := metastore.New(paths.Catalog, logger)
+	if openErr != nil {
+		return RecoveryPlan{}, fmt.Errorf("streaming: surgical recovery open meta store: %w", openErr)
+	}
+	defer func() { _ = store.Close() }()
+
+	cat := NewCatalog(store, NewLayoutFromPaths(paths))
+
+	logger.WithField("range_lo", req.Lo.String()).
+		WithField("range_hi", req.Hi.String()).
+		WithField("tier", req.Tier.String()).
+		Info("surgical recovery: planning demotions")
+
+	began := time.Now()
+	plan, err := cat.SurgicalRecovery(req)
+	if err != nil {
+		return RecoveryPlan{}, err
+	}
+	coldCount, hotCount := len(plan.ColdKeys), len(plan.HotKeys)
+	metrics.Recovery(coldCount, hotCount, time.Since(began))
+
+	logger.WithField("cold_keys", coldCount).
+		WithField("hot_keys", hotCount).
+		WithField("duration", time.Since(began).String()).
+		Info("surgical recovery: demotion batch committed")
+
+	// Best-effort, read-only advisory. The batch has already committed, so the
+	// helper swallows its own probe failure rather than failing the recovery.
+	noteHotDemotionBelowLive(cat, plan.HotKeys, logger)
+
+	if plan.Empty() {
+		return plan, ErrRecoveryEmptyRange
+	}
+	return plan, nil
+}
+
+// noteHotDemotionBelowLive logs an Info advisory when the highest demoted hot
+// chunk is below the highest hot:chunk key in the catalog (the live chunk). Per
+// the recovery runbook, the hot chunks left above the demoted range keep the
+// watermark pinned, so a tainted hot chunk in the range will not be re-ingested
+// unless Hi is extended to the live chunk. It does nothing when no hot key was
+// demoted, and silently returns if the catalog probe fails.
+func noteHotDemotionBelowLive(cat *Catalog, demoted []chunk.ID, logger *supportlog.Entry) {
+	if len(demoted) == 0 {
+		return
+	}
+	hotIDs, err := cat.HotChunkKeys()
+	if err != nil {
+		return
+	}
+	var live, topDemoted chunk.ID
+	for _, id := range hotIDs {
+		if id > live {
+			live = id
+		}
+	}
+	for _, id := range demoted {
+		if id > topDemoted {
+			topDemoted = id
+		}
+	}
+	if live <= topDemoted {
+		return
+	}
+	logger.WithField("highest_demoted_hot", topDemoted.String()).
+		WithField("live_chunk", live.String()).
+		Info("surgical recovery: hot demotion stops below the live chunk — " +
+			"ready hot chunks above it keep the watermark pinned above the demoted range; " +
+			"to RE-INGEST a tainted hot chunk, set Hi to the live chunk")
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/recovery_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/recovery_test.go
new file mode 100644
index 000000000..6c0ef2ba9
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/recovery_test.go
@@ -0,0 +1,526 @@
+package streaming
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/ledger"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/stores/metastore"
+)
+
+// ---------------------------------------------------------------------------
+// Surgical recovery test helpers.
+// ---------------------------------------------------------------------------
+
+// mustState returns the State under chunk c's artifact key; a read error fails the test.
+//
+//nolint:unparam // kind varies in later slices (events/txhash)
+func mustState(t *testing.T, cat *Catalog, c chunk.ID, kind Kind) State {
+	t.Helper()
+	got, readErr := cat.State(c, kind)
+	require.NoError(t, readErr)
+	return got
+}
+
+// mustHotState returns the HotState under chunk c's hot:chunk key; a read error fails the test.
+func mustHotState(t *testing.T, cat *Catalog, c chunk.ID) HotState {
+	t.Helper()
+	got, readErr := cat.HotState(c)
+	require.NoError(t, readErr)
+	return got
+}
+
+// ---------------------------------------------------------------------------
+// The demotion batch: atomic, idempotent, scoped to the range, never creating
+// absent keys.
+// ---------------------------------------------------------------------------
+
+func TestSurgicalRecovery_DemotesColdAndHot(t *testing.T) {
+	cat, _ := testCatalog(t)
+	inRange := []chunk.ID{5, 6}
+
+	// Seed the range: frozen ledgers artifacts, then ready hot DBs (6 is the live chunk).
+	for _, c := range inRange {
+		freezeKinds(t, cat, c, KindLedgers)
+	}
+	readyHot(t, cat, 5)
+	readyHot(t, cat, 6)
+	// Chunk 9 lies outside the range and MUST come through untouched.
+	freezeKinds(t, cat, 9, KindLedgers)
+	readyHot(t, cat, 9)
+
+	req := RecoveryRequest{Lo: 5, Hi: 6, Tier: RecoverColdAndHot}
+	plan, err := cat.SurgicalRecovery(req)
+	require.NoError(t, err)
+	require.False(t, plan.Empty())
+
+	// In range: cold artifacts -> "freezing", hot DBs (live included) -> "transient".
+	for _, c := range inRange {
+		require.Equal(t, StateFreezing, mustState(t, cat, c, KindLedgers))
+	}
+	for _, c := range inRange {
+		require.Equal(t, HotTransient, mustHotState(t, cat, c))
+	}
+	// Out of range: exactly as seeded.
+	require.Equal(t, StateFrozen, mustState(t, cat, 9, KindLedgers))
+	require.Equal(t, HotReady, mustHotState(t, cat, 9))
+}
+
+func TestSurgicalRecovery_Idempotent_ReRunIsNoOp(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// One frozen cold artifact (chunk 2) and two ready hot DBs (chunks 2 and 3).
+	freezeKinds(t, cat, 2, KindLedgers)
+	readyHot(t, cat, 2)
+	readyHot(t, cat, 3)
+	req := RecoveryRequest{Lo: 2, Hi: 3, Tier: RecoverColdAndHot}
+
+	firstPlan, err := cat.SurgicalRecovery(req)
+	require.NoError(t, err)
+
+	// Record every chunk/hot key once the first apply has landed.
+	afterFirst := snapshotAllKeys(t, cat)
+
+	// Apply the IDENTICAL request again: each demotion rewrites the value it
+	// already holds, so the second snapshot must equal the first.
+	secondPlan, err := cat.SurgicalRecovery(req)
+	require.NoError(t, err)
+	afterSecond := snapshotAllKeys(t, cat)
+
+	require.Equal(t, afterFirst, afterSecond, "re-running surgical recovery must be a no-op")
+	require.Len(t, secondPlan.ColdKeys, len(firstPlan.ColdKeys))
+	require.Len(t, secondPlan.HotKeys, len(firstPlan.HotKeys))
+}
+
+// TestSurgicalRecovery_BatchIsAtomic pins the all-or-nothing property of
+// ApplySurgicalRecovery: its cold and hot demotions commit as ONE batch — the
+// basis of the design's "commits atomically or not at all" / "no interruption
+// analysis" claim. A fault is injected via the failCommitBatch hook so the
+// apply errors, and the FULL key snapshot must then equal the one taken
+// beforehand: not a single demotion leaked. An ApplySurgicalRecovery rewritten
+// as separate non-atomic per-key Puts would leave some demotions durable here
+// and fail this test.
+func TestSurgicalRecovery_BatchIsAtomic(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Seed both demotion families — frozen cold artifacts and ready hot DBs
+	// (the live chunk's among them) — so that a partially-committing
+	// implementation would be forced to leak at least one of them.
+	freezeKinds(t, cat, 5, KindLedgers)
+	freezeKinds(t, cat, 6, KindLedgers)
+	readyHot(t, cat, 5)
+	readyHot(t, cat, 6)
+
+	// Planning runs against durable state and does not mutate it.
+	req := RecoveryRequest{Lo: 5, Hi: 6, Tier: RecoverColdAndHot}
+	plan, err := PlanSurgicalRecovery(cat, req)
+	require.NoError(t, err)
+	require.False(t, plan.Empty())
+	require.NotEmpty(t, plan.ColdKeys)
+	require.NotEmpty(t, plan.HotKeys)
+
+	seeded := snapshotAllKeys(t, cat)
+
+	// Arm the fault hook for the next apply only, then disarm it again.
+	cat.hooks.failCommitBatch = func() bool { return true }
+	applyErr := cat.ApplySurgicalRecovery(plan)
+	require.Error(t, applyErr, "ApplySurgicalRecovery must surface the injected batch failure")
+	cat.hooks.failCommitBatch = nil
+
+	// All-or-nothing: the failed apply wrote NOTHING, so every cold/hot key
+	// still reads exactly as seeded.
+	postFailure := snapshotAllKeys(t, cat)
+	require.Equal(t, seeded, postFailure,
+		"a dropped recovery batch must leave every demotion key unchanged (atomicity)")
+
+	// With the fault gone, re-applying the same plan lands the whole batch.
+	require.NoError(t, cat.ApplySurgicalRecovery(plan))
+	for _, c := range []chunk.ID{5, 6} {
+		require.Equal(t, StateFreezing, mustState(t, cat, c, KindLedgers))
+	}
+	require.Equal(t, HotTransient, mustHotState(t, cat, 5))
+	require.Equal(t, HotTransient, mustHotState(t, cat, 6))
+}
+
+// snapshotAllKeys captures the chunk-artifact and hot-chunk key families as a
+// key -> state-string map, for no-op / atomicity comparisons.
+func snapshotAllKeys(t *testing.T, cat *Catalog) map[string]string {
+	t.Helper()
+	artifacts, err := cat.ChunkArtifactKeys()
+	require.NoError(t, err)
+	snap := make(map[string]string)
+	for _, ref := range artifacts {
+		snap[ref.Key()] = string(ref.State)
+	}
+	hotIDs, err := cat.HotChunkKeys()
+	require.NoError(t, err)
+	for _, id := range hotIDs {
+		snap[hotChunkKey(id)] = string(mustHotState(t, cat, id))
+	}
+	return snap
+}
+
+func TestSurgicalRecovery_HotOnly_LeavesColdUntouched(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Case-4 shape: the cold artifact is intact on durable storage and only the
+	// hot DBs are gone, so a hot-only recovery must leave every cold key alone.
+	freezeKinds(t, cat, 5, KindLedgers)
+	readyHot(t, cat, 5)
+	readyHot(t, cat, 6)
+
+	req := RecoveryRequest{Lo: 5, Hi: 6, Tier: RecoverHotOnly}
+	plan, err := cat.SurgicalRecovery(req)
+	require.NoError(t, err)
+	require.Empty(t, plan.ColdKeys, "hot-only recovery must not list cold keys")
+	require.Len(t, plan.HotKeys, 2)
+
+	// The cold key still reads as seeded ...
+	require.Equal(t, StateFrozen, mustState(t, cat, 5, KindLedgers))
+	// ... while both hot keys in the range were demoted.
+	for _, c := range []chunk.ID{5, 6} {
+		require.Equal(t, HotTransient, mustHotState(t, cat, c))
+	}
+}
+
+func TestSurgicalRecovery_NeverCreatesAbsentKeys(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Only chunk 5 has keys; the recovered range [20, 25] is DISJOINT from it.
+	freezeKinds(t, cat, 5, KindLedgers)
+	readyHot(t, cat, 5)
+
+	req := RecoveryRequest{Lo: 20, Hi: 25, Tier: RecoverColdAndHot}
+	plan, err := cat.SurgicalRecovery(req)
+	require.NoError(t, err)
+	require.True(t, plan.Empty(), "a range matching no keys yields an empty plan")
+	// Nothing was conjured for [20, 25]: both key reads come back as the empty state.
+	for _, c := range []chunk.ID{20, 21, 22, 23, 24, 25} {
+		require.Equal(t, State(""), mustState(t, cat, c, KindLedgers))
+		require.Equal(t, HotState(""), mustHotState(t, cat, c))
+	}
+	// Chunk 5, the only seeded one, reads exactly as seeded.
+	require.Equal(t, StateFrozen, mustState(t, cat, 5, KindLedgers))
+	require.Equal(t, HotReady, mustHotState(t, cat, 5))
+}
+
+func TestSurgicalRecovery_RangeValidation(t *testing.T) {
+	cat, _ := testCatalog(t)
+	_, rangeErr := cat.SurgicalRecovery(RecoveryRequest{Lo: 7, Hi: 3, Tier: RecoverColdAndHot}) // Lo > Hi
+	require.Error(t, rangeErr)
+	require.Contains(t, rangeErr.Error(), "lo")
+}
+
+// TestSurgicalRecovery_ColdBoundary checks that cold-key selection treats both
+// range endpoints as inclusive and skips chunks strictly outside the range.
+func TestSurgicalRecovery_ColdBoundary(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Freeze ledgers on both edges of [10, 20] and on the chunk just past each.
+	for _, c := range []chunk.ID{9, 10, 20, 21} {
+		freezeKinds(t, cat, c, KindLedgers)
+	}
+
+	req := RecoveryRequest{Lo: 10, Hi: 20, Tier: RecoverColdAndHot}
+	plan, err := PlanSurgicalRecovery(cat, req)
+	require.NoError(t, err)
+	picked := make(map[string]bool, len(plan.ColdKeys))
+	for _, ref := range plan.ColdKeys {
+		picked[ref.Key()] = true
+	}
+	require.True(t, picked[chunkKey(10, KindLedgers)], "chunk 10 is the low edge (inclusive)")
+	require.True(t, picked[chunkKey(20, KindLedgers)], "chunk 20 is the high edge (inclusive)")
+	require.False(t, picked[chunkKey(9, KindLedgers)], "chunk 9 is below the range")
+	require.False(t, picked[chunkKey(21, KindLedgers)], "chunk 21 is above the range")
+}
+
+// ---------------------------------------------------------------------------
+// Self-correcting watermark. Demoting hot keys regresses deriveWatermark to the
+// last frozen boundary; demoting strictly below the live chunk leaves it
+// unchanged. No manual rewind.
+// ---------------------------------------------------------------------------
+
+// TestSurgicalRecovery_SelfCorrectingWatermark_RegressesToLastFrozenBoundary
+// makes the design's case-3/4 claim concrete: once a demotion reaches the live
+// chunk, the derived watermark falls back to the last frozen boundary, and
+// there is NO stored pointer to edit.
+func TestSurgicalRecovery_SelfCorrectingWatermark_RegressesToLastFrozenBoundary(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq)) // genesis floor
+
+	// Cold history: chunks 0..2 are fully durable (frozen), which puts the last
+	// frozen boundary at chunk 2's last ledger.
+	for c := chunk.ID(0); c <= 2; c++ {
+		makeChunkDurable(t, cat, c)
+	}
+
+	// Live chunk 3: a real hot DB with one ledger committed mid-chunk. BEFORE
+	// recovery the watermark must sit on that committed frontier.
+	live := chunk.ID(3)
+	hotDB := openLiveHotDB(t, cat, live)
+	frontier := live.FirstLedger() + 4321
+	require.NoError(t, hotDB.Ledgers().AddLedgers(ledger.Entry{Seq: frontier, Bytes: []byte("live")}))
+	require.NoError(t, hotDB.Close())
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	wmBefore, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, frontier, wmBefore, "watermark reflects the live DB's committed frontier")
+
+	// Recover range [3, 3], i.e. the live chunk: its hot key drops to "transient".
+	// The hot dir stays where it is; demotion is pure key surgery.
+	_, err = cat.SurgicalRecovery(RecoveryRequest{Lo: live, Hi: live, Tier: RecoverColdAndHot})
+	require.NoError(t, err)
+
+	// The demoted live key is no longer "ready", so deriveWatermark skips it and
+	// lands on chunk 2's last ledger — the last frozen boundary. No rewind edit.
+	wmAfter, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, chunk.ID(2).LastLedger(), wmAfter,
+		"demoting the live hot key regresses the watermark to the last frozen boundary")
+	require.Less(t, wmAfter, wmBefore, "the watermark strictly regressed")
+}
+
+// TestSurgicalRecovery_DemotionBelowLiveLeavesWatermarkUnchanged covers the
+// other half of the uniformity claim: demoting strictly BELOW the live chunk
+// does not move the watermark — the demoted chunk was never the highest "ready"
+// key, and the live chunk's "ready" DB keeps pinning the bound.
+func TestSurgicalRecovery_DemotionBelowLiveLeavesWatermarkUnchanged(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	makeChunkDurable(t, cat, 0)
+	makeChunkDurable(t, cat, 1)
+
+	// Ready hot chunks: the lower chunk 2, plus live chunk 5 backed by a real DB.
+	readyHot(t, cat, 2)
+	live := chunk.ID(5)
+	hotDB := openLiveHotDB(t, cat, live)
+	frontier := live.FirstLedger() + 100
+	require.NoError(t, hotDB.Ledgers().AddLedgers(ledger.Entry{Seq: frontier, Bytes: []byte("live")}))
+	require.NoError(t, hotDB.Close())
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	wmBefore, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, frontier, wmBefore)
+
+	// Hot-only recovery of [2, 2]: just the lower chunk, strictly below live chunk 5.
+	_, err = cat.SurgicalRecovery(RecoveryRequest{Lo: 2, Hi: 2, Tier: RecoverHotOnly})
+	require.NoError(t, err)
+	require.Equal(t, HotTransient, mustHotState(t, cat, 2))
+
+	wmAfter, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, wmBefore, wmAfter,
+		"demoting a hot key strictly below the live chunk leaves the watermark unchanged")
+}
+
+// TestSurgicalRecovery_CatchupReDerivesFreezingColdArtifacts covers how the cold
+// half heals through existing machinery: once a chunk's artifacts are demoted to
+// "freezing", highestDurableChunk stops counting it as durable — the signal that
+// makes backfill's per-chunk resolver re-materialize it (rule 1, overwriting in
+// place). The assertion here is that the durable-chunk frontier regresses past
+// the demoted chunks.
+func TestSurgicalRecovery_CatchupReDerivesFreezingColdArtifacts(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// With chunks 0..3 all durable, the durable frontier starts at 3.
+	for _, c := range []chunk.ID{0, 1, 2, 3} {
+		makeChunkDurable(t, cat, c)
+	}
+	frontier, err := highestDurableChunk(cat)
+	require.NoError(t, err)
+	require.Equal(t, int64(3), frontier)
+
+	// Taint chunks 2..3: their ledgers artifacts drop to "freezing".
+	_, err = cat.SurgicalRecovery(RecoveryRequest{Lo: 2, Hi: 3, Tier: RecoverColdAndHot})
+	require.NoError(t, err)
+	for _, c := range []chunk.ID{2, 3} {
+		require.Equal(t, StateFreezing, mustState(t, cat, c, KindLedgers))
+	}
+
+	// The durable frontier falls back to chunk 1: chunks 2 and 3 are now
+	// re-derivable "freezing" debris rather than durable truth. Catch-up's resolver
+	// schedules their re-materialization; asserted here is the frontier input to it.
+	regressed, err := highestDurableChunk(cat)
+	require.NoError(t, err)
+	require.Equal(t, int64(1), regressed,
+		"demoting cold artifacts to freezing regresses the durable-chunk frontier")
+}
+
+// ---------------------------------------------------------------------------
+// Hot-volume-loss detection (case 4) — the fatal already exists; verify it.
+// ---------------------------------------------------------------------------
+
+// TestHotVolumeLoss_DeriveWatermarkFatalOnReadyKeyMissingDir covers the case-4
+// fatal: a "ready" hot key whose dir has vanished means hot-volume loss, which
+// deriveWatermark reports as ErrHotVolumeLost instead of silently healing.
+func TestHotVolumeLoss_DeriveWatermarkFatalOnReadyKeyMissingDir(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Build the lost-volume shape (meta survived, the ephemeral hot tree did not).
+	// readyHot creates the dir, so the key is flipped transient -> ready by hand
+	// here and the chunk's hot dir is then removed to simulate the loss.
+	lost := chunk.ID(4)
+	require.NoError(t, cat.PutHotTransient(lost))
+	require.NoError(t, cat.FlipHotReady(lost))
+	require.NoError(t, os.RemoveAll(cat.layout.HotChunkPath(lost)))
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+	_, wmErr := deriveWatermark(cat, probe)
+	require.Error(t, wmErr)
+	require.ErrorIs(t, wmErr, ErrHotVolumeLost,
+		"a ready hot key with a missing dir must fatal as ErrHotVolumeLost")
+}
+
+// TestHotVolumeLoss_OpenHotTierFatalOnReadyKeyMissingDir exercises the same fatal
+// at the OTHER detection site, openHotTierForChunk — the one a later open would
+// hit if derivation somehow didn't.
+func TestHotVolumeLoss_OpenHotTierFatalOnReadyKeyMissingDir(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	lost := chunk.ID(4)
+	require.NoError(t, cat.PutHotTransient(lost))
+	require.NoError(t, cat.FlipHotReady(lost))
+	require.NoError(t, os.RemoveAll(cat.layout.HotChunkPath(lost)))
+
+	_, openErr := openHotTierForChunk(cat, lost, silentLogger())
+	require.Error(t, openErr)
+	require.ErrorIs(t, openErr, ErrHotVolumeLost,
+		"opening a ready hot key with a missing dir must fatal as ErrHotVolumeLost")
+}
+
+// TestHotVolumeLoss_RecoveryThenWatermarkHealsForward runs case 4 end to end: the
+// operator demotes the orphaned hot key (hot-only), the fatal — which only looks
+// at "ready" keys — stops firing, and the watermark drops to the last frozen
+// boundary so re-ingestion can fill forward.
+func TestHotVolumeLoss_RecoveryThenWatermarkHealsForward(t *testing.T) {
+	cat, _ := testCatalog(t)
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+
+	// Cold history through chunk 2 is durable (it lives on durable storage).
+	for _, c := range []chunk.ID{0, 1, 2} {
+		makeChunkDurable(t, cat, c)
+	}
+
+	// The orphan: live chunk 3's hot key is "ready" but its dir is gone (lost NVMe).
+	orphan := chunk.ID(3)
+	require.NoError(t, cat.PutHotTransient(orphan))
+	require.NoError(t, cat.FlipHotReady(orphan))
+	require.NoError(t, os.RemoveAll(cat.layout.HotChunkPath(orphan)))
+
+	probe := NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger())
+
+	// Pre-recovery, deriving the watermark hits the fatal.
+	_, err := deriveWatermark(cat, probe)
+	require.ErrorIs(t, err, ErrHotVolumeLost)
+
+	// The operator's case-4 (hot-only) recovery over just the orphaned chunk.
+	_, err = cat.SurgicalRecovery(RecoveryRequest{Lo: orphan, Hi: orphan, Tier: RecoverHotOnly})
+	require.NoError(t, err)
+	require.Equal(t, HotTransient, mustHotState(t, cat, orphan))
+
+	// Post-recovery no "ready" key lacks its dir, so the fatal is silent and the
+	// watermark settles on the last frozen boundary (chunk 2's last ledger), from
+	// which captive core re-ingests the lost tail forward.
+	healed, err := deriveWatermark(cat, probe)
+	require.NoError(t, err)
+	require.Equal(t, chunk.ID(2).LastLedger(), healed,
+		"after hot-only recovery the watermark heals to the last frozen boundary")
+}
+
+// ---------------------------------------------------------------------------
+// Operator entrypoint — RunSurgicalRecovery: stopped-daemon-only (flock) and
+// the end-to-end open/demote/close happy path.
+// ---------------------------------------------------------------------------
+
+// recoveryConfig builds a Config rooted at a temp dir, enough for
+// RunSurgicalRecovery (which only needs the data dir + cpi default).
+func recoveryConfig(t *testing.T) Config {
+	t.Helper()
+	return Config{
+		Service:   ServiceConfig{DefaultDataDir: t.TempDir()},
+		Streaming: StreamingConfig{EarliestLedger: "genesis"},
+	}
+}
+
+func TestRunSurgicalRecovery_RefusesWhileDaemonRunning(t *testing.T) {
+	cfg := recoveryConfig(t)
+	paths := cfg.WithDefaults().ResolvePaths()
+
+	// Stand in for ANOTHER process by holding one storage-root flock (the hot tree;
+	// any root would do, RunSurgicalRecovery takes them all). This proves the
+	// ErrRootLocked fail-fast fires whenever a root is already held — the guard a
+	// daemon will trip ONCE the daemon-side LockRoots wiring lands (today the daemon
+	// does not take these flocks, so the live-daemon guard is instead RocksDB's
+	// metastore single-writer LOCK — see the STOPPED-DAEMON-ONLY note in recovery.go).
+	rootLock, err := LockRoots(paths.HotStorage)
+	require.NoError(t, err)
+	defer rootLock.Release()
+
+	req := RecoveryRequest{Lo: 1, Hi: 2, Tier: RecoverColdAndHot}
+	_, err = RunSurgicalRecovery(cfg, req, silentLogger(), nil)
+	require.Error(t, err)
+	require.ErrorIs(t, err, ErrRootLocked,
+		"recovery against a running daemon must fail fast with ErrRootLocked")
+}
+
+func TestRunSurgicalRecovery_HappyPath_OpensDemotesCloses(t *testing.T) {
+	cfg := recoveryConfig(t)
+	paths := cfg.WithDefaults().ResolvePaths()
+
+	// Seed through a catalog opened on the SAME meta path the entrypoint reopens,
+	// and CLOSE it before the run (RocksDB is single-writer; the entrypoint takes
+	// the lock + reopens).
+	seedStore, err := metastore.New(paths.Catalog, silentLogger())
+	require.NoError(t, err)
+	seedCat := NewCatalog(seedStore, NewLayout(paths.DataDir))
+	freezeKinds(t, seedCat, 5, KindLedgers)
+	require.NoError(t, seedCat.PutHotTransient(5))
+	require.NoError(t, seedCat.FlipHotReady(5))
+	require.NoError(t, seedStore.Close())
+
+	// The entrypoint locks every root, reopens the store, commits the demotion
+	// batch, and releases.
+	req := RecoveryRequest{Lo: 5, Hi: 5, Tier: RecoverColdAndHot}
+	plan, err := RunSurgicalRecovery(cfg, req, silentLogger(), nil)
+	require.NoError(t, err)
+	require.False(t, plan.Empty())
+	require.Len(t, plan.ColdKeys, 1)
+	require.Len(t, plan.HotKeys, 1)
+
+	// Its locks are released again, so a fresh open of the store sees the demotions.
+	reopened, err := metastore.New(paths.Catalog, silentLogger())
+	require.NoError(t, err)
+	defer func() { _ = reopened.Close() }()
+	verifyCat := NewCatalog(reopened, NewLayout(paths.DataDir))
+
+	require.Equal(t, StateFreezing, mustState(t, verifyCat, 5, KindLedgers))
+	require.Equal(t, HotTransient, mustHotState(t, verifyCat, 5))
+}
+
+func TestRunSurgicalRecovery_EmptyRangeReportsErrRecoveryEmptyRange(t *testing.T) {
+	cfg := recoveryConfig(t)
+	paths := cfg.WithDefaults().ResolvePaths()
+
+	// Create the store by opening and at once closing it: the path exists, no keys do.
+	emptyStore, err := metastore.New(paths.Catalog, silentLogger())
+	require.NoError(t, err)
+	require.NoError(t, emptyStore.Close())
+
+	req := RecoveryRequest{Lo: 1, Hi: 9, Tier: RecoverColdAndHot}
+	plan, err := RunSurgicalRecovery(cfg, req, silentLogger(), nil)
+	require.ErrorIs(t, err, ErrRecoveryEmptyRange,
+		"a range matching no keys reports ErrRecoveryEmptyRange")
+	require.True(t, plan.Empty())
+
+	// Sanity: a lock file exists under the hot-storage root after the run.
+	_, statErr := os.Stat(filepath.Join(paths.HotStorage, lockFileName))
+	require.NoError(t, statErr)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/retention_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/retention_test.go
new file mode 100644
index 000000000..56089f709
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/retention_test.go
@@ -0,0 +1,185 @@
+package streaming
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// Reader retention contract (retention.go): a seq below the floor is not-found
+// regardless of on-disk state. These are pure-arithmetic unit tests; the
+// straddling-window scenario below ties the gate to real on-disk artifacts.
+// ---------------------------------------------------------------------------
+
+func TestRetentionGate_AdmitsAtAndAboveFloor(t *testing.T) {
+	// With through at chunk 100's last ledger and 10 chunks retained, the floor is
+	// chunk 91's first ledger (effectiveRetentionFloor: 100-10+1 = 91).
+	through := chunk.ID(100).LastLedger()
+	gate := NewRetentionGate(through, 10, 0)
+	require.Equal(t, chunk.ID(91).FirstLedger(), gate.Floor())
+
+	cases := []struct {
+		name  string
+		seq   uint32
+		admit bool
+	}{
+		{"one below the floor => not-found", gate.Floor() - 1, false},
+		{"exactly the floor => admitted", gate.Floor(), true},
+		{"floor chunk's last ledger => admitted", chunk.ID(91).LastLedger(), true},
+		{"well above the floor => admitted", chunk.ID(100).FirstLedger(), true},
+		{"genesis (far below) => not-found", chunk.FirstLedgerSeq, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			assert.Equal(t, c.admit, gate.Admits(c.seq))
+			// seqWithinRetention must give the same answer as the gate (one definition).
+			assert.Equal(t, c.admit, seqWithinRetention(c.seq, through, 10, 0))
+		})
+	}
+}
+
+// Shortening retention lifts the gate's floor at once; there is no per-chunk
+// state to migrate. For the SAME (through, earliest), a smaller retentionChunks
+// gives a higher floor, so a seq admitted before turns not-found immediately.
+func TestRetentionGate_ShorteningRaisesFloorImmediately(t *testing.T) {
+	through := chunk.ID(100).LastLedger()
+
+	wideGate := NewRetentionGate(through, 50, 0)   // floor = chunk 51
+	narrowGate := NewRetentionGate(through, 10, 0) // floor = chunk 91
+	require.Equal(t, chunk.ID(51).FirstLedger(), wideGate.Floor())
+	require.Equal(t, chunk.ID(91).FirstLedger(), narrowGate.Floor())
+
+	// Chunk 60's first ledger lies inside the wide window yet below the narrow floor.
+	probeSeq := chunk.ID(60).FirstLedger()
+	assert.True(t, wideGate.Admits(probeSeq), "in range under the wide retention")
+	assert.False(t, narrowGate.Admits(probeSeq), "shortening retention makes it not-found at once")
+}
+
+// ChunkBelowFloor reports true for a chunk wholly below the floor and false for
+// the floor chunk itself.
+func TestRetentionGate_ChunkBelowFloor(t *testing.T) {
+	// through at chunk 11's last ledger with 4 chunks retained puts the floor at
+	// chunk 8's first ledger (11-4+1 = 8).
+	lastComplete := chunk.ID(11).LastLedger()
+	gate := NewRetentionGate(lastComplete, 4, 0)
+	require.Equal(t, chunk.ID(8).FirstLedger(), gate.Floor())
+
+	// 7 is the last chunk below the floor; 8 is the floor chunk.
+	assert.True(t, gate.ChunkBelowFloor(7))
+	assert.False(t, gate.ChunkBelowFloor(8))
+}
+
+// ---------------------------------------------------------------------------
+// Scenario: a chunk STRADDLING the floor serves in-range seqs and not-found
+// below. The reader gate makes below-floor reads not-found regardless of what
+// is on disk, while the in-range tail still serves. Only chunks WHOLLY below the
+// floor are swept by the prune scan; a straddling chunk's frozen ledger artifact
+// survives.
+// ---------------------------------------------------------------------------
+
+func TestReaderRetention_StraddlingFloorServesInRangeNotBelow(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Chunks 0..3 each get a frozen ledgers key and an artifact at the ledger pack
+	// path, as written back when the floor sat at genesis.
+	for _, c := range []chunk.ID{0, 1, 2, 3} {
+		freezeKinds(t, cat, c, KindLedgers)
+		writeArtifact(t, cat.layout.LedgerPackPath(c))
+	}
+
+	// Later the floor rose to chunk 2's first ledger, leaving chunks 0,1 below it
+	// and chunks 2,3 in range.
+	through := chunk.ID(3).LastLedger()
+	// retentionChunks is chosen so the sliding floor lands on chunk 2:
+	// lastCompleteChunkAt(through)=3, floor chunk = 3-retention+1 = 2 ⇒ retention=2.
+	gate := NewRetentionGate(through, 2, 0)
+	require.Equal(t, chunk.ID(2).FirstLedger(), gate.Floor(), "the floor lands at chunk 2")
+
+	// Seqs in chunks 2 and 3 are admitted; seqs in chunks 0 and 1 are not-found
+	// even though their files are still on disk at this point.
+	assert.True(t, gate.Admits(chunk.ID(2).FirstLedger()), "floor chunk: in range")
+	assert.True(t, gate.Admits(chunk.ID(3).LastLedger()), "above the floor: in range")
+	assert.False(t, gate.Admits(chunk.ID(1).LastLedger()), "below the floor: not-found")
+	assert.False(t, gate.Admits(chunk.ID(0).FirstLedger()), "below the floor: not-found")
+
+	// The prune scan removes only the WHOLLY-below-floor chunks 0,1; chunks 2,3
+	// survive — exactly the data the gate admits.
+	cfg, _ := lifecycleTestConfig(t, cat, 2)
+	pruneOps, err := eligiblePruneOps(cfg, cat, through)
+	require.NoError(t, err)
+	for _, prune := range pruneOps {
+		require.NoError(t, prune())
+	}
+
+	for _, c := range []chunk.ID{0, 1} {
+		state, stateErr := cat.State(c, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, State(""), state, "below-floor chunk %s pruned", c)
+	}
+	for _, c := range []chunk.ID{2, 3} {
+		state, stateErr := cat.State(c, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, StateFrozen, state, "in-range chunk %s survives", c)
+	}
+	assertQuiescent(t, cfg, cat, through)
+}
+
+// ---------------------------------------------------------------------------
+// Scenario: retention SHORTENING prunes the newly-out-of-range chunks
+// immediately. The prune scan reads the floor live from (through,
+// RetentionChunks), so a smaller RetentionChunks raises the floor and the next
+// tick sweeps the chunks that just fell past it — keys and files alike.
+// ---------------------------------------------------------------------------
+
+func TestReaderRetention_ShorteningPrunesNewlyOutOfRangeChunks(t *testing.T) {
+	cat, _ := testCatalog(t)
+
+	// Chunks 0..5 each get a frozen ledgers key with a real .pack on disk; chunk 6
+	// is the live chunk (positional ⇒ through = chunk 5's last).
+	for _, c := range []chunk.ID{0, 1, 2, 3, 4, 5} {
+		freezeKinds(t, cat, c, KindLedgers)
+		writeArtifact(t, cat.layout.LedgerPackPath(c))
+	}
+	liveDB := openLiveHotDB(t, cat, 6)
+	t.Cleanup(func() { _ = liveDB.Close() })
+
+	through, err := deriveCompleteThrough(cat)
+	require.NoError(t, err)
+	require.Equal(t, chunk.ID(5).LastLedger(), through)
+
+	// Pre-shortening reference point: with 5 chunks retained the floor is chunk
+	// 1's first ledger, so only chunk 0 would be past it.
+	require.Equal(t, chunk.ID(1).FirstLedger(),
+		effectiveRetentionFloor(through, 5, 0), "the wide-retention floor is chunk 1")
+
+	// SHORTEN retention to 2 chunks: the floor becomes chunk 4's first ledger, so
+	// chunks 0..3 are past retention and the next tick must sweep them.
+	cfg, rec := lifecycleTestConfig(t, cat, 2)
+	require.Equal(t, chunk.ID(4).FirstLedger(),
+		effectiveRetentionFloor(through, 2, 0), "shortening raised the floor to chunk 4")
+
+	runTickForCatalog(context.Background(), t, cfg, cat)
+	require.False(t, rec.fired(), "a shortening prune tick never aborts: %v", rec.last.Load())
+
+	// Newly out of range, chunks 0..3 are gone — keys and files.
+	for _, c := range []chunk.ID{0, 1, 2, 3} {
+		state, stateErr := cat.State(c, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, State(""), state, "chunk %s key swept by the shortened floor", c)
+		assert.NoFileExists(t, cat.layout.LedgerPackPath(c), "chunk %s pack swept", c)
+	}
+	// Chunks 4,5 make up the new retention window and survive.
+	for _, c := range []chunk.ID{4, 5} {
+		state, stateErr := cat.State(c, KindLedgers)
+		require.NoError(t, stateErr)
+		assert.Equal(t, StateFrozen, state, "chunk %s within the shortened retention survives", c)
+		assert.FileExists(t, cat.layout.LedgerPackPath(c))
+	}
+
+	assertQuiescent(t, cfg, cat, through)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/startup.go b/cmd/stellar-rpc/internal/fullhistory/streaming/startup.go
new file mode 100644
index 000000000..d6456f14d
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/startup.go
@@ -0,0 +1,450 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// startStreaming is the daemon's startup orchestration — the design's "Daemon
+// flow -> Startup", in two steps:
+//
+//  1. CATCH UP via backfill. Bring on-disk coverage in line with the retention
+//     window: each pass backfills up through the last complete chunk at the
+//     network tip, re-passing while new chunks appear at the tip, with one
+//     exclusion — a mid-chunk watermark within one chunk of the tip leaves the
+//     partial resume chunk to ingestion (core replays its tail faster than a
+//     bulk refetch, and a mid-chunk watermark can only have come from the live
+//     hot DB, so the data is local by construction). runBackfill is the SAME
+//     resolve + executePlan the lifecycle tick uses (Phase B); there is no
+//     upfront producibility gate — each chunk's producibility is enforced
+//     lazily during its build by the cold ingest.
+//
+//  2. SERVE + INGEST. Open the resume chunk's hot DB (Issue 10), start captive
+//     core (injected), launch the lifecycle goroutine (Issue 11) on a doorbell,
+//     start serving reads (injected), and run the ingestion loop (Issue 10).
+//     The ingestion loop's first act is a doorbell ring, so the first lifecycle
+//     tick doubles as startup convergence (finishing crash leftovers + pruning
+//     downtime leftovers concurrently with early serving).
+//
+// EVERYTHING the daemon needs that startup cannot construct itself crosses an
+// INJECTED interface (StartConfig.NetworkTip, .Core, .ServeReads), so this is
+// unit-testable without captive core, a real bulk backend, or a real RPC
+// server. validateConfig (the full TOML form) is Phase D; this accepts an
+// already-resolved StartConfig and the pinned earliest_ledger is read from the
+// catalog.
+//
+// It returns nil only when the ingestion loop returns nil. A ctx cancellation
+// during catch-up comes back as the ctx error (classified as a clean shutdown
+// by the daemon top level); any other return is a restartable error, e.g.
+// ErrFirstStartNoTip, a backfill/ingest failure, or ErrHotVolumeLost.
+func startStreaming(ctx context.Context, cfg StartConfig) error {
+	if err := cfg.validate(); err != nil {
+		return err
+	}
+	cfg = cfg.withDefaults()
+	cat := cfg.Exec.Catalog
+	logger := cfg.Exec.Logger
+
+	// earliest_ledger is pinned by validateConfig BEFORE startStreaming runs (the
+	// design's flow; the full TOML form is Phase D). It must be present here: the
+	// loop's first-start predicate is `lastCommitted < earliest`, which only
+	// classifies correctly when earliest is the real pinned floor (e.g. genesis
+	// pins earliest=2, the watermark sentinel preGenesisLedger=1 sits below it).
+	// An absent pin would read as 0 and mis-classify a genuine first start as a
+	// degrade-and-serve restart, so refuse it loudly rather than silently.
+	earliest, pinned, err := cat.EarliestLedger()
+	if err != nil {
+		return fmt.Errorf("streaming: startup read earliest ledger: %w", err)
+	}
+	if !pinned {
+		return errors.New("streaming: startup requires config:earliest_ledger pinned " +
+			"(validateConfig pins it before startStreaming; not done here)")
+	}
+
+	// Derived, never stored: the highest ledger durably committed (frozen cold
+	// artifacts vs the highest ready hot DB's max committed seq, clamped by
+	// earliest-1). With a probe it does ONE read of the highest ready hot DB and
+	// detects hot-volume loss LAZILY on that open (ErrHotVolumeLost) before
+	// ingestion ever opens a writer.
+	lastCommitted, err := lastCommittedLedger(cat, cfg.Exec.Process.HotProbe)
+	if err != nil {
+		return fmt.Errorf("streaming: startup derive watermark: %w", err)
+	}
+
+	metrics := cfg.Exec.metrics()
+	metrics.Watermark(lastCommitted, effectiveRetentionFloor(lastCommitted, cfg.Lifecycle.RetentionChunks, earliest))
+	logger.WithField("last_committed", lastCommitted).
+		WithField("earliest", earliest).
+		WithField("pinned", pinned).
+		Info("streaming: startup — watermark derived, beginning catch-up")
+
+	// Step 1: catch up via backfill.
+	lastCommitted, err = catchUp(ctx, cfg, lastCommitted, earliest)
+	if err != nil {
+		return err
+	}
+
+	logger.WithField("last_committed", lastCommitted).
+		WithField("resume_chunk", chunk.IDFromLedger(lastCommitted+1).String()).
+		Info("streaming: catch-up complete — opening resume hot tier and ingesting")
+
+	// Step 2: serve + ingest. resumeLedger is one past the watermark — the live
+	// chunk's next un-committed ledger (or the chunk's first ledger on an empty
+	// resume DB; runIngestionLoop re-derives the exact resume point from durable
+	// state, so a lastCommitted that lands mid-chunk and a lastCommitted on a
+	// chunk boundary both resume correctly).
+	resumeLedger := lastCommitted + 1
+	resumeChunk := chunk.IDFromLedger(resumeLedger)
+
+	hotDB, err := openHotTierForChunk(cat, resumeChunk, logger)
+	if err != nil {
+		return fmt.Errorf("streaming: startup open resume hot tier chunk %s: %w", resumeChunk, err)
+	}
+
+	// Start captive core from the resume ledger. On failure the resume hot DB is
+	// already open; close it so a restart re-opens cleanly (the bracket is
+	// idempotent, but the rocksdb LOCK must be released).
+	core, closeCore, err := cfg.Core.OpenCore(ctx, resumeLedger)
+	if err != nil {
+		_ = hotDB.Close()
+		return fmt.Errorf("streaming: startup start captive core at ledger %d: %w", resumeLedger, err)
+	}
+	defer func() {
+		if closeCore != nil {
+			_ = closeCore()
+		}
+	}()
+
+	// The lifecycle goroutine runs one tick per notification, carrying the just-
+	// completed chunk id. Buffered to lifecycleQueueDepth; the ingestion loop
+	// sends at every chunk boundary. It shares NO in-memory state with ingestion —
+	// it derives everything from durable keys.
+	lifecycleCh := make(chan chunk.ID, lifecycleQueueDepth)
+
+	// Seed the first tick with the last complete chunk at the resume point so its
+	// run fires at once — clearing crash/downtime leftovers concurrently with
+	// serving (the design's startup seed: lastCompleteChunkAt(resumeLedger - 1)).
+	// Skipped on a young network where no chunk is complete (nothing to converge;
+	// the first real boundary triggers the first tick).
+	if seed := lastCompleteChunkAt(lastCommitted); seed >= 0 {
+		lifecycleCh <- chunk.ID(seed) //nolint:gosec // seed >= 0
+	}
+
+	// The lifecycle goroutine is tied to a PER-ITERATION child ctx, not the
+	// daemon-lifetime ctx, and is canceled + JOINED before startStreaming returns
+	// for ANY reason. This restores the design's single-lifecycle-goroutine
+	// invariant: startStreaming returns on a restartable error (a captive-core /
+	// GetLedger hiccup, a boundary hot-DB open failure) and superviseStreaming
+	// restarts it with the SAME live daemon ctx after a backoff — so if the
+	// lifecycle were tied to the daemon ctx, the prior iteration's loop would never
+	// be canceled and would leak (blocked forever on the old channel) or, worse,
+	// run a tick CONCURRENTLY with the next iteration's lifecycle + ingestion (two
+	// RunColdChunk passes truncating the same .pack/.idx; a stale tick's op error
+	// firing Fatalf). runLifecycleTick checks ctx at every step and executePlan
+	// returns on cancellation, so the join cannot block past the current step.
+	lifecycleCtx, cancelLifecycle := context.WithCancel(ctx)
+	var lifecycleWG sync.WaitGroup
+	lifecycleWG.Go(func() {
+		lifecycleLoop(lifecycleCtx, cfg.Lifecycle, cat, lifecycleCh)
+	})
+	// Cancel + join the lifecycle goroutine. This defer runs only on the two return
+	// paths registered after it: the ingestion-loop return (ingestion is a
+	// synchronous same-goroutine call whose inline notify is the sole writer to
+	// lifecycleCh, so it has already stopped) and the ServeReads error path
+	// (ingestion never started). Either way no send on lifecycleCh can race the
+	// cancel. The earlier error paths (resume hot-DB open, OpenCore) return BEFORE
+	// this defer is registered and before the goroutine starts — nothing to join.
+	defer func() {
+		cancelLifecycle()
+		lifecycleWG.Wait()
+	}()
+
+	// Begin serving reads (injected). Serve-readiness is established by step 1
+	// plus the resume chunk's hot DB just opened — crash debris and downtime
+	// leftovers are reader-invisible, so the first tick clears them concurrently
+	// with serving rather than ahead of it.
+	if err := cfg.ServeReads(ctx); err != nil {
+		_ = hotDB.Close()
+		return fmt.Errorf("streaming: startup serve reads: %w", err)
+	}
+
+	// The ingestion loop owns hotDB for the rest of its life (it closes it on any
+	// exit and reopens at each boundary). Returns the GetLedger/boundary error;
+	// the daemon top level classifies a ctx-canceled return as a clean shutdown.
+	return runIngestionLoop(ctx, core, hotDB, cat, lifecycleCh, allHotTypes, logger, metrics)
+}
+
+// catchUp runs the design's catch-up loop, returning lastCommitted as advanced
+// by backfill. Each pass samples networkTip: a canceled ctx is returned bare; a
+// backend error is FATAL (ErrFirstStartNoTip) with no local history to serve,
+// else degrades to lastCommitted. It anchors on max(tip, lastCommitted) to guard
+// a lagging bulk tip, computes [rangeStart, rangeEnd] with the mid-chunk resume
+// exclusion, and breaks on an empty/already-done range.
+//
+// backfilledThrough guards against infinite re-passes when the tip stops moving:
+// a rangeEnd that does not advance past the previous pass breaks the loop.
+func catchUp(ctx context.Context, cfg StartConfig, lastCommitted, earliest uint32) (uint32, error) {
+	retentionChunks := cfg.Lifecycle.RetentionChunks
+	metrics := cfg.Exec.metrics()
+	logger := cfg.Exec.Logger
+
+	backfilledThrough := int64(-1)
+	for {
+		if err := ctx.Err(); err != nil {
+			return 0, err
+		}
+
+		tip, err := networkTip(ctx, cfg.NetworkTip, cfg.TipBackoff, cfg.TipMaxAttempts)
+		if err != nil {
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				// Shutdown, not an outage: never first-start-fatal, never degrade.
+				return 0, ctxErr
+			}
+			if lastCommitted < earliest {
+				// True first start (no committed progress) with no reachable backend:
+				// nothing to catch up from and no local history to serve. Returned as a
+				// sentinel (not a process exit) so the daemon's top-level loop decides.
+				return 0, fmt.Errorf("%w: %w", ErrFirstStartNoTip, err)
+			}
+			// Restart with local progress: the window below lastCommitted is
+			// complete (catch-up-before-advance), so serve what is materialized and
+			// skip catch-up this pass (a later pass with a reachable backend resumes).
+			tip = lastCommitted
+		}
+
+		// max() guards a lagging bulk tip in BOTH uses below: anchored on the tip
+		// alone, the floor would regress below where pruning advanced, and a
+		// complete watermark chunk could fall outside the range. When the tip leads
+		// (long downtime) it is the correct anchor.
+		anchor := maxU32(tip, lastCommitted)
+		rangeStart := chunk.IDFromLedger(effectiveRetentionFloor(anchor, retentionChunks, earliest))
+
+		// rangeEnd anchored on the same max() so a complete watermark chunk above a
+		// lagging bulk tip still folds into its window's index before serving. The
+		// span beyond the bulk tip is only durable chunks (production self-skips) or
+		// complete-in-hot-DB chunks (backfillSource's hot branch) — the bulk backend
+		// is never asked for them.
+		rangeEndSigned := lastCompleteChunkAt(anchor)
+
+		// Mid-chunk resume exclusion: a mid-chunk watermark within one chunk of the
+		// tip leaves the partial resume chunk to ingestion. watermarkMidChunk is
+		// computed in the SIGNED domain so the genesis sentinel (lastCommitted =
+		// earliest-1, chunk-aligned by construction) reads as a boundary, never
+		// spuriously mid-chunk.
+		if withinOneChunkOfTip(tip, lastCommitted) && watermarkMidChunk(lastCommitted) {
+			// rangeEnd = chunkID(lastCommitted) - 1: stop one short of the live chunk.
+			rangeEndSigned = chunkIDOfLedger(lastCommitted) - 1
+		}
+
+		// Lag/progress gauges each pass: the live tip-vs-watermark gap and where
+		// catch-up has reached vs its target (the tip-anchored upper bound).
+		metrics.IngestionLag(tip, lastCommitted)
+		metrics.CatchupProgress(lastCommitted, anchor)
+
+		// Break on an empty range (rangeEnd < rangeStart — a young network, or the
+		// exclusion left nothing) or a non-advancing one (rangeEnd <=
+		// backfilledThrough — the tip stopped moving).
+		if rangeEndSigned < int64(rangeStart) || rangeEndSigned <= backfilledThrough {
+			break
+		}
+		rangeEnd := chunk.ID(rangeEndSigned) //nolint:gosec // >= rangeStart >= 0
+
+		logger.WithField("range_lo", rangeStart.String()).
+			WithField("range_hi", rangeEnd.String()).
+			WithField("tip", tip).
+			WithField("last_committed", lastCommitted).
+			Info("streaming: catch-up pass starting")
+
+		passStart := time.Now()
+		if err := runBackfill(ctx, cfg.Exec, rangeStart, rangeEnd); err != nil {
+			return 0, fmt.Errorf("streaming: startup backfill [%s,%s]: %w", rangeStart, rangeEnd, err)
+		}
+		passDuration := time.Since(passStart)
+
+		// Advance the mutating watermark to the last ledger of the backfilled range
+		// (never regress — a lagging tip's rangeEnd can sit below lastCommitted).
+		lastCommitted = maxU32(lastCommitted, rangeEnd.LastLedger())
+		backfilledThrough = rangeEndSigned
+
+		metrics.CatchupPass(uint32(rangeStart), uint32(rangeEnd), passDuration)
+		metrics.CatchupProgress(lastCommitted, anchor)
+		logger.WithField("range_lo", rangeStart.String()).
+			WithField("range_hi", rangeEnd.String()).
+			WithField("last_committed", lastCommitted).
+			WithField("duration", passDuration.String()).
+			Info("streaming: catch-up pass complete")
+	}
+	return lastCommitted, nil
+}
+
+// withinOneChunkOfTip reports whether the tip leads the watermark by less than
+// one chunk. The comparison is done in int64, so a lagging bulk tip BELOW the
+// resume point (tip < lastCommitted) cannot wrap around and always reads true —
+// such a watermark is certainly the live (near-tip) chunk's, the exclusion's intent.
+func withinOneChunkOfTip(tip, lastCommitted uint32) bool {
+	return int64(tip) < int64(lastCommitted)+int64(chunk.LedgersPerChunk)
+}
+
+// watermarkMidChunk reports whether lastCommitted lies strictly inside its
+// chunk, i.e. is NOT that chunk's last ledger. The genesis sentinel
+// (preGenesisLedger) maps via chunkIDOfLedger to chunk -1, whose "last ledger"
+// is preGenesisLedger itself, so the sentinel reads as a boundary — never
+// spuriously mid-chunk.
+func watermarkMidChunk(lastCommitted uint32) bool {
+	return completeThrough(chunkIDOfLedger(lastCommitted)) != lastCommitted
+}
+
+// maxU32 names the built-in max so the catch-up call sites read self-documenting.
+func maxU32(a, b uint32) uint32 {
+	return max(b, a)
+}
+
+// ErrFirstStartNoTip is the first-start FATAL: no committed local progress AND
+// no reachable network tip, so the daemon can neither catch up nor serve a local
+// history. catchUp returns it as a sentinel wrapped together with the tip error
+// (match it with errors.Is), not as a process exit, so the daemon's top-level
+// loop owns the fatal-and-surface decision and tests can assert it.
+var ErrFirstStartNoTip = errors.New("streaming: network tip unavailable and no local history to serve")
+
+// ---------------------------------------------------------------------------
+// Injected external boundaries. startStreaming touches NOTHING outside the
+// process directly: the network tip, captive core, and the read server all
+// cross an interface so startup is exercised end to end with fakes.
+// ---------------------------------------------------------------------------
+
+// NetworkTipBackend samples the configured bulk backend's current network tip
+// (the highest ledger the backend can serve). Production wraps the daemon's
+// LedgerBackend; tests pass a fake that is reachable / unreachable / unready.
+// networkTip retries a returned error and treats a tip below
+// chunk.FirstLedgerSeq as "not ready"; startup consults it only in catch-up.
+type NetworkTipBackend interface {
+	NetworkTip(ctx context.Context) (uint32, error)
+}
+
+// CoreOpener prepares captive core at resumeLedger and hands back a LedgerGetter
+// the ingestion loop polls plus a closer the caller defers to tear the backend
+// down (startStreaming skips a nil closer). Production wraps captive core's
+// PrepareRange + GetLedger; tests pass a fake getter.
+type CoreOpener interface {
+	OpenCore(ctx context.Context, resumeLedger uint32) (LedgerGetter, func() error, error)
+}
+
+// StartConfig is startStreaming's resolved dependency bundle. It composes the
+// scheduler/lifecycle configs (so catch-up and the lifecycle goroutine share one
+// catalog, worker pool, and retention floor) and the three injected external
+// boundaries, plus the networkTip backoff bounds. The full daemon Config
+// (TOML-parsed paths, captive-core toml, …) is a superset assembled at the call
+// site; only what startup reads lives here.
+type StartConfig struct {
+	// Exec drives catch-up's runBackfill (resolve + executePlan). Its Catalog,
+	// Logger and Process.HotProbe are shared by the whole startup; validate requires all three.
+	Exec ExecConfig
+
+	// Lifecycle drives the lifecycle goroutine. Its embedded ExecConfig should be
+	// the SAME wiring as Exec (one catalog, one pool); RetentionChunks is the
+	// catch-up floor's width too.
+	Lifecycle LifecycleConfig
+
+	// NetworkTip samples the bulk backend's tip during catch-up. Required.
+	NetworkTip NetworkTipBackend
+
+	// Core starts captive core and yields the ingestion getter. Required.
+	Core CoreOpener
+
+	// ServeReads begins serving reads (the RPC server). It must return promptly
+	// (it launches the server; it does not block until shutdown) — startup
+	// proceeds to the blocking ingestion loop after it returns. Required.
+	ServeReads func(ctx context.Context) error
+
+	// TipBackoff is networkTip's inter-attempt sleep; TipMaxAttempts bounds the
+	// retries against a transiently-unavailable backend before networkTip returns
+	// an error (which catch-up then classifies first-start-fatal vs degrade). Zero
+	// or negative values fall back to defaults in withDefaults.
+	TipBackoff     time.Duration
+	TipMaxAttempts int
+}
+
+const (
+	defaultTipBackoff     = time.Second // used by withDefaults when TipBackoff <= 0
+	defaultTipMaxAttempts = 5           // used by withDefaults when TipMaxAttempts <= 0
+)
+
+// withDefaults returns a copy with the embedded ExecConfig and LifecycleConfig
+// defaults applied and non-positive tip-backoff bounds replaced by the package
+// defaults, so a caller need not fill them in.
+func (cfg StartConfig) withDefaults() StartConfig {
+	if cfg.TipMaxAttempts <= 0 {
+		cfg.TipMaxAttempts = defaultTipMaxAttempts
+	}
+	if cfg.TipBackoff <= 0 {
+		cfg.TipBackoff = defaultTipBackoff
+	}
+	cfg.Lifecycle = cfg.Lifecycle.WithLifecycleDefaults()
+	cfg.Exec = cfg.Exec.WithDefaults()
+	return cfg
+}
+
+// validate reports the first required dependency that is missing: the catalog,
+// the logger, the hot probe the watermark derivation reads, and the three
+// injected boundaries (NetworkTip, Core, ServeReads). It checks presence only;
+// the tip-backoff bounds are not validated here.
+func (cfg StartConfig) validate() error {
+	switch {
+	case cfg.Exec.Catalog == nil:
+		return errors.New("streaming: StartConfig.Exec.Catalog is nil")
+	case cfg.Exec.Logger == nil:
+		return errors.New("streaming: StartConfig.Exec.Logger is nil")
+	case cfg.Exec.Process.HotProbe == nil:
+		return errors.New("streaming: StartConfig.Exec.Process.HotProbe is nil (watermark derivation needs it)")
+	case cfg.NetworkTip == nil:
+		return errors.New("streaming: StartConfig.NetworkTip is nil")
+	case cfg.Core == nil:
+		return errors.New("streaming: StartConfig.Core is nil")
+	case cfg.ServeReads == nil:
+		return errors.New("streaming: StartConfig.ServeReads is nil")
+	}
+	return nil
+}
+
+// networkTip samples backend.NetworkTip with up to maxAttempts tries, sleeping
+// backoff between them. A tip below genesis is rejected at once as "not ready"
+// (an empty / not-yet-synced backend) so it never reaches the chunk arithmetic
+// where it would pin a garbage floor. A canceled ctx aborts the wait and the
+// retry loop with the bare ctx error; other failures go to catch-up to classify.
+func networkTip(
+	ctx context.Context, backend NetworkTipBackend, backoff time.Duration, maxAttempts int,
+) (uint32, error) {
+	var lastErr error
+	for attempt := range maxAttempts {
+		if attempt > 0 {
+			timer := time.NewTimer(backoff)
+			select {
+			case <-ctx.Done():
+				timer.Stop()
+				return 0, ctx.Err()
+			case <-timer.C:
+			}
+		}
+		tip, err := backend.NetworkTip(ctx)
+		if err != nil {
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				return 0, ctxErr
+			}
+			lastErr = err
+			continue
+		}
+		if tip < chunk.FirstLedgerSeq {
+			// Below genesis the backend is empty or not yet synced. Not retried — a
+			// synced-from-empty backend would just keep returning 0.
+			return 0, fmt.Errorf("streaming: backend tip %d is below genesis %d — backend not ready",
+				tip, chunk.FirstLedgerSeq)
+		}
+		return tip, nil
+	}
+	return 0, fmt.Errorf("streaming: network tip unavailable after %d attempts: %w", maxAttempts, lastErr)
+}
diff --git a/cmd/stellar-rpc/internal/fullhistory/streaming/startup_test.go b/cmd/stellar-rpc/internal/fullhistory/streaming/startup_test.go
new file mode 100644
index 000000000..e936f63be
--- /dev/null
+++ b/cmd/stellar-rpc/internal/fullhistory/streaming/startup_test.go
@@ -0,0 +1,597 @@
+package streaming
+
+import (
+	"context"
+	"errors"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/pkg/chunk"
+)
+
+// ---------------------------------------------------------------------------
+// Injected-boundary fakes.
+// ---------------------------------------------------------------------------
+
+// fakeTipBackend is a NetworkTipBackend whose result is programmable per call:
+// call i returns tips[i], and every call past the end repeats the last tip.
+// When err is set, the first errFirst calls return that error instead —
+// modeling a backend that is transiently down then comes online (errFirst
+// large ⇒ always down). Failed calls still advance the index into tips. With
+// no tips programmed, a call outside the error window fails.
+type fakeTipBackend struct {
+	mu       sync.Mutex
+	tips     []uint32
+	calls    int
+	err      error
+	errFirst int // return err for the first errFirst calls, then the tip
+}
+
+// NetworkTip answers the next programmed call under the mutex, counting every
+// invocation (failed ones included).
+func (b *fakeTipBackend) NetworkTip(context.Context) (uint32, error) {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	call := b.calls
+	b.calls = call + 1
+	switch {
+	case b.err != nil && call < b.errFirst:
+		return 0, b.err
+	case len(b.tips) == 0:
+		return 0, errors.New("fakeTipBackend: no tips programmed")
+	}
+	return b.tips[min(call, len(b.tips)-1)], nil
+}
+
+// callCount reports how many times NetworkTip has been called so far.
+func (b *fakeTipBackend) callCount() int {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+	return b.calls
+}
+
+// fakeCore is a CoreOpener that hands back a programmed LedgerGetter (or
+// openErr) and records how often, and from which resume ledger, it was opened.
+type fakeCore struct {
+	getter      LedgerGetter
+	openErr     error
+	resumeSeen  atomic.Uint32
+	openedCount atomic.Int32
+}
+
+// OpenCore records the call, then fails with openErr when one is programmed.
+func (c *fakeCore) OpenCore(_ context.Context, resumeLedger uint32) (LedgerGetter, func() error, error) {
+	c.openedCount.Add(1)
+	c.resumeSeen.Store(resumeLedger)
+	if err := c.openErr; err != nil {
+		return nil, nil, err
+	}
+	noopClose := func() error { return nil }
+	if c.getter != nil {
+		return c.getter, noopClose, nil
+	}
+	// Default: a live getter that blocks until ctx is canceled (the daemon's steady state).
+	return &fakeLedgerGetter{frames: map[uint32][]byte{}, blockOnCtx: true}, noopClose, nil
+}
+
+// recordingPlan captures the (rangeStart, rangeEnd) every backfill pass asked
+// for, via the ExecConfig runChunk test seam — so a backfill test asserts the
+// loop's range arithmetic without real cold I/O. Because resolve emits per-chunk
+// builds, the lowest/highest chunk a pass touched bracket the requested range.
+type recordingPlan struct {
+	mu     sync.Mutex
+	passes [][2]chunk.ID // {minChunk, maxChunk} per pass
+	cur    *[2]chunk.ID
+}
+
+// note folds a ChunkBuild's chunk into the current pass, widening its
+// {min, max} bracket. The pass is opened lazily: the first chunk seen after
+// the previous pass closed (or ever) starts a new bracket at {c, c}. The
+// runChunk seam is what feeds chunks in here.
+func (r *recordingPlan) note(c chunk.ID) {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.cur == nil {
+		r.cur = &[2]chunk.ID{c, c}
+		return
+	}
+	r.cur[0] = min(r.cur[0], c)
+	r.cur[1] = max(r.cur[1], c)
+}
+
+// endPass closes the open pass, if any, appending its bracket to passes.
+func (r *recordingPlan) endPass() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	if r.cur == nil {
+		return
+	}
+	r.passes = append(r.passes, *r.cur)
+	r.cur = nil
+}
+
+// snapshot returns a copy of the closed passes, detached from the recorder.
+func (r *recordingPlan) snapshot() [][2]chunk.ID {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	dup := make([][2]chunk.ID, len(r.passes))
+	copy(dup, r.passes)
+	return dup
+}
+
+// startTestConfig builds a StartConfig over a real catalog with all external
+// boundaries faked. It does NOT pin the earliest ledger — callers do that (see
+// pinGenesis). recordPlan, when non-nil, wires the runChunk seam so backfill
+// passes are recorded without cold I/O.
+func startTestConfig(
+	t *testing.T, cat *Catalog, tip *fakeTipBackend, core *fakeCore, recordPlan *recordingPlan,
+) StartConfig {
+	t.Helper()
+	exec := ExecConfig{
+		Catalog: cat,
+		Logger:  silentLogger(),
+		Workers: 2,
+		Process: ProcessConfig{
+			HotProbe: NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger()),
+			Backend:  zeroTxBackend(t),
+		},
+	}
+	if recordPlan != nil {
+		exec.runChunk = func(_ context.Context, cb ChunkBuild, _ ExecConfig) error {
+			recordPlan.note(cb.Chunk)
+			return nil
+		}
+	}
+	life := LifecycleConfig{ExecConfig: exec, RetentionChunks: 0, Fatalf: (&fatalRecorder{}).fatalf}
+	return StartConfig{
+		Exec:           exec,
+		Lifecycle:      life,
+		NetworkTip:     tip,
+		Core:           core,
+		ServeReads:     func(context.Context) error { return nil },
+		TipBackoff:     time.Millisecond,
+		TipMaxAttempts: 3,
+	}
+}
+
+// pinGenesis pins config:earliest_ledger to chunk.FirstLedgerSeq (what
+// validateConfig does for a "genesis" floor), so startup's first-start
+// predicate classifies correctly.
+func pinGenesis(t *testing.T, cat *Catalog) {
+	t.Helper()
+	require.NoError(t, cat.PutEarliestLedger(chunk.FirstLedgerSeq))
+}
+
+// ---------------------------------------------------------------------------
+// networkTip — backoff, sub-genesis rejection, exhausted retries.
+// ---------------------------------------------------------------------------
+
+func TestNetworkTip_RejectsSubGenesisAsNotReady(t *testing.T) {
+	tip, err := networkTip(context.Background(),
+		&fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq - 1}}, time.Millisecond, 3)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "not ready") // the sub-genesis branch's message
+	require.Zero(t, tip)                          // the rejected tip is not handed back
+}
+
+func TestNetworkTip_RetriesThenSucceeds(t *testing.T) {
+	b := &fakeTipBackend{tips: []uint32{50_000}, err: errors.New("object store down"), errFirst: 2}
+	tip, err := networkTip(context.Background(), b, time.Millisecond, 5)
+	require.NoError(t, err) // 5 attempts allowed, only 3 needed
+	require.Equal(t, uint32(50_000), tip)
+	require.Equal(t, 3, b.callCount(), "two failures then a success")
+}
+
+func TestNetworkTip_ExhaustedRetriesErrors(t *testing.T) {
+	b := &fakeTipBackend{err: errors.New("object store down"), errFirst: 99}
+	_, err := networkTip(context.Background(), b, time.Millisecond, 4)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "after 4 attempts")
+	require.Equal(t, 4, b.callCount()) // one backend call per attempt, no extras
+}
+
+func TestNetworkTip_CtxCancelAbortsWait(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel() // canceled up front: the hour-long backoff below must never be slept
+	b := &fakeTipBackend{err: errors.New("down"), errFirst: 99}
+	_, err := networkTip(ctx, b, time.Hour, 5)
+	require.ErrorIs(t, err, context.Canceled)
+}
+
+// ---------------------------------------------------------------------------
+// catchUp — the backfill loop edge cases (the heart of Issue 12).
+// ---------------------------------------------------------------------------
+
+// First start (genesis, no local history) with the tip ABSENT is FATAL: the
+// daemon can neither catch up nor serve a local history.
+func TestBackfill_FirstStartTipAbsentFatal(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	tip := &fakeTipBackend{err: errors.New("backend unreachable"), errFirst: 99}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, &recordingPlan{})
+
+	// lastCommitted is passed as the pre-genesis sentinel preGenesisLedger (1);
+	// earliest = chunk.FirstLedgerSeq (2); 1 < 2 ⇒ first start with no progress.
+	_, err := catchUp(context.Background(), cfg, preGenesisLedger, chunk.FirstLedgerSeq)
+	require.Error(t, err)
+	require.ErrorIs(t, err, ErrFirstStartNoTip)
+}
+
+// First start (genesis) with the tip PRESENT a few chunks up: the range is
+// computed [chunk 0, lastCompleteChunkAt(tip)] and backfill runs over it.
+func TestBackfill_FirstStartTipPresentComputesRange(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// Tip in the middle of chunk 3 ⇒ last complete chunk is 2.
+	tipLedger := chunk.ID(3).FirstLedger() + 100
+	rec := &recordingPlan{}
+	tip := &fakeTipBackend{tips: []uint32{tipLedger}}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, preGenesisLedger, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	rec.endPass() // close the still-open pass so snapshot can see it
+
+	passes := rec.snapshot()
+	require.Len(t, passes, 1, "the tip does not move, so exactly one backfill pass")
+	assert.Equal(t, chunk.ID(0), passes[0][0], "rangeStart is chunk 0 (genesis floor)")
+	assert.Equal(t, chunk.ID(2), passes[0][1], "rangeEnd is lastCompleteChunkAt(tip)")
+	// lastCommitted advances to chunk 2's last ledger.
+	assert.Equal(t, chunk.ID(2).LastLedger(), last)
+}
+
+// A young network (tip below the first complete chunk) is a no-op: rangeEnd is
+// negative, below rangeStart (chunk 0), so the loop breaks without backfilling.
+func TestBackfill_YoungNetworkNoOp(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// Tip inside chunk 0 (no chunk has fully closed yet).
+	tip := &fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq + 50}}
+	rec := &recordingPlan{}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, preGenesisLedger, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	rec.endPass()
+	require.Empty(t, rec.snapshot(), "no backfill pass on a young network")
+	assert.Equal(t, preGenesisLedger, last, "watermark unchanged")
+}
+
+// Steady restart with local progress and a tip just past it: nothing is frozen
+// in this catalog, so one backfill pass still runs up to the watermark chunk,
+// then the loop converges and the watermark is unchanged.
+func TestBackfill_SteadyRestartNoOp(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// Watermark on a chunk boundary (chunk 2 complete), tip just past it in
+	// chunk 3, so lastCompleteChunkAt(anchor) == 2. Nothing is frozen in the
+	// catalog, so the FIRST iteration still runs a pass ending at chunk 2 (the
+	// recording seam stands in for cold I/O); the SECOND iteration sees rangeEnd
+	// == backfilledThrough and breaks. Despite the test's name this is therefore
+	// not a zero-pass no-op: what is asserted is that the loop converges
+	// (terminates after exactly one pass) and that the watermark neither
+	// regresses nor moves past chunk 2's last ledger.
+	watermark := chunk.ID(2).LastLedger()
+	tipLedger := chunk.ID(3).FirstLedger() + 10 // last complete chunk == 2
+	rec := &recordingPlan{}
+	tip := &fakeTipBackend{tips: []uint32{tipLedger}}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, watermark, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	rec.endPass()
+
+	passes := rec.snapshot()
+	require.Len(t, passes, 1)
+	assert.Equal(t, chunk.ID(2), passes[0][1], "rangeEnd == lastCompleteChunkAt(tip) == 2")
+	assert.Equal(t, watermark, last, "watermark does not regress and stays at chunk 2 end")
+}
+
+// Mid-chunk resume exclusion: a watermark strictly inside a chunk, within one
+// chunk of the tip, leaves the partial resume chunk to ingestion — rangeEnd is
+// pulled back to chunkID(watermark)-1.
+//
+// The tip is placed AT chunk 5's last ledger (chunk 5 complete-at-tip) while the
+// watermark stays mid-chunk-5. This is the distinguishing scenario: WITHOUT the
+// exclusion, lastCompleteChunkAt(anchor) = 5 and the loop would backfill the live
+// chunk ingestion owns; WITH it, rangeEnd folds back to 4. (A tip that is also
+// mid-chunk-5 would yield lastCompleteChunkAt = 4 anyway, making the exclusion
+// undetectable.) within-one-chunk still holds: tip - watermark = 9999 - 100 =
+// 9899 < 10000.
+func TestBackfill_MidChunkResumeExclusion(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// Watermark mid-chunk-5 (not on a boundary); tip AT chunk 5's last ledger so
+	// chunk 5 is complete-at-tip — the case that distinguishes the exclusion.
+	watermark := chunk.ID(5).FirstLedger() + 100
+	tipLedger := chunk.ID(5).LastLedger() // within one chunk, but chunk 5 complete-at-tip
+	rec := &recordingPlan{}
+	tip := &fakeTipBackend{tips: []uint32{tipLedger}}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, watermark, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	rec.endPass()
+
+	passes := rec.snapshot()
+	require.Len(t, passes, 1)
+	assert.Equal(t, chunk.ID(4), passes[0][1],
+		"rangeEnd pulled back to chunkID(watermark)-1 = chunk 4; chunk 5 is ingestion's")
+	// Chunk 5 (complete-at-tip) is NOT backfilled — the exclusion left it to
+	// ingestion. Without the exclusion rangeEnd would be 5 and chunk 5 would
+	// appear in the pass; this assertion is what makes deleting the exclusion
+	// logic detectable.
+	assert.Less(t, passes[0][1], chunk.ID(5), "the live resume chunk 5 is never backfilled")
+	assert.Less(t, passes[0][0], chunk.ID(5))
+	// The watermark itself is NOT advanced past where it was (the excluded chunk
+	// stays the resume point): max(watermark, chunk4.LastLedger) == watermark.
+	assert.Equal(t, watermark, last)
+}
+
+// Long-downtime re-pass: the tip ADVANCES between passes, so the loop runs more
+// than once, extending the backfilled range, then terminates when the tip stops.
+func TestBackfill_LongDowntimeRePass(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	// First sample: last complete chunk 2. Second sample: tip jumped to chunk 5
+	// (new chunks appeared while the first pass was in flight). Third sample
+	// (clamped): same as second ⇒ rangeEnd unchanged ⇒ break.
+	tip := &fakeTipBackend{tips: []uint32{
+		chunk.ID(3).FirstLedger() + 1, // last complete 2
+		chunk.ID(6).FirstLedger() + 1, // last complete 5
+	}}
+	// Record the raw set of chunks every backfill pass touched (across passes);
+	// the highest chunk reached proves the re-pass extended the range to the
+	// advanced tip.
+	var mu sync.Mutex
+	var allChunks []chunk.ID
+	exec := ExecConfig{
+		Catalog: cat,
+		Logger:  silentLogger(),
+		Workers: 2,
+		Process: ProcessConfig{
+			HotProbe: NewRocksHotProbe(cat.layout.HotChunkPath, silentLogger()),
+			Backend:  zeroTxBackend(t),
+		},
+		runChunk: func(_ context.Context, cb ChunkBuild, _ ExecConfig) error {
+			mu.Lock()
+			allChunks = append(allChunks, cb.Chunk)
+			mu.Unlock()
+			return nil
+		},
+	}
+	cfg := StartConfig{
+		Exec:           exec,
+		Lifecycle:      LifecycleConfig{ExecConfig: exec, Fatalf: (&fatalRecorder{}).fatalf},
+		NetworkTip:     tip,
+		Core:           &fakeCore{},
+		ServeReads:     func(context.Context) error { return nil },
+		TipBackoff:     time.Millisecond,
+		TipMaxAttempts: 3,
+	}
+
+	last, err := catchUp(context.Background(), cfg, preGenesisLedger, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+
+	mu.Lock()
+	defer mu.Unlock()
+	// Two passes ran: first [0,2], second extended to chunk 5. The highest chunk
+	// touched is 5, and the final watermark is chunk 5's last ledger.
+	maxChunkTouched := chunk.ID(0)
+	for _, c := range allChunks {
+		if c > maxChunkTouched {
+			maxChunkTouched = c
+		}
+	}
+	assert.Equal(t, chunk.ID(5), maxChunkTouched, "the re-pass extended the range to the advanced tip")
+	assert.Equal(t, chunk.ID(5).LastLedger(), last)
+	assert.GreaterOrEqual(t, tip.callCount(), 3, "the loop re-sampled the tip across passes")
+}
+
+// Degrade-and-serve restart: the tip is UNREACHABLE but there IS local progress
+// (watermark >= earliest), so backfill does NOT fatal — it degrades to tip :=
+// lastCommitted and re-resolves the already-local range below the watermark
+// (self-skipping frozen chunks in production). It terminates (does not loop
+// forever) and never regresses the watermark.
+func TestBackfill_RestartTipUnreachableDegrades(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	watermark := chunk.ID(2).LastLedger() // local progress exists
+	tip := &fakeTipBackend{err: errors.New("backend down"), errFirst: 99}
+	rec := &recordingPlan{}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, watermark, chunk.FirstLedgerSeq)
+	require.NoError(t, err, "local progress means no fatal")
+	rec.endPass()
+
+	// tip := watermark ⇒ anchor == watermark ⇒ rangeEnd == lastCompleteChunkAt
+	// (chunk 2 end) == 2, rangeStart == chunk 0; ONE re-resolve pass over the
+	// already-local [0,2], then backfilledThrough==2 breaks the loop.
+	passes := rec.snapshot()
+	require.Len(t, passes, 1, "exactly one degraded re-resolve pass, then terminate")
+	assert.Equal(t, chunk.ID(2), passes[0][1])
+	assert.Equal(t, watermark, last, "watermark does not regress")
+}
+
+// Lagging bulk tip below a chunk-aligned watermark: the bulk backend's tip sits
+// in chunk 3, but a complete watermark chunk (chunk 5, chunk-aligned) is durably
+// committed above it. The anchor is max(tip, lastCommitted) == the watermark, so
+// rangeEnd == lastCompleteChunkAt(watermark) == 5 — the complete watermark chunk
+// still folds into its window's index before serving. Anchored on the tip alone
+// it would be lastCompleteChunkAt(tip) == 2 (regressing below where pruning
+// advanced and dropping chunks 3..5). The mid-chunk exclusion does NOT fire: the
+// watermark is on a boundary (watermarkMidChunk == false), even though
+// withinOneChunkOfTip is true (signed: lagging tip below the watermark).
+func TestBackfill_LaggingBulkTipFoldsWatermarkChunk(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	watermark := chunk.ID(5).LastLedger()       // chunk-aligned, complete watermark chunk 5
+	tipLedger := chunk.ID(3).FirstLedger() + 10 // lagging bulk tip in chunk 3 (last complete 2)
+	rec := &recordingPlan{}
+	tip := &fakeTipBackend{tips: []uint32{tipLedger}}
+	cfg := startTestConfig(t, cat, tip, &fakeCore{}, rec)
+
+	last, err := catchUp(context.Background(), cfg, watermark, chunk.FirstLedgerSeq)
+	require.NoError(t, err)
+	rec.endPass()
+
+	passes := rec.snapshot()
+	require.Len(t, passes, 1, "one pass anchored on the watermark, then backfilledThrough==5 breaks")
+	assert.Equal(t, chunk.ID(5), passes[0][1],
+		"rangeEnd == lastCompleteChunkAt(watermark) == 5, NOT lastCompleteChunkAt(tip) == 2")
+	assert.Equal(t, chunk.ID(0), passes[0][0], "rangeStart is chunk 0 (genesis floor)")
+	assert.Equal(t, watermark, last, "watermark does not regress below where pruning advanced")
+}
+
+// ---------------------------------------------------------------------------
+// startStreaming — the full serve+ingest handoff (clean shutdown).
+// ---------------------------------------------------------------------------
+
+// A genesis first start with a tip inside chunk 0 (young network) does no
+// backfill, opens the resume chunk's hot DB, starts the (blocking) fake core
+// getter, serves reads, and runs the ingestion loop — which returns the ctx-
+// canceled GetLedger error when ctx is canceled. The clean-shutdown
+// classification now lives at the daemon top level (superviseStreaming treats a
+// ctx-canceled return as clean), so startStreaming surfaces the wrapped
+// context.Canceled. The resume ledger is genesis.
+func TestStartStreaming_FirstStartServeIngestCleanShutdown(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+
+	served := atomic.Int32{}
+	// Live getter: blocks until ctx cancel (the daemon's steady state).
+	core := &fakeCore{getter: &fakeLedgerGetter{frames: map[uint32][]byte{}, blockOnCtx: true}}
+	tip := &fakeTipBackend{tips: []uint32{chunk.FirstLedgerSeq + 10}} // young: no backfill
+	cfg := startTestConfig(t, cat, tip, core, nil)
+	cfg.ServeReads = func(context.Context) error { served.Add(1); return nil }
+
+	ctx, cancel := context.WithCancel(context.Background())
+	errCh := make(chan error, 1)
+	go func() { errCh <- startStreaming(ctx, cfg) }()
+
+	// Give the loop time to open the hot DB, start core, serve, and park on the
+	// blocking getter, then request a clean shutdown.
+	require.Eventually(t, func() bool { return served.Load() == 1 }, 2*time.Second, 5*time.Millisecond)
+	cancel()
+
+	select {
+	case err := <-errCh:
+		// The ingestion loop surfaces the ctx-canceled GetLedger error; the daemon
+		// top level (superviseStreaming) classifies a ctx-canceled return as clean.
+		require.ErrorIs(t, err, context.Canceled, "clean shutdown surfaces the ctx-canceled error")
+	case <-time.After(3 * time.Second):
+		t.Fatal("startStreaming did not return after ctx cancel")
+	}
+
+	require.Equal(t, int32(1), served.Load(), "reads were served exactly once")
+	require.Equal(t, int32(1), core.openedCount.Load(), "captive core started once")
+	require.Equal(t, uint32(chunk.FirstLedgerSeq), core.resumeSeen.Load(),
+		"resume ledger is genesis on a fresh start (watermark+1)")
+
+	// The resume chunk's hot key is "ready" (the loop opened it and the boundary
+	// was never crossed).
+	state, err := cat.HotState(chunk.IDFromLedger(chunk.FirstLedgerSeq))
+	require.NoError(t, err)
+	assert.Equal(t, HotReady, state)
+}
+
+// startStreaming fatals on a true first start when the tip is unavailable: the
+// error is ErrFirstStartNoTip and NEITHER the hot DB nor core is opened.
+func TestStartStreaming_FirstStartNoTipFatal(t *testing.T) {
+	cat, _ := testCatalog(t)
+	pinGenesis(t, cat)
+	core := &fakeCore{}
+	tip := &fakeTipBackend{err: errors.New("unreachable"), errFirst: 99}
+	cfg := startTestConfig(t, cat, tip, core, nil)
+
+	err := startStreaming(context.Background(), cfg)
+	require.ErrorIs(t, err, ErrFirstStartNoTip)
+	require.Zero(t, core.openedCount.Load(), "core is never started when backfill fatals")
+}
+
+// startStreaming surfaces a missing earliest_ledger pin loudly (validateConfig
+// pins it before startStreaming; absent here is a wiring error, not a first
+// start to mis-classify).
+func TestStartStreaming_RequiresEarliestPin(t *testing.T) {
+	cat, _ := testCatalog(t)
+	// No pinGenesis.
+	cfg := startTestConfig(t, cat, &fakeTipBackend{tips: []uint32{50_000}}, &fakeCore{}, nil)
+	err := startStreaming(context.Background(), cfg)
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "earliest_ledger pinned")
+}
+
+// startStreaming validates its injected boundaries. NOTE(review): cat has no pin, so each Error may be the pin error.
+func TestStartStreaming_ValidatesConfig(t *testing.T) {
+	cat, _ := testCatalog(t)
+	base := startTestConfig(t, cat, &fakeTipBackend{tips: []uint32{50_000}}, &fakeCore{}, nil)
+
+	t.Run("nil NetworkTip", func(t *testing.T) {
+		cfg := base
+		cfg.NetworkTip = nil
+		require.Error(t, startStreaming(context.Background(), cfg))
+	})
+	t.Run("nil Core", func(t *testing.T) {
+		cfg := base
+		cfg.Core = nil
+		require.Error(t, startStreaming(context.Background(), cfg))
+	})
+	t.Run("nil ServeReads", func(t *testing.T) {
+		cfg := base
+		cfg.ServeReads = nil
+		require.Error(t, startStreaming(context.Background(), cfg))
+	})
+	t.Run("nil HotProbe", func(t *testing.T) {
+		cfg := base
+		cfg.Exec.Process.HotProbe = nil
+		require.Error(t, startStreaming(context.Background(), cfg))
+	})
+}
+
+// ---------------------------------------------------------------------------
+// Pure helpers: withinOneChunkOfTip, watermarkMidChunk.
+// ---------------------------------------------------------------------------
+
+func TestWatermarkMidChunk(t *testing.T) {
+	tests := []struct {
+		name      string
+		watermark uint32
+		mid       bool
+	}{
+		{"genesis sentinel is a boundary", preGenesisLedger, false},
+		{"chunk-0 last ledger is a boundary", chunk.ID(0).LastLedger(), false},
+		{"chunk-2 last ledger is a boundary", chunk.ID(2).LastLedger(), false},
+		{"mid chunk 0", chunk.ID(0).FirstLedger() + 1, true},
+		{"mid chunk 5", chunk.ID(5).FirstLedger() + 100, true},
+		{"chunk-5 first ledger is mid (not the last)", chunk.ID(5).FirstLedger(), true},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.mid, watermarkMidChunk(tt.watermark))
+		})
+	}
+}
+
+func TestWithinOneChunkOfTip(t *testing.T) {
+	tests := []struct {
+		name           string
+		tip, watermark uint32
+		within         bool
+	}{
+		{"tip equals watermark", 100_000, 100_000, true},
+		{"tip one less than a chunk ahead", 100_000 + chunk.LedgersPerChunk - 1, 100_000, true},
+		{"tip exactly a chunk ahead", 100_000 + chunk.LedgersPerChunk, 100_000, false},
+		{"lagging tip below watermark", 90_000, 100_000, true}, // signed diff is negative, so < LedgersPerChunk
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.within, withinOneChunkOfTip(tt.tip, tt.watermark))
+		})
+	}
+}
diff --git a/cmd/stellar-rpc/main.go b/cmd/stellar-rpc/main.go
index cdda10d60..f7492c493 100644
--- a/cmd/stellar-rpc/main.go
+++ b/cmd/stellar-rpc/main.go
@@ -3,6 +3,8 @@ package main
 import (
 	"fmt"
 	"os"
+	"os/signal"
+	"syscall"
 
 	"github.com/spf13/cobra"
 
@@ -11,6 +13,7 @@ import (
 
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/config"
 	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/daemon"
+	"github.com/stellar/stellar-rpc/cmd/stellar-rpc/internal/fullhistory/streaming"
 )
 
 func main() {
@@ -79,8 +82,43 @@ func main() {
 		},
 	}
 
+	// full-history-streaming launches the full-history streaming daemon (Issue 13
+	// entrypoint). It is a SEPARATE subcommand from the default v1 run: the full
+	// SQLite→full-history cutover that flips the default `run` path is issue #772.
+	// TODO(#772): when #772 lands, fold this into the daemon's primary flow (or
+	// flip `run` to it) and retire the v1 SQLite ingestion/preflight path.
+	//
+	// TODO(windows): this import wires the full-history daemon into the
+	// cross-platform binary, but the daemon is Unix-only by construction —
+	// streaming/config_lock.go takes a flock via golang.org/x/sys/unix (no
+	// Windows build) and the hot tier is cgo RocksDB/grocksdb (needs RocksDB
+	// libs). So `go build ./cmd/stellar-rpc` on windows-latest fails to compile;
+	// #805–#807 pass only because their main.go does not yet import streaming.
+	// Before the Windows build matrix can be green with the daemon wired in,
+	// build-constrain the daemon path off Windows (a //go:build unix tag on the
+	// streaming/daemon packages + a Windows stub for this subcommand, per the
+	// packfile/writeback_* and txhash/odirect_* precedent), or drop windows-latest
+	// from the daemon build.
+	var fullHistoryConfigPath string
+	fullHistoryCmd := &cobra.Command{
+		Use:   "full-history-streaming",
+		Short: "Run the full-history streaming daemon (experimental; see #772 for the v1 cutover)",
+		Run: func(cmd *cobra.Command, _ []string) {
+			ctx, stop := signal.NotifyContext(cmd.Context(), syscall.SIGINT, syscall.SIGTERM)
+			defer stop()
+			if err := streaming.RunDaemon(ctx, fullHistoryConfigPath); err != nil {
+				fmt.Fprintf(os.Stderr, "full-history streaming daemon: %v\n", err)
+				os.Exit(1)
+			}
+		},
+	}
+	fullHistoryCmd.Flags().StringVar(&fullHistoryConfigPath, "config", "",
+		"path to the full-history streaming daemon TOML config (required)")
+	_ = fullHistoryCmd.MarkFlagRequired("config")
+
 	rootCmd.AddCommand(versionCmd)
 	rootCmd.AddCommand(genConfigFileCmd)
+	rootCmd.AddCommand(fullHistoryCmd)
 
 	if err := cfg.AddFlags(rootCmd); err != nil {
 		fmt.Fprintf(os.Stderr, "could not parse config options: %v\n", err)