From 16f48da548552daaabc35b2746567739b365faf4 Mon Sep 17 00:00:00 2001
From: Antawari de la Torre Cobos <antawari@gmail.com>
Date: Sat, 13 Jun 2026 15:23:01 -0600
Subject: [PATCH] type the LanceDB backend get_by_source failure instead of
 swallowing it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

get_by_source() caught `except Exception` and returned [], making a genuine
LanceDB lookup failure (corrupt table, unreachable store, vector-op error)
indistinguishable from "no entries for this source path" — the swallowed
failure the Elegance Law forbids, and the exact masquerade its structural
sibling query() already had removed.

Now a thrown lookup surfaces as a typed, retryable RetrievalError (the one
Bonfire failure vocabulary) carrying the originating exception as __cause__
and structured context (source_path). A genuine empty result (zero matching
entries) stays a legitimate success and still returns []. exists() keeps its
deliberate, separately-tested fail-open dedup contract untouched.

The implementation is net-zero in line/statement count, so the src/bonfire
file budget is unmoved; the only baseline ratchet is the tests/unit package
total re-frozen for the new contract test (a new file under the 500-line
ceiling, re-anchored per BASELINE-CONVENTIONS.md).

RED: tests/unit/test_knowledge_backend_get_by_source_typed_failure.py pins
the typed-failure-vs-swallow contract using a fake exploding table (lancedb
is not a test dependency).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 file-budget.json                              |   2 +-
 src/bonfire/knowledge/backend.py              |   4 +-
 ...dge_backend_get_by_source_typed_failure.py | 156 ++++++++++++++++++
 3 files changed, 159 insertions(+), 3 deletions(-)
 create mode 100644 tests/unit/test_knowledge_backend_get_by_source_typed_failure.py

diff --git a/file-budget.json b/file-budget.json
index 1277f0a..9c59de0 100644
--- a/file-budget.json
+++ b/file-budget.json
@@ -50,6 +50,6 @@
     "src/bonfire/handlers": 3126,
     "src/bonfire/onboard": 2973,
     "src/bonfire/prompt": 855,
-    "tests/unit": 58858
+    "tests/unit": 59014
   }
 }
diff --git a/src/bonfire/knowledge/backend.py b/src/bonfire/knowledge/backend.py
index 23de0d3..91effb3 100644
--- a/src/bonfire/knowledge/backend.py
+++ b/src/bonfire/knowledge/backend.py
@@ -120,8 +120,8 @@ async def get_by_source(self, source_path: str) -> list[VaultEntry]:
             )
             return [self._record_to_entry(r) for r in results]
         except Exception as exc:
-            logger.exception("Vault get_by_source failed: %s", exc)
-            return []
+            msg = f"Vault get_by_source failed: {exc}"
+            raise RetrievalError(msg, context={"source_path": source_path}) from exc
 
     def _ensure_connected(self) -> None:
         """Lazy connect. Runs migration if vault_v2 doesn't exist."""
diff --git a/tests/unit/test_knowledge_backend_get_by_source_typed_failure.py b/tests/unit/test_knowledge_backend_get_by_source_typed_failure.py
new file mode 100644
index 0000000..4515e07
--- /dev/null
+++ b/tests/unit/test_knowledge_backend_get_by_source_typed_failure.py
@@ -0,0 +1,156 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2026 BonfireAI
+
+"""Elegance-Law contract for ``LanceDBBackend.get_by_source()``.
+
+``get_by_source()`` is the source-path retrieval path on the ``VaultBackend``
+contract: callers ask "give me every entry that originated from this file" and
+treat the returned list as the complete, authoritative answer. A real backend
+failure — ``search().where().to_list()`` raising because the table is corrupt,
+the store is unreachable, or a vector op blows up — was previously swallowed
+into ``return []``. That ``[]`` is **indistinguishable from a legitimate
+"no entries for this source path"**, so a failed lookup masqueraded as "nothing
+is stored from here": the caller proceeds on partial truth with no typed signal,
+no retryable flag, no context. That is exactly the swallowed-failure the Elegance
+Law forbids — and the exact masquerade its structural sibling ``query()`` already
+had removed (``query`` now raises a typed ``RetrievalError`` instead of ``[]``).
+
+``get_by_source()`` is the overlooked sibling: same ``table.search(...).where(
+...).to_list()`` shape, same "list of entries" return type, same indistinguishable
+``[]``. This test pins the matching contract: a backend failure inside
+``get_by_source()`` must surface as a typed :class:`bonfire.errors.RetrievalError`
+(the one vocabulary; ``code == "retrieval"``, ``retryable is True``) carrying the
+originating exception as its ``__cause__`` and naming the source path in its
+structured context — NOT a bare ``[]``. A genuine empty result (zero matching
+entries) stays a legitimate success and returns ``[]``.
+
+Note the deliberate divergence from the *other* sibling, ``exists()``: that method
+fail-opens to ``False`` by documented design (its caller treats a false-negative
+as a harmless dedup miss). ``get_by_source()`` has no such fail-open caller — its
+list IS the answer — so, like ``query()``, it must raise rather than degrade.
+
+``lancedb`` is not a test dependency, so we never connect to a real store: we
+pre-set the backend's private ``_table``/``_db`` so ``_ensure_connected()`` is a
+no-op and inject a fake table whose ``.search(...).where(...).to_list()`` raises.
+That exercises the exact ``except`` arm under test without any LanceDB runtime.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from bonfire.errors import BonfireError, RetrievalError
+from bonfire.knowledge.backend import LanceDBBackend
+
+
+class _FakeEmbedder:
+    """Minimal embedder stub — ``get_by_source()`` only reads ``dim`` for the
+    zero-vector probe; it never embeds free text."""
+
+    dim = 4
+
+    def embed(self, texts: list[str]) -> list[list[float]]:
+        return [[0.0] * self.dim for _ in texts]
+
+
+class _ExplodingSearch:
+    """A search builder whose terminal ``.to_list()`` blows up."""
+
+    def limit(self, _n: int) -> _ExplodingSearch:
+        return self
+
+    def where(self, _clause: str) -> _ExplodingSearch:
+        return self
+
+    def to_list(self) -> list[dict[str, object]]:
+        raise RuntimeError("simulated LanceDB to_list failure")
+
+
+class _ExplodingTable:
+    """Non-empty table whose source-path query fails at ``to_list()``."""
+
+    def count_rows(self) -> int:
+        # Non-empty so ``get_by_source()`` proceeds past the early empty-table
+        # guard and reaches the protected ``search(...).where(...).to_list()``.
+        return 1
+
+    def search(self, _vector: object) -> _ExplodingSearch:
+        return _ExplodingSearch()
+
+
+class _EmptyHitSearch:
+    def limit(self, _n: int) -> _EmptyHitSearch:
+        return self
+
+    def where(self, _clause: str) -> _EmptyHitSearch:
+        return self
+
+    def to_list(self) -> list[dict[str, object]]:
+        return []
+
+
+class _EmptyHitTable:
+    """Non-empty table whose source-path query legitimately matches nothing."""
+
+    def count_rows(self) -> int:
+        return 1
+
+    def search(self, _vector: object) -> _EmptyHitSearch:
+        return _EmptyHitSearch()
+
+
+def _make_backend(table: object) -> LanceDBBackend:
+    backend = LanceDBBackend(vault_path=":memory:", embedder=_FakeEmbedder())
+    # Pre-wire the lazy-connect fields so ``_ensure_connected()`` short-circuits
+    # (it returns early when ``_table is not None``) and never imports lancedb.
+    backend._table = table
+    backend._db = object()
+    return backend
+
+
+@pytest.mark.asyncio
+async def test_get_by_source_backend_failure_raises_typed_retrieval_error() -> None:
+    """A failed ``get_by_source()`` lookup must speak the one vocabulary, not ``[]``."""
+    backend = _make_backend(_ExplodingTable())
+
+    with pytest.raises(RetrievalError) as exc_info:
+        await backend.get_by_source("/src/whatever.py")
+
+    err = exc_info.value
+    assert isinstance(err, BonfireError), "must be in the one Bonfire vocabulary"
+    assert err.code == "retrieval"
+    assert err.retryable is True, "a transient retrieval failure is retryable"
+
+
+@pytest.mark.asyncio
+async def test_get_by_source_failure_chains_the_original_exception() -> None:
+    """The typed error must carry the originating exception (no lost traceback)."""
+    backend = _make_backend(_ExplodingTable())
+
+    with pytest.raises(RetrievalError) as exc_info:
+        await backend.get_by_source("/src/whatever.py")
+
+    cause = exc_info.value.__cause__
+    assert isinstance(cause, RuntimeError)
+    assert "simulated LanceDB to_list failure" in str(cause)
+
+
+@pytest.mark.asyncio
+async def test_get_by_source_failure_context_carries_the_source_path() -> None:
+    """Structured context must name the source path so the failure is self-describing."""
+    backend = _make_backend(_ExplodingTable())
+
+    with pytest.raises(RetrievalError) as exc_info:
+        await backend.get_by_source("/src/needle.py")
+
+    assert exc_info.value.context.get("source_path") == "/src/needle.py"
+
+
+@pytest.mark.asyncio
+async def test_get_by_source_zero_hits_is_a_success_not_an_error() -> None:
+    """A genuine empty result (no matching entries) stays a success — returns []."""
+    backend = _make_backend(_EmptyHitTable())
+
+    result = await backend.get_by_source("/src/unstored.py")
+
+    assert result == []