diff --git a/build_ast_graph.py b/build_ast_graph.py index cce16631..5fda4158 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -3668,10 +3668,17 @@ def incremental_rebuild( def _init_hash_tracker(source_root: Path, ladybug_path: Path) -> int: - """Initialize hash tracker for all Java files. Returns number of files hashed.""" + """Initialize hash tracker for all Java files. Returns number of files hashed. + + Called right after a full graph rebuild (``write_ladybug``), so the store must + mirror exactly the files that were just indexed. We deliberately do NOT + ``load()`` the existing store: ``update`` re-hashes every current file anyway, + and preserving old entries would leave stale hashes for files that no longer + exist (deleted or now-ignored). Those ghosts would be re-detected as "removed" + on every subsequent ``increment``, sustaining an endless full-rebuild loop. + """ index_dir = ladybug_path.parent tracker = FileHashTracker(index_dir) - tracker.load() ignore = LayeredIgnore(source_root) all_files: set[str] = set() source_root_resolved = source_root.resolve() @@ -3742,7 +3749,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa # Write declares_client edges for row in tables.declares_client_rows: - source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path + source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path conn.execute(_CREATE_DECLARES_CLIENT, { "sid": row.symbol_id, "cid": row.client_id, @@ -3753,7 +3760,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa # Write declares_producer edges for row in tables.declares_producer_rows: - source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path + source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path conn.execute(_CREATE_DECLARES_PRODUCER, { "sid": row.symbol_id, "pid": row.producer_id, diff --git a/tests/test_incremental_graph.py b/tests/test_incremental_graph.py index 7a4121fe..0349c912 100644 --- a/tests/test_incremental_graph.py +++ b/tests/test_incremental_graph.py @@ -4,6 +4,8 @@ """ from __future__ import annotations +import json +import shutil from pathlib import Path import ladybug @@ -979,3 +981,141 @@ def test_incremental_preserves_incoming_edges_to_dependent(self, tmp_path: Path) assert cb_after_count > 0, "out-of-scope C->B CALLS edge must be preserved" conn.close() + + +class TestIncrementalRegressions: + """Regression tests for the ``increment`` always-fully-reprocesses loop. + + Two bugs fed the loop: + 1. ``_write_clients_producers_and_calls`` built a default ``MemberEntry`` + missing the required ``node_id`` field. Because ``dict.get(k, default)`` + evaluates ``default`` eagerly, the TypeError fired whenever ANY + ``declares_client`` / ``declares_producer`` row existed — crashing every + client-bearing incremental rebuild into a full-rebuild fallback. + 2. ``_init_hash_tracker`` (run by every full reprocess AND by that fallback) + did ``load()`` + ``update()`` and never pruned hashes for files no longer + on disk, so ghost entries persisted and re-triggered the loop every run. + """ + + def test_init_hash_tracker_prunes_stale_entries(self, tmp_path: Path) -> None: + """A full rebuild drops hashes for files no longer on disk (ghost pruning). + + Without pruning, a stale entry is re-detected as 'removed' on every + ``increment``, sustaining an endless full-rebuild loop. + """ + from build_ast_graph import write_ladybug + + source_root = tmp_path / "src" + source_root.mkdir() + (source_root / "A.java").write_text("package pkg; class A {}", encoding="utf-8") + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + + tables = GraphTables() + pass1_parse(source_root, tables, verbose=False) + write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False) + + # Inject a ghost hash for a file that does not exist on disk. + hash_file = index_dir / ".graph_hashes.json" + data = json.loads(hash_file.read_text(encoding="utf-8")) + data["ghost/Deleted.java"] = "0" * 64 + hash_file.write_text(json.dumps(data), encoding="utf-8") + + # A second full rebuild (what `reprocess --graph-only` does) re-runs + # _init_hash_tracker, which must drop the ghost. + tables2 = GraphTables() + pass1_parse(source_root, tables2, verbose=False) + write_ladybug(ladybug_path, tables2, source_root=source_root, verbose=False) + + after = json.loads(hash_file.read_text(encoding="utf-8")) + assert "ghost/Deleted.java" not in after + assert "A.java" in after + + def test_incremental_with_http_clients_does_not_fall_back(self, tmp_path: Path) -> None: + """A corpus with Feign/Kafka clients/producers rebuilds incrementally. + + ``http_caller_smoke`` emits DECLARES_CLIENT / DECLARES_PRODUCER rows, so + the buggy eager ``MemberEntry`` default in + ``_write_clients_producers_and_calls`` crashed here before the fix + (forcing full_fallback). After the fix: mode is "incremental". + """ + from _builders import build_ladybug_full_into + from build_ast_graph import incremental_rebuild + + corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke" + source_root = tmp_path / "src" + shutil.copytree(corpus, source_root) + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + + # Full build seeds .graph_hashes.json via write_ladybug -> _init_hash_tracker. + build_ladybug_full_into(source_root, ladybug_path) + + # Mutate one file unrelated to the clients/producers. + target = source_root / "src" / "main" / "java" / "smoke" / "http" / "TopicNames.java" + target.write_text(target.read_text(encoding="utf-8") + "\n// edit\n", encoding="utf-8") + + result = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert result.mode == "incremental", ( + f"expected incremental, got {result.mode!r} (the node_id crash in " + "_write_clients_producers_and_calls forces a full fallback)" + ) + assert result.files_changed == 1 + + def test_reprocess_graph_only_then_increment_is_noop(self, tmp_path: Path) -> None: + """The reported scenario at the builder level: a full graph rebuild (what + ``reprocess --graph-only`` does) followed by ``increment`` with no source + changes must be a no-op, not a second full rebuild.""" + from _builders import build_ladybug_full_into + from build_ast_graph import incremental_rebuild + + corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke" + source_root = tmp_path / "src" + shutil.copytree(corpus, source_root) + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + + # Simulate `reprocess --graph-only`: full rebuild seeds the hash store. + build_ladybug_full_into(source_root, ladybug_path) + + # `increment` with no source changes. + result = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert result.mode == "incremental" + assert (result.files_changed, result.files_added, result.files_removed) == (0, 0, 0) + + def test_incremental_ghost_entry_then_next_run_is_noop(self, tmp_path: Path) -> None: + """A ghost hash entry is detected as 'removed' once, processed by the + scoped path (which prunes it), so the following run is a clean no-op. + + Guards both fixes together: the node_id fix lets the scoped path + complete, and that path prunes the ghost (lines that delete `removed` + hashes). Before the fixes this fell back to full and preserved the ghost. + """ + from _builders import build_ladybug_full_into + from build_ast_graph import incremental_rebuild + + corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke" + source_root = tmp_path / "src" + shutil.copytree(corpus, source_root) + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + build_ladybug_full_into(source_root, ladybug_path) + + # Inject a ghost (no source change). + hash_file = index_dir / ".graph_hashes.json" + data = json.loads(hash_file.read_text(encoding="utf-8")) + data["ghost/Gone.java"] = "0" * 64 + hash_file.write_text(json.dumps(data), encoding="utf-8") + + first = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert first.mode == "incremental", f"expected incremental, got {first.mode!r}" + assert first.files_removed == 1 + + # The ghost must be gone, so the next run detects nothing. + second = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert second.mode == "incremental" + assert (second.files_changed, second.files_added, second.files_removed) == (0, 0, 0) diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index d1342f1b..8f6b251c 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -544,6 +544,61 @@ def test_increment_first_run_falls_back_to_full( +def test_reprocess_graph_only_then_increment_graph_is_noop( + corpus_root: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, +) -> None: + """The reported scenario, exercised through the real CLI wiring. + + ``reprocess --graph-only`` rebuilds the graph and seeds ``.graph_hashes.json`` + (via ``write_ladybug`` -> ``_init_hash_tracker``); the next ``increment``'s + graph stage must be a no-op, NOT a second full rebuild. + + cocoindex is stubbed so ``increment`` runs only its real graph stage (no + embedding model needed). ``reprocess --graph-only`` needs no cocoindex + regardless, so this test runs in the normal (non-heavy) suite. + """ + idx = tmp_path / "idx_reprocess_then_increment" + idx.mkdir() + monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx)) + monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(corpus_root)) + + rc = cli_mod.main( + ["reprocess", "--graph-only", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"], + ) + assert rc == 0, "reprocess --graph-only must succeed" + # reprocess --graph-only must seed the hash store. + hash_file = idx / ".graph_hashes.json" + assert hash_file.exists(), "hash store not seeded by reprocess --graph-only" + + # Inject a ghost entry for a file that does not exist — the exact "N removed + # files every run" symptom. On bank-chat (which has Feign/Kafka clients) the + # scoped path this triggers reaches _write_clients_producers_and_calls, so a + # missing-field MemberEntry default here used to crash into a full fallback. + data = json.loads(hash_file.read_text(encoding="utf-8")) + data["ghost/DoesNotExist.java"] = "0" * 64 + hash_file.write_text(json.dumps(data), encoding="utf-8") + + # Stub cocoindex so increment exercises ONLY its graph stage. + def _noop_coco(env, *, full_reprocess, quiet, verbose=True, lance_project_root=None): + return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="") + + monkeypatch.setattr(cli_mod, "run_cocoindex_update", _noop_coco) + + buf_out = io.StringIO() + buf_err = io.StringIO() + with contextlib.redirect_stdout(buf_out), contextlib.redirect_stderr(buf_err): + rc2 = cli_mod.main( + ["increment", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"], + ) + assert rc2 == 0 + # The graph stage must NOT have fallen back to a full rebuild. + assert "fell back to full graph rebuild" not in buf_err.getvalue() + assert "increment completed (Lance + graph updated)" in buf_out.getvalue() + # The ghost must be pruned, so the next increment is clean. + after = json.loads(hash_file.read_text(encoding="utf-8")) + assert "ghost/DoesNotExist.java" not in after + + @pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") def test_increment_updates_lance_after_touch_java_file(corpus_root: Path, tmp_path: Path) -> None: import lancedb # noqa: PLC0415