Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -3668,10 +3668,17 @@ def incremental_rebuild(


def _init_hash_tracker(source_root: Path, ladybug_path: Path) -> int:
"""Initialize hash tracker for all Java files. Returns number of files hashed."""
"""Initialize hash tracker for all Java files. Returns number of files hashed.

Called right after a full graph rebuild (``write_ladybug``), so the store must
mirror exactly the files that were just indexed. We deliberately do NOT
``load()`` the existing store: ``update`` re-hashes every current file anyway,
and preserving old entries would leave stale hashes for files that no longer
exist (deleted or now-ignored). Those ghosts would be re-detected as "removed"
on every subsequent ``increment``, sustaining an endless full-rebuild loop.
"""
index_dir = ladybug_path.parent
tracker = FileHashTracker(index_dir)
tracker.load()
ignore = LayeredIgnore(source_root)
all_files: set[str] = set()
source_root_resolved = source_root.resolve()
Expand Down Expand Up @@ -3742,7 +3749,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa

# Write declares_client edges
for row in tables.declares_client_rows:
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
conn.execute(_CREATE_DECLARES_CLIENT, {
"sid": row.symbol_id,
"cid": row.client_id,
Expand All @@ -3753,7 +3760,7 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa

# Write declares_producer edges
for row in tables.declares_producer_rows:
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="")).file_path
source_file = member_by_id.get(row.symbol_id, MemberEntry(kind="", decl=None, parent_id="", parent_fqn="", file_path="", module="", microservice="", node_id="")).file_path
conn.execute(_CREATE_DECLARES_PRODUCER, {
"sid": row.symbol_id,
"pid": row.producer_id,
Expand Down
140 changes: 140 additions & 0 deletions tests/test_incremental_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
"""
from __future__ import annotations

import json
import shutil
from pathlib import Path

import ladybug
Expand Down Expand Up @@ -979,3 +981,141 @@ def test_incremental_preserves_incoming_edges_to_dependent(self, tmp_path: Path)
assert cb_after_count > 0, "out-of-scope C->B CALLS edge must be preserved"

conn.close()


class TestIncrementalRegressions:
"""Regression tests for the ``increment`` always-fully-reprocesses loop.

Two bugs fed the loop:
1. ``_write_clients_producers_and_calls`` built a default ``MemberEntry``
missing the required ``node_id`` field. Because ``dict.get(k, default)``
evaluates ``default`` eagerly, the TypeError fired whenever ANY
``declares_client`` / ``declares_producer`` row existed — crashing every
client-bearing incremental rebuild into a full-rebuild fallback.
2. ``_init_hash_tracker`` (run by every full reprocess AND by that fallback)
did ``load()`` + ``update()`` and never pruned hashes for files no longer
on disk, so ghost entries persisted and re-triggered the loop every run.
"""

def test_init_hash_tracker_prunes_stale_entries(self, tmp_path: Path) -> None:
"""A full rebuild drops hashes for files no longer on disk (ghost pruning).

Without pruning, a stale entry is re-detected as 'removed' on every
``increment``, sustaining an endless full-rebuild loop.
"""
from build_ast_graph import write_ladybug

source_root = tmp_path / "src"
source_root.mkdir()
(source_root / "A.java").write_text("package pkg; class A {}", encoding="utf-8")
index_dir = tmp_path / "index"
index_dir.mkdir()
ladybug_path = index_dir / "code_graph.lbug"

tables = GraphTables()
pass1_parse(source_root, tables, verbose=False)
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False)

# Inject a ghost hash for a file that does not exist on disk.
hash_file = index_dir / ".graph_hashes.json"
data = json.loads(hash_file.read_text(encoding="utf-8"))
data["ghost/Deleted.java"] = "0" * 64
hash_file.write_text(json.dumps(data), encoding="utf-8")

# A second full rebuild (what `reprocess --graph-only` does) re-runs
# _init_hash_tracker, which must drop the ghost.
tables2 = GraphTables()
pass1_parse(source_root, tables2, verbose=False)
write_ladybug(ladybug_path, tables2, source_root=source_root, verbose=False)

after = json.loads(hash_file.read_text(encoding="utf-8"))
assert "ghost/Deleted.java" not in after
assert "A.java" in after

def test_incremental_with_http_clients_does_not_fall_back(self, tmp_path: Path) -> None:
"""A corpus with Feign/Kafka clients/producers rebuilds incrementally.

``http_caller_smoke`` emits DECLARES_CLIENT / DECLARES_PRODUCER rows, so
the buggy eager ``MemberEntry`` default in
``_write_clients_producers_and_calls`` crashed here before the fix
(forcing full_fallback). After the fix: mode is "incremental".
"""
from _builders import build_ladybug_full_into
from build_ast_graph import incremental_rebuild

corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
source_root = tmp_path / "src"
shutil.copytree(corpus, source_root)
index_dir = tmp_path / "index"
index_dir.mkdir()
ladybug_path = index_dir / "code_graph.lbug"

# Full build seeds .graph_hashes.json via write_ladybug -> _init_hash_tracker.
build_ladybug_full_into(source_root, ladybug_path)

# Mutate one file unrelated to the clients/producers.
target = source_root / "src" / "main" / "java" / "smoke" / "http" / "TopicNames.java"
target.write_text(target.read_text(encoding="utf-8") + "\n// edit\n", encoding="utf-8")

result = incremental_rebuild(source_root, ladybug_path, verbose=False)
assert result.mode == "incremental", (
f"expected incremental, got {result.mode!r} (the node_id crash in "
"_write_clients_producers_and_calls forces a full fallback)"
)
assert result.files_changed == 1

def test_reprocess_graph_only_then_increment_is_noop(self, tmp_path: Path) -> None:
"""The reported scenario at the builder level: a full graph rebuild (what
``reprocess --graph-only`` does) followed by ``increment`` with no source
changes must be a no-op, not a second full rebuild."""
from _builders import build_ladybug_full_into
from build_ast_graph import incremental_rebuild

corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
source_root = tmp_path / "src"
shutil.copytree(corpus, source_root)
index_dir = tmp_path / "index"
index_dir.mkdir()
ladybug_path = index_dir / "code_graph.lbug"

# Simulate `reprocess --graph-only`: full rebuild seeds the hash store.
build_ladybug_full_into(source_root, ladybug_path)

# `increment` with no source changes.
result = incremental_rebuild(source_root, ladybug_path, verbose=False)
assert result.mode == "incremental"
assert (result.files_changed, result.files_added, result.files_removed) == (0, 0, 0)

def test_incremental_ghost_entry_then_next_run_is_noop(self, tmp_path: Path) -> None:
"""A ghost hash entry is detected as 'removed' once, processed by the
scoped path (which prunes it), so the following run is a clean no-op.

Guards both fixes together: the node_id fix lets the scoped path
complete, and that path prunes the ghost (lines that delete `removed`
hashes). Before the fixes this fell back to full and preserved the ghost.
"""
from _builders import build_ladybug_full_into
from build_ast_graph import incremental_rebuild

corpus = Path(__file__).parent / "fixtures" / "http_caller_smoke"
source_root = tmp_path / "src"
shutil.copytree(corpus, source_root)
index_dir = tmp_path / "index"
index_dir.mkdir()
ladybug_path = index_dir / "code_graph.lbug"
build_ladybug_full_into(source_root, ladybug_path)

# Inject a ghost (no source change).
hash_file = index_dir / ".graph_hashes.json"
data = json.loads(hash_file.read_text(encoding="utf-8"))
data["ghost/Gone.java"] = "0" * 64
hash_file.write_text(json.dumps(data), encoding="utf-8")

first = incremental_rebuild(source_root, ladybug_path, verbose=False)
assert first.mode == "incremental", f"expected incremental, got {first.mode!r}"
assert first.files_removed == 1

# The ghost must be gone, so the next run detects nothing.
second = incremental_rebuild(source_root, ladybug_path, verbose=False)
assert second.mode == "incremental"
assert (second.files_changed, second.files_added, second.files_removed) == (0, 0, 0)
55 changes: 55 additions & 0 deletions tests/test_java_codebase_rag_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,61 @@ def test_increment_first_run_falls_back_to_full(



def test_reprocess_graph_only_then_increment_graph_is_noop(
corpus_root: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch,
) -> None:
"""The reported scenario, exercised through the real CLI wiring.

``reprocess --graph-only`` rebuilds the graph and seeds ``.graph_hashes.json``
(via ``write_ladybug`` -> ``_init_hash_tracker``); the next ``increment``'s
graph stage must be a no-op, NOT a second full rebuild.

cocoindex is stubbed so ``increment`` runs only its real graph stage (no
embedding model needed). ``reprocess --graph-only`` needs no cocoindex
regardless, so this test runs in the normal (non-heavy) suite.
"""
idx = tmp_path / "idx_reprocess_then_increment"
idx.mkdir()
monkeypatch.setenv("JAVA_CODEBASE_RAG_INDEX_DIR", str(idx))
monkeypatch.setenv("JAVA_CODEBASE_RAG_SOURCE_ROOT", str(corpus_root))

rc = cli_mod.main(
["reprocess", "--graph-only", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"],
)
assert rc == 0, "reprocess --graph-only must succeed"
# reprocess --graph-only must seed the hash store.
hash_file = idx / ".graph_hashes.json"
assert hash_file.exists(), "hash store not seeded by reprocess --graph-only"

# Inject a ghost entry for a file that does not exist — the exact "N removed
# files every run" symptom. On bank-chat (which has Feign/Kafka clients) the
# scoped path this triggers reaches _write_clients_producers_and_calls, so a
# missing-field MemberEntry default here used to crash into a full fallback.
data = json.loads(hash_file.read_text(encoding="utf-8"))
data["ghost/DoesNotExist.java"] = "0" * 64
hash_file.write_text(json.dumps(data), encoding="utf-8")

# Stub cocoindex so increment exercises ONLY its graph stage.
def _noop_coco(env, *, full_reprocess, quiet, verbose=True, lance_project_root=None):
return subprocess.CompletedProcess(args=[], returncode=0, stdout="", stderr="")

monkeypatch.setattr(cli_mod, "run_cocoindex_update", _noop_coco)

buf_out = io.StringIO()
buf_err = io.StringIO()
with contextlib.redirect_stdout(buf_out), contextlib.redirect_stderr(buf_err):
rc2 = cli_mod.main(
["increment", "--source-root", str(corpus_root), "--index-dir", str(idx), "--quiet"],
)
assert rc2 == 0
# The graph stage must NOT have fallen back to a full rebuild.
assert "fell back to full graph rebuild" not in buf_err.getvalue()
assert "increment completed (Lance + graph updated)" in buf_out.getvalue()
# The ghost must be pruned, so the next increment is clean.
after = json.loads(hash_file.read_text(encoding="utf-8"))
assert "ghost/DoesNotExist.java" not in after


@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv")
def test_increment_updates_lance_after_touch_java_file(corpus_root: Path, tmp_path: Path) -> None:
import lancedb # noqa: PLC0415
Expand Down
Loading