From c6dc03cd53feec19db88f191f30515089b66d0de Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Fri, 3 Jul 2026 23:11:04 +0300 Subject: [PATCH 1/2] fix(cli): erase clears all builder-owned files (crash marker + hash tmp) (#349, #350) erase left .graph_increment_in_progress and .graph_hashes.json.tmp on disk because _cmd_erase hardcoded only .graph_hashes.json. The surviving crash marker then forced the next increment into a silent full rebuild (explained only under --verbose); the .tmp was orphan cruft. Export the builder-owned filenames from build_ast_graph.BUILDER_OWNED_INDEX_FILES and have erase clear all of them from one list, so erase and the builder cannot drift. Co-Authored-By: Claude --- build_ast_graph.py | 22 +++++++++++++++--- java_codebase_rag/cli.py | 22 ++++++++++-------- tests/test_java_codebase_rag_cli.py | 36 +++++++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/build_ast_graph.py b/build_ast_graph.py index 3e75a32c..5da43738 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -472,10 +472,26 @@ class IncrementalResult: elapsed_sec: float +# --- Builder-owned files in the index dir (single source of truth) --------------- +# Every artifact the graph builder writes next to code_graph.lbug. The lifecycle +# CLI's `erase` clears all of these from one list so the builder and erase cannot +# drift (issues #349 / #350): previously erase hardcoded ".graph_hashes.json" only +# and left the crash marker (.graph_increment_in_progress) and the atomic-write +# temp (.graph_hashes.json.tmp) behind on disk. +GRAPH_HASHES_FILENAME = ".graph_hashes.json" +GRAPH_HASHES_TMP_FILENAME = ".graph_hashes.json.tmp" +GRAPH_INCREMENT_MARKER_FILENAME = ".graph_increment_in_progress" +BUILDER_OWNED_INDEX_FILES: tuple[str, ...] = ( + GRAPH_HASHES_FILENAME, + GRAPH_HASHES_TMP_FILENAME, + GRAPH_INCREMENT_MARKER_FILENAME, +) + + class FileHashTracker: """Track content hashes for incremental graph rebuild.""" def __init__(self, index_dir: Path): - self._path = index_dir / ".graph_hashes.json" + self._path = index_dir / GRAPH_HASHES_FILENAME self._hashes: dict[str, str] = {} # rel_path -> sha256_hex def load(self) -> None: @@ -491,7 +507,7 @@ def load(self) -> None: def save(self) -> None: """Persist hashes to disk atomically (write .tmp, rename).""" - tmp_path = self._path.with_suffix(".json.tmp") + tmp_path = self._path.parent / GRAPH_HASHES_TMP_FILENAME try: with open(tmp_path, "w", encoding="utf-8") as f: json.dump(self._hashes, f, sort_keys=True) @@ -3811,7 +3827,7 @@ def incremental_rebuild( _verbose_stderr_line(f"[increment] detected {len(added)} added, {len(changed)} changed, {len(removed)} removed files") # Step 2: Crash marker check - crash_marker_path = index_dir / ".graph_increment_in_progress" + crash_marker_path = index_dir / GRAPH_INCREMENT_MARKER_FILENAME if crash_marker_path.exists(): if verbose: _verbose_stderr_line("[increment] crash marker exists; falling back to full rebuild") diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index 47cf3887..add0d898 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -24,6 +24,7 @@ ) from java_codebase_rag._fdlimit import raise_fd_limit from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update, run_incremental_graph +from build_ast_graph import BUILDER_OWNED_INDEX_FILES from java_ontology import VALID_UNRESOLVED_CALL_REASONS LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73" @@ -605,8 +606,8 @@ def _cmd_erase(args: argparse.Namespace) -> int: cfg = _resolved_from_ns(args) _startup_hints(cfg) cfg.apply_to_os_environ() - graph_hashes_path = cfg.ladybug_path.parent / ".graph_hashes.json" - to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, graph_hashes_path] + builder_paths = [cfg.ladybug_path.parent / name for name in BUILDER_OWNED_INDEX_FILES] + to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, *builder_paths] if cfg.index_dir.is_dir(): try: import lancedb @@ -643,15 +644,18 @@ def work(progress: "PipelineProgress | None") -> int: ) elif drop.returncode != 0: print(clip(drop.stderr, 4000), file=sys.stderr) - # Remove the LadybugDB graph, the cocoindex state store, and the graph - # builder's content-hash store. Each is removed by type (see _rm_any): - # code_graph.lbug is a file here but may be a dir under kuzu, while - # cocoindex.db is a directory — a type-blind delete silently no-oped on - # one or the other, and .graph_hashes.json was never targeted at all - # (issue #346). + # Remove the LadybugDB graph, the cocoindex state store, and every + # builder-owned bookkeeping file next to code_graph.lbug (the content-hash + # store, its atomic-write temp, and the incremental crash marker). Each is + # removed by type (see _rm_any): code_graph.lbug is a file here but may be + # a dir under kuzu, while cocoindex.db is a directory — a type-blind delete + # silently no-oped on one or the other, and the builder files were never + # targeted at all (issues #346 / #349 / #350). The list comes from + # build_ast_graph.BUILDER_OWNED_INDEX_FILES so erase and the builder cannot drift. _rm_any(cfg.ladybug_path) _rm_any(cfg.cocoindex_db) - _rm_any(graph_hashes_path) + for builder_path in builder_paths: + _rm_any(builder_path) if cfg.index_dir.is_dir(): try: import lancedb diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index 5f0b6e7f..8c9e1a0a 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -138,6 +138,42 @@ def test_erase_removes_graph_file_cocoindex_dir_and_hash_store(tmp_path: Path) - assert not (idx / ".graph_hashes.json").exists(), "erase left .graph_hashes.json on disk" +def test_erase_removes_increment_marker_and_hash_store_tmp(tmp_path: Path) -> None: + """erase must also clear the rest of the builder's bookkeeping files. + + Regression for issues #349 / #350: erase removed code_graph.lbug, + cocoindex.db, and .graph_hashes.json but left the incremental crash marker + (``.graph_increment_in_progress``) and the atomic-write temp + (``.graph_hashes.json.tmp``) on disk. The marker surviving erase -> init then + forced the next ``increment`` into a silent full rebuild (explained only under + ``--verbose``); the ``.tmp`` was pure cruft that defeated erase's "clean slate". + Both are builder-owned files (``build_ast_graph.BUILDER_OWNED_INDEX_FILES``), + so erase clears them from the same source of truth instead of hardcoding names. + """ + idx = tmp_path / "erase_builder_state" + idx.mkdir() + (idx / "code_graph.lbug").write_bytes(b"fake-kuzu-db") + (idx / ".graph_hashes.json").write_text("{}", encoding="utf-8") + # Simulate a crashed increment (marker left behind) + a crashed hash-store + # save (atomic-write temp orphaned before the os.replace). + (idx / ".graph_increment_in_progress").write_text("", encoding="utf-8") + (idx / ".graph_hashes.json.tmp").write_text("partial", encoding="utf-8") + env = os.environ.copy() + env["JAVA_CODEBASE_RAG_INDEX_DIR"] = str(idx) + env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(tmp_path) + proc = _run_cli( + ["erase", "--source-root", str(tmp_path), "--index-dir", str(idx), "--yes"], + env=env, + ) + assert proc.returncode == 0, proc.stderr + proc.stdout + assert not (idx / ".graph_increment_in_progress").exists(), ( + "erase left .graph_increment_in_progress; next increment would silently full-rebuild (#349)" + ) + assert not (idx / ".graph_hashes.json.tmp").exists(), ( + "erase left .graph_hashes.json.tmp orphan (#350)" + ) + + def test_embedding_model_precedence_cli_over_env_over_yaml_over_default( tmp_path: Path, monkeypatch: pytest.MonkeyPatch ) -> None: From 365dbe871381391d1d4b4d19284991f7925134b3 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sat, 4 Jul 2026 08:28:05 +0300 Subject: [PATCH 2/2] perf(cli): lazy-import build_ast_graph in _cmd_erase The top-level `from build_ast_graph import BUILDER_OWNED_INDEX_FILES` pulled numpy/ladybug/pyarrow/tree_sitter on every CLI invocation (~54ms; measured via -X importtime: cli cumulative import went 44ms -> 98ms), including `--help` -- violating this file's own lazy-import invariant. The three filenames are only needed on the erase path, so move the import into _cmd_erase. After the move, build_ast_graph is no longer imported at cli import time and cumulative import is back to ~39ms. Co-Authored-By: Claude --- java_codebase_rag/cli.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index add0d898..44d5666a 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -1,7 +1,7 @@ from __future__ import annotations -# Heavy imports (`server`, `pr_analysis`, `path_filtering.LayeredIgnore`) stay lazy -# inside handlers so `java-codebase-rag --help` stays fast. +# Heavy imports (`server`, `pr_analysis`, `path_filtering.LayeredIgnore`, +# `build_ast_graph`) stay lazy inside handlers so `java-codebase-rag --help` stays fast. import argparse import asyncio @@ -24,7 +24,6 @@ ) from java_codebase_rag._fdlimit import raise_fd_limit from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update, run_incremental_graph -from build_ast_graph import BUILDER_OWNED_INDEX_FILES from java_ontology import VALID_UNRESOLVED_CALL_REASONS LADYBUG_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73" @@ -606,6 +605,11 @@ def _cmd_erase(args: argparse.Namespace) -> int: cfg = _resolved_from_ns(args) _startup_hints(cfg) cfg.apply_to_os_environ() + # Lazy import: build_ast_graph transitively pulls numpy/ladybug/pyarrow/ + # tree_sitter (~54ms), and these filenames are only needed on the erase path. + # Keeping it out of the top-level import lets `java-codebase-rag --help` (and + # every other command) stay fast -- see the lazy-import invariant atop this file. + from build_ast_graph import BUILDER_OWNED_INDEX_FILES builder_paths = [cfg.ladybug_path.parent / name for name in BUILDER_OWNED_INDEX_FILES] to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, *builder_paths] if cfg.index_dir.is_dir():