Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,10 +472,26 @@ class IncrementalResult:
elapsed_sec: float


# --- Builder-owned files in the index dir (single source of truth) ---------------
# Names of every bookkeeping artifact the graph builder writes next to
# code_graph.lbug. The lifecycle CLI's `erase` iterates BUILDER_OWNED_INDEX_FILES
# instead of hardcoding names, so the builder and erase cannot drift
# (issues #349 / #350): erase used to know only about ".graph_hashes.json" and
# left the crash marker and the atomic-write temp behind on disk.
#
# When the builder starts writing a new file into the index dir, add its name
# here AND to BUILDER_OWNED_INDEX_FILES below.

# Content-hash store used for incremental rebuilds.
GRAPH_HASHES_FILENAME = ".graph_hashes.json"
# Atomic-write temp for the hash store (written first, then renamed over the
# real file). Derived from the base name so the pair can never disagree;
# evaluates to ".graph_hashes.json.tmp".
GRAPH_HASHES_TMP_FILENAME = GRAPH_HASHES_FILENAME + ".tmp"
# Crash marker checked by the incremental rebuild; if it exists, the rebuild
# falls back to a full rebuild.
GRAPH_INCREMENT_MARKER_FILENAME = ".graph_increment_in_progress"

# Immutable on purpose: consumers should iterate this, never mutate it.
BUILDER_OWNED_INDEX_FILES: tuple[str, ...] = (
    GRAPH_HASHES_FILENAME,
    GRAPH_HASHES_TMP_FILENAME,
    GRAPH_INCREMENT_MARKER_FILENAME,
)


class FileHashTracker:
"""Track content hashes for incremental graph rebuild."""
def __init__(self, index_dir: Path):
self._path = index_dir / ".graph_hashes.json"
self._path = index_dir / GRAPH_HASHES_FILENAME
self._hashes: dict[str, str] = {} # rel_path -> sha256_hex

def load(self) -> None:
Expand All @@ -491,7 +507,7 @@ def load(self) -> None:

def save(self) -> None:
"""Persist hashes to disk atomically (write .tmp, rename)."""
tmp_path = self._path.with_suffix(".json.tmp")
tmp_path = self._path.parent / GRAPH_HASHES_TMP_FILENAME
try:
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(self._hashes, f, sort_keys=True)
Expand Down Expand Up @@ -3811,7 +3827,7 @@ def incremental_rebuild(
_verbose_stderr_line(f"[increment] detected {len(added)} added, {len(changed)} changed, {len(removed)} removed files")

# Step 2: Crash marker check
crash_marker_path = index_dir / ".graph_increment_in_progress"
crash_marker_path = index_dir / GRAPH_INCREMENT_MARKER_FILENAME
if crash_marker_path.exists():
if verbose:
_verbose_stderr_line("[increment] crash marker exists; falling back to full rebuild")
Expand Down
30 changes: 19 additions & 11 deletions java_codebase_rag/cli.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

# Heavy imports (`server`, `pr_analysis`, `path_filtering.LayeredIgnore`) stay lazy
# inside handlers so `java-codebase-rag --help` stays fast.
# Heavy imports (`server`, `pr_analysis`, `path_filtering.LayeredIgnore`,
# `build_ast_graph`) stay lazy inside handlers so `java-codebase-rag --help` stays fast.

import argparse
import asyncio
Expand Down Expand Up @@ -605,8 +605,13 @@ def _cmd_erase(args: argparse.Namespace) -> int:
cfg = _resolved_from_ns(args)
_startup_hints(cfg)
cfg.apply_to_os_environ()
graph_hashes_path = cfg.ladybug_path.parent / ".graph_hashes.json"
to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, graph_hashes_path]
# Lazy import: build_ast_graph transitively pulls numpy/ladybug/pyarrow/
# tree_sitter (~54ms), and these filenames are only needed on the erase path.
# Keeping it out of the top-level import lets `java-codebase-rag --help` (and
# every other command) stay fast -- see the lazy-import invariant atop this file.
from build_ast_graph import BUILDER_OWNED_INDEX_FILES
builder_paths = [cfg.ladybug_path.parent / name for name in BUILDER_OWNED_INDEX_FILES]
to_describe: list[Path] = [cfg.ladybug_path, cfg.cocoindex_db, *builder_paths]
if cfg.index_dir.is_dir():
try:
import lancedb
Expand Down Expand Up @@ -643,15 +648,18 @@ def work(progress: "PipelineProgress | None") -> int:
)
elif drop.returncode != 0:
print(clip(drop.stderr, 4000), file=sys.stderr)
# Remove the LadybugDB graph, the cocoindex state store, and the graph
# builder's content-hash store. Each is removed by type (see _rm_any):
# code_graph.lbug is a file here but may be a dir under kuzu, while
# cocoindex.db is a directory — a type-blind delete silently no-oped on
# one or the other, and .graph_hashes.json was never targeted at all
# (issue #346).
# Remove the LadybugDB graph, the cocoindex state store, and every
# builder-owned bookkeeping file next to code_graph.lbug (the content-hash
# store, its atomic-write temp, and the incremental crash marker). Each is
# removed by type (see _rm_any): code_graph.lbug is a file here but may be
# a dir under kuzu, while cocoindex.db is a directory — a type-blind delete
# silently no-oped on one or the other, and the builder files were never
# targeted at all (issues #346 / #349 / #350). The list comes from
# build_ast_graph.BUILDER_OWNED_INDEX_FILES so erase and the builder cannot drift.
_rm_any(cfg.ladybug_path)
_rm_any(cfg.cocoindex_db)
_rm_any(graph_hashes_path)
for builder_path in builder_paths:
_rm_any(builder_path)
if cfg.index_dir.is_dir():
try:
import lancedb
Expand Down
36 changes: 36 additions & 0 deletions tests/test_java_codebase_rag_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,42 @@ def test_erase_removes_graph_file_cocoindex_dir_and_hash_store(tmp_path: Path) -
assert not (idx / ".graph_hashes.json").exists(), "erase left .graph_hashes.json on disk"


def test_erase_removes_increment_marker_and_hash_store_tmp(tmp_path: Path) -> None:
    """``erase`` wipes the builder's leftover bookkeeping files too.

    Regression for issues #349 / #350. Besides code_graph.lbug, cocoindex.db
    and .graph_hashes.json, the index dir can hold two more builder-owned
    files (``build_ast_graph.BUILDER_OWNED_INDEX_FILES``):

    * ``.graph_increment_in_progress`` -- the incremental crash marker. If it
      survived erase -> init, the next ``increment`` silently fell back to a
      full rebuild (explained only under ``--verbose``).
    * ``.graph_hashes.json.tmp`` -- the hash store's atomic-write temp; pure
      cruft that defeated erase's "clean slate".

    Both must be gone after a successful ``erase --yes``.
    """
    index_dir = tmp_path / "erase_builder_state"
    index_dir.mkdir()
    crash_marker = index_dir / ".graph_increment_in_progress"
    hashes_tmp = index_dir / ".graph_hashes.json.tmp"

    # Ordinary index contents.
    (index_dir / "code_graph.lbug").write_bytes(b"fake-kuzu-db")
    (index_dir / ".graph_hashes.json").write_text("{}", encoding="utf-8")
    # Leftovers of a crashed increment (marker) and of a crashed hash-store
    # save (temp orphaned before the os.replace).
    crash_marker.write_text("", encoding="utf-8")
    hashes_tmp.write_text("partial", encoding="utf-8")

    cli_env = {
        **os.environ,
        "JAVA_CODEBASE_RAG_INDEX_DIR": str(index_dir),
        "JAVA_CODEBASE_RAG_SOURCE_ROOT": str(tmp_path),
    }
    argv = ["erase", "--source-root", str(tmp_path), "--index-dir", str(index_dir), "--yes"]
    result = _run_cli(argv, env=cli_env)

    assert result.returncode == 0, result.stderr + result.stdout
    assert not crash_marker.exists(), (
        "erase left .graph_increment_in_progress; next increment would silently full-rebuild (#349)"
    )
    assert not hashes_tmp.exists(), (
        "erase left .graph_hashes.json.tmp orphan (#350)"
    )


def test_embedding_model_precedence_cli_over_env_over_yaml_over_default(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
Expand Down
Loading