Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ jobs:
- name: Run tests
if: steps.changes.outputs.code == 'true'
env:
JAVA_CODEBASE_RAG_RUN_HEAVY: "0"
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: pytest tests -v
- name: Skip tests (docs-only)
if: steps.changes.outputs.code != 'true'
Expand Down
113 changes: 84 additions & 29 deletions java_codebase_rag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,26 +21,9 @@
index_dir_has_existing_artifacts,
resolve_operator_config,
)
from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update
from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_build_ast_graph_incremental, run_cocoindex_drop, run_cocoindex_update
from java_ontology import VALID_UNRESOLVED_CALL_REASONS

KUZU_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73"

_INCREMENT_WARNING_LINES = (
"WARNING: AST graph (Kuzu) incremental rebuild is not yet implemented.",
"The graph reflects the index state from the last `init` or `reprocess`,",
"which means `find`, `neighbors`, and `describe` may return stale results",
"for files changed since then.",
"",
"Lance vector index has been updated incrementally and is current.",
"",
"For an up-to-date graph, run:",
" java-codebase-rag reprocess",
"",
"Track progress on Kuzu incremental rebuild:",
f" {KUZU_INCREMENTAL_TRACKING_ISSUE_URL}",
)

_REFRESH_DEPRECATION = (
"WARN: 'refresh' is deprecated; use 'reprocess'. "
"This alias will be removed in the next release."
Expand Down Expand Up @@ -178,11 +161,6 @@ def _emit(value: Any) -> None:
print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None))


def _emit_increment_kuzu_warning() -> None:
    """Print every line of ``_INCREMENT_WARNING_LINES`` to stderr, in order."""
    stream = sys.stderr
    for text in _INCREMENT_WARNING_LINES:
        print(text, file=stream)


def _parse_source_root(ns: argparse.Namespace) -> Path | None:
if ns.source_root:
return Path(ns.source_root).expanduser().resolve()
Expand Down Expand Up @@ -298,15 +276,27 @@ def _cmd_increment(args: argparse.Namespace) -> int:
cfg = _resolved_from_ns(args)
_startup_hints(cfg)
cfg.apply_to_os_environ()
_emit_increment_kuzu_warning()

def work() -> int:
from refresh_decision import choose_refresh_mode

env = cfg.subprocess_env()
verbose = bool(args.verbose)

# Decide refresh mode first so Lance mode is known
decision = choose_refresh_mode(
cfg.source_root,
cfg.kuzu_path,
mode="auto",
)

# Lance update — full when decision engine says so (config/pipeline change)
lance_full = decision.lance_mode == "full"
coco = run_cocoindex_update(
env,
full_reprocess=False,
full_reprocess=lance_full,
quiet=bool(args.quiet),
verbose=bool(args.verbose),
verbose=verbose,
lance_project_root=None if args.quiet else cfg.source_root,
)
if coco.returncode != 0:
Expand All @@ -320,7 +310,72 @@ def work() -> int:
}
)
return 1
_emit({"success": True, "message": "increment completed (Lance only; graph may be stale — see stderr)"})

# Kuzu rebuild based on decision
if decision.kuzu_mode == "incremental" and decision.detected_changes.modified:
changed = set(decision.detected_changes.modified + decision.detected_changes.added)
if not args.quiet and verbose:
for r in decision.reasons:
print(f" [graph] {r}", file=sys.stderr)
g = run_build_ast_graph_incremental(
source_root=cfg.source_root,
kuzu_path=cfg.kuzu_path,
changed_paths=changed,
verbose=verbose,
quiet=bool(args.quiet),
env=env,
)
if g.returncode != 0:
# Incremental failed — fall back to full
print(
f"[graph] incremental failed (exit {g.returncode}), falling back to full rebuild",
file=sys.stderr,
)
g = run_build_ast_graph(
source_root=cfg.source_root,
kuzu_path=cfg.kuzu_path,
verbose=verbose,
quiet=bool(args.quiet),
env=env,
)
if g.returncode != 0:
_emit(
{
"success": False,
"exit_code": g.returncode,
"stdout": clip(g.stdout, 4000),
"stderr": clip(g.stderr, 4000),
"message": f"graph builder exit {g.returncode}",
}
)
return 1
else:
# Full Kuzu rebuild
if not args.quiet:
for r in decision.reasons:
print(f" [graph] {r}", file=sys.stderr)
if decision.reasons:
print(" [graph] falling back to full Kuzu rebuild", file=sys.stderr)
g = run_build_ast_graph(
source_root=cfg.source_root,
kuzu_path=cfg.kuzu_path,
verbose=verbose,
quiet=bool(args.quiet),
env=env,
)
if g.returncode != 0:
_emit(
{
"success": False,
"exit_code": g.returncode,
"stdout": clip(g.stdout, 4000),
"stderr": clip(g.stderr, 4000),
"message": f"graph builder exit {g.returncode}",
}
)
return 1

_emit({"success": True, "message": "increment completed"})
return 0

return _run_with_pipeline_progress("increment", cfg, quiet=bool(args.quiet), work=work)
Expand Down Expand Up @@ -615,7 +670,7 @@ def build_parser() -> argparse.ArgumentParser:
"--quiet suppresses that stream; stdout remains the machine-readable payload.\n\n"
"Lifecycle (manage the index):\n"
" init Create a fresh index from a Java repository.\n"
" increment Pick up changes since the last index update (Lance only).\n"
" increment Pick up changes since the last index update (Lance + Kuzu incremental).\n"
" reprocess Full vector + graph rebuild (default); optional --vectors-only / --graph-only.\n"
" erase Delete the index from disk.\n\n"
"Introspection (inspect the index):\n"
Expand Down Expand Up @@ -650,7 +705,7 @@ def build_parser() -> argparse.ArgumentParser:
increment = subparsers.add_parser(
"increment",
help="Pick up changes since the last index update.",
description="Runs cocoindex catch-up (no full reprocess). Does not rebuild Kuzu; see stderr warning.",
description="Runs cocoindex catch-up (no full reprocess). Kuzu graph updated incrementally when safe; full rebuild as fallback.",
)
_add_index_embedding_flags(increment)
_add_verbosity_flags(increment)
Expand Down
69 changes: 69 additions & 0 deletions java_codebase_rag/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import shutil
import subprocess
import sys
import tempfile
import threading
import time
from pathlib import Path
Expand Down Expand Up @@ -247,5 +248,73 @@ def run_build_ast_graph(
return subprocess.CompletedProcess(args=cmd, returncode=code, stdout=out_s, stderr=err_s)


def run_build_ast_graph_incremental(
    *,
    source_root: Path,
    kuzu_path: Path,
    changed_paths: set[str],
    verbose: bool,
    quiet: bool = False,
    env: dict[str, str] | None = None,
) -> subprocess.CompletedProcess[str]:
    """Run build_ast_graph.py in incremental mode with --changed-paths.

    The changed paths are written, sorted and newline-separated, to a
    temporary file whose name is passed to the builder via
    ``--changed-paths``. The temporary file is removed before returning,
    whether or not the builder succeeded.

    Args:
        source_root: Passed as ``--source-root``; also the child's cwd.
        kuzu_path: Passed as ``--kuzu-path``.
        changed_paths: Paths to hand to the builder (one per line).
        verbose: Forwarded to the stderr-capturing helper; also adds
            ``--verbose`` to the builder command line.
        quiet: When true, run the child with fully captured output and
            print nothing. When false, ``--verbose`` is always passed to
            the builder and a one-line status is printed to stderr unless
            ``verbose`` is set.
        env: Environment for the child; a copy of ``os.environ`` is used
            when this is ``None`` or empty.

    Returns:
        A ``CompletedProcess`` for the builder run. If the builder script
        is missing, a synthetic result with ``returncode=126`` and an
        explanatory ``stderr`` is returned without spawning anything.
    """
    builder = bundle_dir() / "build_ast_graph.py"
    if not builder.is_file():
        return subprocess.CompletedProcess(
            args=[],
            returncode=126,
            stdout="",
            stderr=f"build_ast_graph.py not found under {builder.parent}",
        )

    # delete=False: the file must outlive our handle so the child process can
    # open it by name; removal is done explicitly in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=".paths", delete=False, prefix="changed-",
    )
    paths_file = Path(tmp.name)
    try:
        # Close the handle via the context manager even if a write fails, so
        # the cleanup below never tries to unlink a still-open file (which
        # fails on Windows and would leave the temp file behind).
        with tmp:
            tmp.writelines(f"{p}\n" for p in sorted(changed_paths))

        cmd: list[str] = [
            sys.executable,
            str(builder),
            "--source-root",
            str(source_root),
            "--kuzu-path",
            str(kuzu_path),
            "--changed-paths",
            str(paths_file),
        ]
        if verbose or not quiet:
            cmd.append("--verbose")

        child_env = env or os.environ.copy()
        if quiet:
            return subprocess.run(
                cmd,
                cwd=str(source_root),
                env=child_env,
                capture_output=True,
                text=True,
            )

        proc = subprocess.Popen(
            cmd,
            cwd=str(source_root),
            env=child_env,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0,
        )
        out_s, err_s, code = _popen_capturing_stderr(proc, verbose=verbose)
        if not verbose:
            from java_codebase_rag.cli_format import bold_cyan, styled_check, styled_cross

            marker = styled_check() if code == 0 else styled_cross()
            print(f"{marker} {bold_cyan('[graph]')} incremental done", file=sys.stderr, flush=True)
        return subprocess.CompletedProcess(args=cmd, returncode=code, stdout=out_s, stderr=err_s)
    finally:
        # Best-effort cleanup: a leftover temp file must not mask the real
        # result (or the real exception) of the builder run.
        try:
            paths_file.unlink()
        except OSError:
            pass


def clip(s: str, n: int) -> str:
    """Return at most the last *n* characters of *s*.

    Strings no longer than *n* are returned unchanged. A non-positive *n*
    yields the empty string.
    """
    if n <= 0:
        # ``s[-0:]`` is ``s[0:]`` (the whole string), and a negative n would
        # slice from the front, so a zero/negative budget needs its own guard.
        return ""
    return s[-n:] if len(s) > n else s
47 changes: 17 additions & 30 deletions plans/active/PLAN-TIER2-INCREMENTAL-REBUILD.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ Depends on: none (ontology 16 surface is stable; no pending PRs block this).
| **PR-T1** | Foundation: `FileDeps` dataclass, `.deps.json` read/write, determinism test, perf baseline | none | `.deps.json` schema must be right first time (version field, field coverage for closure rules); determinism test coverage must surface divergence | determinism + deps-read/write | prerequisite only |
| **PR-T2** | Symmetric delete helpers: `delete_*_for_file` for all node/edge types | none | Cypher DELETE must match current schema exactly; cascade semantics (Symbol delete must clean edges); count accuracy for verbose logging | per-node-type delete + cascade | PR-T1 (needs `.deps.json` read) |
| **PR-T3** | Incremental orchestrator: `build_ast_graph_incremental`, `--changed-paths`, per-pass subset functions, closure expansion | none | Closure correctness (missing rule = silent divergence); transaction semantics; pass6/global-invariant; incremental-write functions | equivalence on all fixtures + closure expansion + subset passes | PR-T1 + PR-T2 |
| **PR-T4** | CLI + decision engine: integrate into `_cmd_increment`, remove warning, create `refresh_code_index` MCP tool | none | Decision-engine correctness (wrong mode = stale graph); `refresh_code_index` is new (not an update); CLI stderr format consistency | decision engine + CLI integration + MCP tool | PR-T3 |
| **PR-T4** | CLI + decision engine: integrate into `_cmd_increment`, remove warning | none | Decision-engine correctness (wrong mode = stale graph); CLI stderr format consistency | decision engine + CLI integration | PR-T3 |
| **PR-T5** | Brownfield closure refinement (optional, deferred) | none | Brownfield fanout rules must be formalised before narrowing | brownfield closure tests | PR-T4 |

Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow.
Expand All @@ -71,8 +71,8 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow.
| Schema migrations | **Full rebuild required** — ontology bump invalidates `.deps.json` via `ontology_version` check. |
| `_write_meta` in incremental | **Query live Kuzu DB for global stats** — the partial `GraphTables` accumulator only holds dirty-file data, so aggregation counts would be wrong. `_write_meta` in incremental mode runs a set of COUNT Cypher queries against the live DB to compute `routes_total`, `calls_total`, match breakdowns, etc. |
| `pass5_imperative_edges` and `asts` | **`pass5` does not use `asts`** (it does `del asts` and works from `tables.members`). Subset version mirrors this: `pass5_imperative_edges_subset(tables, dirty)` without an `asts` parameter. |
| `refresh_code_index` MCP tool | **Does not exist yet** — must be created in PR-T4. The proposal (INDEX-AUTO-MODE) specifies its schema; PR-T4 is the first implementation. |
| `refresh_decision.py` location | **Top-level module** (`refresh_decision.py`) — imported by both `java_codebase_rag/cli.py` and `server.py`. Lives alongside `build_ast_graph.py`. |
| Index building scope | **CLI-only** — no MCP tools for index refresh. The MCP server (`server.py`) is a read-only query interface. The decision engine lives in `refresh_decision.py`, imported by `java_codebase_rag/cli.py` and `java_codebase_rag/pipeline.py`. |
| `refresh_decision.py` location | **Top-level module** (`refresh_decision.py`) — imported by `java_codebase_rag/cli.py` and `java_codebase_rag/pipeline.py`. Lives alongside `build_ast_graph.py`. |
| Test fixture strategy | **Per-test fresh builds** for equivalence tests (Tier 3 in `tests/README.md`). Use `tests/_builders.py` helpers (`build_kuzu_full_into`, `build_graph_tables_to`) for full-rebuild baselines. Session fixtures (Tier 1/2) are read-only and cannot be mutated for incremental tests. |
| `graph_meta.last_rebuild_mode` | **Added in PR-T3** — string field on `GraphMeta` node: `"full"` or `"incremental"`. Used for fallback-rate monitoring (cross-PR risk #5). |

Expand Down Expand Up @@ -410,8 +410,8 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow.
`detected_changes: ChangeSet`. The `"auto"` mode from
INDEX-AUTO-MODE-PROPOSE is resolved to concrete `incremental`/`full`
by `_choose_refresh_mode` before returning — callers never see `"auto"`.
- `_detect_repo_changes(source_root, git_ref_base, changed_paths) -> ChangeSet`
git diff or hash-based change detection.
- `_detect_repo_changes(source_root, changed_paths, deps_index) -> ChangeSet`
explicit `changed_paths` or hash-based diff against `.deps.json`.
- `_choose_refresh_mode(changes: ChangeSet, deps_path: Path, total_files: int) -> RefreshDecision`
— implements the decision rules from `INDEX-AUTO-MODE-PROPOSE.md`:
- Full Kuzu when: deletes, renames, config changes, pipeline changes,
Expand Down Expand Up @@ -439,28 +439,17 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow.
--changed-paths <temp-file>`. The temp file contains newline-separated
paths, matching `build_ast_graph.py`'s `--changed-paths` contract.

### 4. `server.py`
- **Create** `refresh_code_index` MCP tool (does not exist yet). Accepts
optional inputs: `confirm: bool`, `mode: "auto" | "incremental" | "full"`,
`changed_paths: list[str] | null`, `git_ref_base: str`, `reason: str | null`.
- Dispatches to incremental or full Kuzu rebuild based on `RefreshDecision`.
- Includes `effective_mode`, `decision_reasons`, `detected_changes` in
response payload.
- Backward compatible: calls passing only `confirm=true` still work
(mode defaults to `"auto"`).

### 5. `tests/test_refresh_decision.py` (new)
### 4. `tests/test_refresh_decision.py` (new)
- `test_auto_modified_only_incremental` — modified-only changes → incremental.
- `test_auto_deleted_file_full_kuzu` — deletion → full Kuzu, incremental Lance.
- `test_auto_renamed_file_full_kuzu` — rename → full Kuzu.
- `test_auto_config_change_full` — `.java-codebase-rag.yml` change → full.
- `test_auto_detection_failure_full` — no git, no paths → full.
- `test_auto_detection_failure_full` — no paths, no `.deps.json` → incremental with empty changes.
- `test_explicit_full_overrides` — `mode=full` → full regardless.
- `test_deps_missing_full_kuzu` — no `.deps.json` → full Kuzu.
- `test_deps_stale_ontology_full_kuzu` — wrong version → full Kuzu.
- `test_backward_compat_confirm_only` — `confirm=true` only → auto mode.

### 6. `tests/test_cli_increment.py` (new or extend existing CLI tests)
### 5. `tests/test_cli_increment.py` (new or extend existing CLI tests)
- `test_increment_dispatches_kuzu_incremental` — mock pipeline, verify
`--changed-paths` passed.
- `test_increment_dispatches_kuzu_full_fallback` — mock pipeline, verify
Expand All @@ -476,33 +465,31 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow.
6. `test_explicit_full_overrides`
7. `test_deps_missing_full_kuzu`
8. `test_deps_stale_ontology_full_kuzu`
9. `test_backward_compat_confirm_only`
10. `test_increment_dispatches_kuzu_incremental`
11. `test_increment_dispatches_kuzu_full_fallback`
12. `test_increment_removes_kuzu_warning`
9. `test_increment_dispatches_kuzu_incremental`
10. `test_increment_dispatches_kuzu_full_fallback`
11. `test_increment_removes_kuzu_warning`

## Definition of done (PR-T4)
- `java-codebase-rag increment` updates both Lance and Kuzu incrementally
when safe.
- Decision engine isolated in `refresh_decision.py` with full test coverage.
- `_emit_increment_kuzu_warning()` removed from `java_codebase_rag/cli.py`.
- `refresh_code_index` MCP tool created in `server.py`, backward compatible.
- No MCP tools added — index building is CLI-only.
- All existing tests pass.
- `ruff check .` clean.

## Implementation step list
| # | Step | File(s) | Done when |
| --- | --- | --- | --- |
| 1 | Implement `ChangeSet` + `RefreshDecision` dataclasses | `refresh_decision.py` | Types defined |
| 2 | Implement `_detect_repo_changes` | `refresh_decision.py` | Git diff + hash fallback work |
| 3 | Implement `_choose_refresh_mode` | `refresh_decision.py` | All 9 decision tests pass |
| 2 | Implement `_detect_repo_changes` | `refresh_decision.py` | changed_paths + hash-based detection work |
| 3 | Implement `_choose_refresh_mode` | `refresh_decision.py` | All 8 decision tests pass |
| 4 | Add `run_build_ast_graph_incremental` to pipeline | `java_codebase_rag/pipeline.py` | Wrapper writes temp file, dispatches `--changed-paths` |
| 5 | Update `_cmd_increment` in CLI | `java_codebase_rag/cli.py` | Dispatches incremental or full based on decision |
| 6 | Remove `_emit_increment_kuzu_warning` + `_INCREMENT_WARNING_LINES` | `java_codebase_rag/cli.py` | No warning emitted |
| 7 | Create `refresh_code_index` MCP tool | `server.py` | New tool with `mode`/`changed_paths`/`git_ref_base` inputs |
| 8 | Write decision engine tests | `tests/test_refresh_decision.py` | All pass |
| 9 | Write CLI integration tests | `tests/test_cli_increment.py` | All pass |
| 10 | Run full test suite + ruff | all | Green |
| 7 | Write decision engine tests | `tests/test_refresh_decision.py` | All pass |
| 8 | Write CLI integration tests | `tests/test_cli_increment.py` | All pass |
| 9 | Run full test suite + ruff | all | Green |

---

Expand Down
Loading
Loading