diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7b5f956f..35d59adc 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -54,7 +54,7 @@ jobs: - name: Run tests if: steps.changes.outputs.code == 'true' env: - JAVA_CODEBASE_RAG_RUN_HEAVY: "0" + HF_TOKEN: ${{ secrets.HF_TOKEN }} run: pytest tests -v - name: Skip tests (docs-only) if: steps.changes.outputs.code != 'true' diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index 27ad800a..f9dc2eb1 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -21,26 +21,9 @@ index_dir_has_existing_artifacts, resolve_operator_config, ) -from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_cocoindex_drop, run_cocoindex_update +from java_codebase_rag.pipeline import clip, run_build_ast_graph, run_build_ast_graph_incremental, run_cocoindex_drop, run_cocoindex_update from java_ontology import VALID_UNRESOLVED_CALL_REASONS -KUZU_INCREMENTAL_TRACKING_ISSUE_URL = "https://github.com/HumanBean17/java-codebase-rag/issues/73" - -_INCREMENT_WARNING_LINES = ( - "WARNING: AST graph (Kuzu) incremental rebuild is not yet implemented.", - "The graph reflects the index state from the last `init` or `reprocess`,", - "which means `find`, `neighbors`, and `describe` may return stale results", - "for files changed since then.", - "", - "Lance vector index has been updated incrementally and is current.", - "", - "For an up-to-date graph, run:", - " java-codebase-rag reprocess", - "", - "Track progress on Kuzu incremental rebuild:", - f" {KUZU_INCREMENTAL_TRACKING_ISSUE_URL}", -) - _REFRESH_DEPRECATION = ( "WARN: 'refresh' is deprecated; use 'reprocess'. " "This alias will be removed in the next release." 
@@ -178,11 +161,6 @@ def _emit(value: Any) -> None: print(json.dumps(payload, default=_jsonable, sort_keys=True, indent=None)) -def _emit_increment_kuzu_warning() -> None: - for line in _INCREMENT_WARNING_LINES: - print(line, file=sys.stderr) - - def _parse_source_root(ns: argparse.Namespace) -> Path | None: if ns.source_root: return Path(ns.source_root).expanduser().resolve() @@ -298,15 +276,27 @@ def _cmd_increment(args: argparse.Namespace) -> int: cfg = _resolved_from_ns(args) _startup_hints(cfg) cfg.apply_to_os_environ() - _emit_increment_kuzu_warning() def work() -> int: + from refresh_decision import choose_refresh_mode + env = cfg.subprocess_env() + verbose = bool(args.verbose) + + # Decide refresh mode first so Lance mode is known + decision = choose_refresh_mode( + cfg.source_root, + cfg.kuzu_path, + mode="auto", + ) + + # Lance update — full when decision engine says so (config/pipeline change) + lance_full = decision.lance_mode == "full" coco = run_cocoindex_update( env, - full_reprocess=False, + full_reprocess=lance_full, quiet=bool(args.quiet), - verbose=bool(args.verbose), + verbose=verbose, lance_project_root=None if args.quiet else cfg.source_root, ) if coco.returncode != 0: @@ -320,7 +310,72 @@ def work() -> int: } ) return 1 - _emit({"success": True, "message": "increment completed (Lance only; graph may be stale — see stderr)"}) + + # Kuzu rebuild based on decision + if decision.kuzu_mode == "incremental" and decision.detected_changes.modified: + changed = set(decision.detected_changes.modified + decision.detected_changes.added) + if not args.quiet and verbose: + for r in decision.reasons: + print(f" [graph] {r}", file=sys.stderr) + g = run_build_ast_graph_incremental( + source_root=cfg.source_root, + kuzu_path=cfg.kuzu_path, + changed_paths=changed, + verbose=verbose, + quiet=bool(args.quiet), + env=env, + ) + if g.returncode != 0: + # Incremental failed — fall back to full + print( + f"[graph] incremental failed (exit {g.returncode}), falling 
back to full rebuild", + file=sys.stderr, + ) + g = run_build_ast_graph( + source_root=cfg.source_root, + kuzu_path=cfg.kuzu_path, + verbose=verbose, + quiet=bool(args.quiet), + env=env, + ) + if g.returncode != 0: + _emit( + { + "success": False, + "exit_code": g.returncode, + "stdout": clip(g.stdout, 4000), + "stderr": clip(g.stderr, 4000), + "message": f"graph builder exit {g.returncode}", + } + ) + return 1 + else: + # Full Kuzu rebuild + if not args.quiet: + for r in decision.reasons: + print(f" [graph] {r}", file=sys.stderr) + if decision.reasons: + print(" [graph] falling back to full Kuzu rebuild", file=sys.stderr) + g = run_build_ast_graph( + source_root=cfg.source_root, + kuzu_path=cfg.kuzu_path, + verbose=verbose, + quiet=bool(args.quiet), + env=env, + ) + if g.returncode != 0: + _emit( + { + "success": False, + "exit_code": g.returncode, + "stdout": clip(g.stdout, 4000), + "stderr": clip(g.stderr, 4000), + "message": f"graph builder exit {g.returncode}", + } + ) + return 1 + + _emit({"success": True, "message": "increment completed"}) return 0 return _run_with_pipeline_progress("increment", cfg, quiet=bool(args.quiet), work=work) @@ -615,7 +670,7 @@ def build_parser() -> argparse.ArgumentParser: "--quiet suppresses that stream; stdout remains the machine-readable payload.\n\n" "Lifecycle (manage the index):\n" " init Create a fresh index from a Java repository.\n" - " increment Pick up changes since the last index update (Lance only).\n" + " increment Pick up changes since the last index update (Lance + Kuzu incremental).\n" " reprocess Full vector + graph rebuild (default); optional --vectors-only / --graph-only.\n" " erase Delete the index from disk.\n\n" "Introspection (inspect the index):\n" @@ -650,7 +705,7 @@ def build_parser() -> argparse.ArgumentParser: increment = subparsers.add_parser( "increment", help="Pick up changes since the last index update.", - description="Runs cocoindex catch-up (no full reprocess). 
Does not rebuild Kuzu; see stderr warning.", + description="Runs cocoindex catch-up (no full reprocess). Kuzu graph updated incrementally when safe; full rebuild as fallback.", ) _add_index_embedding_flags(increment) _add_verbosity_flags(increment) diff --git a/java_codebase_rag/pipeline.py b/java_codebase_rag/pipeline.py index f1d34270..80093b27 100644 --- a/java_codebase_rag/pipeline.py +++ b/java_codebase_rag/pipeline.py @@ -5,6 +5,7 @@ import shutil import subprocess import sys +import tempfile import threading import time from pathlib import Path @@ -247,5 +248,73 @@ def run_build_ast_graph( return subprocess.CompletedProcess(args=cmd, returncode=code, stdout=out_s, stderr=err_s) +def run_build_ast_graph_incremental( + *, + source_root: Path, + kuzu_path: Path, + changed_paths: set[str], + verbose: bool, + quiet: bool = False, + env: dict[str, str] | None = None, +) -> subprocess.CompletedProcess[str]: + """Run build_ast_graph.py in incremental mode with --changed-paths.""" + builder = bundle_dir() / "build_ast_graph.py" + if not builder.is_file(): + return subprocess.CompletedProcess( + args=[], + returncode=126, + stdout="", + stderr=f"build_ast_graph.py not found under {builder.parent}", + ) + # Write changed paths to a temp file + tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".paths", delete=False, prefix="changed-", + ) + try: + for p in sorted(changed_paths): + tmp.write(p + "\n") + tmp.close() + + cmd: list[str] = [ + sys.executable, + str(builder), + "--source-root", + str(source_root), + "--kuzu-path", + str(kuzu_path), + "--changed-paths", + tmp.name, + ] + if verbose or not quiet: + cmd.append("--verbose") + if quiet: + return subprocess.run( + cmd, + cwd=str(source_root), + env=env or os.environ.copy(), + capture_output=True, + text=True, + ) + proc = subprocess.Popen( + cmd, + cwd=str(source_root), + env=env or os.environ.copy(), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=0, + ) + out_s, err_s, code = 
_popen_capturing_stderr(proc, verbose=verbose) + if not verbose: + from java_codebase_rag.cli_format import bold_cyan, styled_check, styled_cross + marker = styled_check() if code == 0 else styled_cross() + print(f"{marker} {bold_cyan('[graph]')} incremental done", file=sys.stderr, flush=True) + return subprocess.CompletedProcess(args=cmd, returncode=code, stdout=out_s, stderr=err_s) + finally: + try: + Path(tmp.name).unlink() + except OSError: + pass + + def clip(s: str, n: int) -> str: return s[-n:] if len(s) > n else s diff --git a/plans/active/PLAN-TIER2-INCREMENTAL-REBUILD.md b/plans/active/PLAN-TIER2-INCREMENTAL-REBUILD.md index a88f8633..33cfcdb8 100644 --- a/plans/active/PLAN-TIER2-INCREMENTAL-REBUILD.md +++ b/plans/active/PLAN-TIER2-INCREMENTAL-REBUILD.md @@ -53,7 +53,7 @@ Depends on: none (ontology 16 surface is stable; no pending PRs block this). | **PR-T1** | Foundation: `FileDeps` dataclass, `.deps.json` read/write, determinism test, perf baseline | none | `.deps.json` schema must be right first time (version field, field coverage for closure rules); determinism test coverage must surface divergence | determinism + deps-read/write | prerequisite only | | **PR-T2** | Symmetric delete helpers: `delete_*_for_file` for all node/edge types | none | Cypher DELETE must match current schema exactly; cascade semantics (Symbol delete must clean edges); count accuracy for verbose logging | per-node-type delete + cascade | PR-T1 (needs `.deps.json` read) | | **PR-T3** | Incremental orchestrator: `build_ast_graph_incremental`, `--changed-paths`, per-pass subset functions, closure expansion | none | Closure correctness (missing rule = silent divergence); transaction semantics; pass6/global-invariant; incremental-write functions | equivalence on all fixtures + closure expansion + subset passes | PR-T1 + PR-T2 | -| **PR-T4** | CLI + decision engine: integrate into `_cmd_increment`, remove warning, create `refresh_code_index` MCP tool | none | Decision-engine 
correctness (wrong mode = stale graph); `refresh_code_index` is new (not an update); CLI stderr format consistency | decision engine + CLI integration + MCP tool | PR-T3 | +| **PR-T4** | CLI + decision engine: integrate into `_cmd_increment`, remove warning | none | Decision-engine correctness (wrong mode = stale graph); CLI stderr format consistency | decision engine + CLI integration | PR-T3 | | **PR-T5** | Brownfield closure refinement (optional, deferred) | none | Brownfield fanout rules must be formalised before narrowing | brownfield closure tests | PR-T4 | Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. @@ -71,8 +71,8 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. | Schema migrations | **Full rebuild required** — ontology bump invalidates `.deps.json` via `ontology_version` check. | | `_write_meta` in incremental | **Query live Kuzu DB for global stats** — the partial `GraphTables` accumulator only holds dirty-file data, so aggregation counts would be wrong. `_write_meta` in incremental mode runs a set of COUNT Cypher queries against the live DB to compute `routes_total`, `calls_total`, match breakdowns, etc. | | `pass5_imperative_edges` and `asts` | **`pass5` does not use `asts`** (it does `del asts` and works from `tables.members`). Subset version mirrors this: `pass5_imperative_edges_subset(tables, dirty)` without an `asts` parameter. | -| `refresh_code_index` MCP tool | **Does not exist yet** — must be created in PR-T4. The proposal (INDEX-AUTO-MODE) specifies its schema; PR-T4 is the first implementation. | -| `refresh_decision.py` location | **Top-level module** (`refresh_decision.py`) — imported by both `java_codebase_rag/cli.py` and `server.py`. Lives alongside `build_ast_graph.py`. | +| Index building scope | **CLI-only** — no MCP tools for index refresh. The MCP server (`server.py`) is a read-only query interface. 
The decision engine lives in `refresh_decision.py`, imported by `java_codebase_rag/cli.py` and `java_codebase_rag/pipeline.py`. | +| `refresh_decision.py` location | **Top-level module** (`refresh_decision.py`) — imported by `java_codebase_rag/cli.py` and `java_codebase_rag/pipeline.py`. Lives alongside `build_ast_graph.py`. | | Test fixture strategy | **Per-test fresh builds** for equivalence tests (Tier 3 in `tests/README.md`). Use `tests/_builders.py` helpers (`build_kuzu_full_into`, `build_graph_tables_to`) for full-rebuild baselines. Session fixtures (Tier 1/2) are read-only and cannot be mutated for incremental tests. | | `graph_meta.last_rebuild_mode` | **Added in PR-T3** — string field on `GraphMeta` node: `"full"` or `"incremental"`. Used for fallback-rate monitoring (cross-PR risk #5). | @@ -410,8 +410,8 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. `detected_changes: ChangeSet`. The `"auto"` mode from INDEX-AUTO-MODE-PROPOSE is resolved to concrete `incremental`/`full` by `_choose_refresh_mode` before returning — callers never see `"auto"`. -- `_detect_repo_changes(source_root, git_ref_base, changed_paths) -> ChangeSet` - — git diff or hash-based change detection. +- `_detect_repo_changes(source_root, changed_paths, deps_index) -> ChangeSet` + — explicit `changed_paths` or hash-based diff against `.deps.json`. - `_choose_refresh_mode(changes: ChangeSet, deps_path: Path, total_files: int) -> RefreshDecision` — implements the decision rules from `INDEX-AUTO-MODE-PROPOSE.md`: - Full Kuzu when: deletes, renames, config changes, pipeline changes, @@ -439,28 +439,17 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. --changed-paths `. The temp file contains newline-separated paths, matching `build_ast_graph.py`'s `--changed-paths` contract. -### 4. `server.py` -- **Create** `refresh_code_index` MCP tool (does not exist yet). 
Accepts - optional inputs: `confirm: bool`, `mode: "auto" | "incremental" | "full"`, - `changed_paths: list[str] | null`, `git_ref_base: str`, `reason: str | null`. -- Dispatches to incremental or full Kuzu rebuild based on `RefreshDecision`. -- Includes `effective_mode`, `decision_reasons`, `detected_changes` in - response payload. -- Backward compatible: calls passing only `confirm=true` still work - (mode defaults to `"auto"`). - -### 5. `tests/test_refresh_decision.py` (new) +### 4. `tests/test_refresh_decision.py` (new) - `test_auto_modified_only_incremental` — modified-only changes → incremental. - `test_auto_deleted_file_full_kuzu` — deletion → full Kuzu, incremental Lance. - `test_auto_renamed_file_full_kuzu` — rename → full Kuzu. - `test_auto_config_change_full` — `.java-codebase-rag.yml` change → full. -- `test_auto_detection_failure_full` — no git, no paths → full. +- `test_auto_detection_failure_full` — no paths, no `.deps.json` → empty change set; Kuzu falls back to full (`.deps.json` missing), Lance stays incremental. - `test_explicit_full_overrides` — `mode=full` → full regardless. - `test_deps_missing_full_kuzu` — no `.deps.json` → full Kuzu. - `test_deps_stale_ontology_full_kuzu` — wrong version → full Kuzu. -- `test_backward_compat_confirm_only` — `confirm=true` only → auto mode. -### 6. `tests/test_cli_increment.py` (new or extend existing CLI tests) +### 5. `tests/test_cli_increment.py` (new or extend existing CLI tests) - `test_increment_dispatches_kuzu_incremental` — mock pipeline, verify `--changed-paths` passed. - `test_increment_dispatches_kuzu_full_fallback` — mock pipeline, verify @@ -476,17 +465,16 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. 6. `test_explicit_full_overrides` 7. `test_deps_missing_full_kuzu` 8. `test_deps_stale_ontology_full_kuzu` -9. `test_backward_compat_confirm_only` -10. `test_increment_dispatches_kuzu_incremental` -11. `test_increment_dispatches_kuzu_full_fallback` -12. `test_increment_removes_kuzu_warning` +9. 
`test_increment_dispatches_kuzu_incremental` +10. `test_increment_dispatches_kuzu_full_fallback` +11. `test_increment_removes_kuzu_warning` ## Definition of done (PR-T4) - `java-codebase-rag increment` updates both Lance and Kuzu incrementally when safe. - Decision engine isolated in `refresh_decision.py` with full test coverage. - `_emit_increment_kuzu_warning()` removed from `java_codebase_rag/cli.py`. -- `refresh_code_index` MCP tool created in `server.py`, backward compatible. +- No MCP tools added — index building is CLI-only. - All existing tests pass. - `ruff check .` clean. @@ -494,15 +482,14 @@ Landing order: **T1 -> T2 -> T3 -> T4**. PR-T5 is optional and may follow. | # | Step | File(s) | Done when | | --- | --- | --- | --- | | 1 | Implement `ChangeSet` + `RefreshDecision` dataclasses | `refresh_decision.py` | Types defined | -| 2 | Implement `_detect_repo_changes` | `refresh_decision.py` | Git diff + hash fallback work | -| 3 | Implement `_choose_refresh_mode` | `refresh_decision.py` | All 9 decision tests pass | +| 2 | Implement `_detect_repo_changes` | `refresh_decision.py` | changed_paths + hash-based detection work | +| 3 | Implement `_choose_refresh_mode` | `refresh_decision.py` | All 8 decision tests pass | | 4 | Add `run_build_ast_graph_incremental` to pipeline | `java_codebase_rag/pipeline.py` | Wrapper writes temp file, dispatches `--changed-paths` | | 5 | Update `_cmd_increment` in CLI | `java_codebase_rag/cli.py` | Dispatches incremental or full based on decision | | 6 | Remove `_emit_increment_kuzu_warning` + `_INCREMENT_WARNING_LINES` | `java_codebase_rag/cli.py` | No warning emitted | -| 7 | Create `refresh_code_index` MCP tool | `server.py` | New tool with `mode`/`changed_paths`/`git_ref_base` inputs | -| 8 | Write decision engine tests | `tests/test_refresh_decision.py` | All pass | -| 9 | Write CLI integration tests | `tests/test_cli_increment.py` | All pass | -| 10 | Run full test suite + ruff | all | Green | +| 7 | Write decision 
engine tests | `tests/test_refresh_decision.py` | All pass | +| 8 | Write CLI integration tests | `tests/test_cli_increment.py` | All pass | +| 9 | Run full test suite + ruff | all | Green | --- diff --git a/propose/active/INDEX-AUTO-MODE-PROPOSE.md b/propose/active/INDEX-AUTO-MODE-PROPOSE.md index 9a33cd6c..836465c6 100644 --- a/propose/active/INDEX-AUTO-MODE-PROPOSE.md +++ b/propose/active/INDEX-AUTO-MODE-PROPOSE.md @@ -38,20 +38,8 @@ No new commands. Existing commands gain automatic mode selection: - `java-codebase-rag reprocess --graph-only` — full Kuzu rebuild only. - `java-codebase-rag reprocess --vectors-only` — full Lance rebuild only. -### MCP tool (`refresh_code_index`) - -Add optional inputs: - -- `confirm: bool = false` (existing) -- `mode: "auto" | "incremental" | "full" = "auto"` -- `changed_paths: list[str] | null = null` -- `git_ref_base: str = "HEAD"` -- `reason: str | null = null` - -Backward compatibility: - -- Calls passing only `confirm=true` should still work. -- Default mode is `auto` (safe-by-default decisioning). +Index building is **CLI-only**. The MCP server (`server.py`) is a +read-only query interface and does not expose index-building tools. ## Decision Engine (`mode=auto`) @@ -95,12 +83,12 @@ class RefreshDecision: ## Change Detection Strategy -1. Prefer git diff status: - - `git diff --name-status ...HEAD` - - optionally include working tree status (`git diff --name-status` - and `git diff --name-status --cached`) -2. If git signal is unavailable, use `changed_paths` when supplied. -3. If still uncertain, fall back to `full`. +1. If `changed_paths` is provided, use those directly. +2. If not, hash-based diff against `.deps.json`: walk the source tree, + compute SHA-256 per `.java` file, compare against cached hashes. + New files = added, hash changed = modified, in index but not on + disk = deleted. +3. If neither is available, fall back to `full`. 
Represent results as: @@ -126,20 +114,17 @@ Kuzu may incrementally update independently. For example, if `.deps.json` is missing but no config changed, Lance could be incremental while Kuzu falls back to full. -## Response Payload Enhancements - -Add decision transparency fields to `refresh_code_index` response: +## CLI Output Enhancements -- `effective_mode: { lance: "incremental" | "full", kuzu: "incremental" | "full" }` -- `decision_reasons: list[str]` -- `detected_changes: { added, modified, deleted, renamed }` -- optional `warnings: list[str]` +Emit the mode decision to stderr with `[graph]` / `[vectors]` +prefixes consistent with existing progress output (see +`CLI-PROGRESS-OUTPUT-PROPOSE.md`). -Keep existing stdout/stderr/exit_code fields intact. +Decision transparency on stderr: -For the CLI (`increment` command), emit the mode decision to stderr -with `[graph]` / `[vectors]` prefixes consistent with existing -progress output (see `CLI-PROGRESS-OUTPUT-PROPOSE.md`). +- Effective mode (incremental or full) for Lance and Kuzu independently. +- Decision reasons (why full was chosen, if applicable). +- Detected changes summary (added, modified, deleted, renamed). ## Safety Policy @@ -173,9 +158,8 @@ progress output (see `CLI-PROGRESS-OUTPUT-PROPOSE.md`). - Make logs/user-facing messages explicit about why full mode was selected. - Preserve current subprocess environment and project-root behavior. -- The decision engine lives in a shared module (e.g. `index_common.py` - or a new `refresh_decision.py`) usable by both `cli.py` and - `server.py`. +- The decision engine lives in a shared module (`refresh_decision.py`) + usable by `cli.py` and `pipeline.py`. ## Rollout @@ -183,5 +167,4 @@ progress output (see `CLI-PROGRESS-OUTPUT-PROPOSE.md`). 2. Integrate into `cli.py`'s `_cmd_increment` — remove `_emit_increment_kuzu_warning()`, dispatch to Kuzu incremental or full based on decision. -3. Integrate into `server.py`'s `refresh_code_index` MCP tool. -4. 
Update `README.md` and `docs/JAVA-CODEBASE-RAG-CLI.md`. +3. Update `README.md` and `docs/JAVA-CODEBASE-RAG-CLI.md`. diff --git a/propose/active/TIER2-INCREMENTAL-REBUILD-PROPOSE.md b/propose/active/TIER2-INCREMENTAL-REBUILD-PROPOSE.md index 51548eb3..14c6683d 100644 --- a/propose/active/TIER2-INCREMENTAL-REBUILD-PROPOSE.md +++ b/propose/active/TIER2-INCREMENTAL-REBUILD-PROPOSE.md @@ -280,18 +280,17 @@ path is reachable via the source `Symbol`'s `path` field (or via ### 2.6 Change-detection sources -Three sources, in priority order: +Two sources, in priority order: 1. **`--changed-paths` flag** (or stdin list). Caller responsibility to be accurate. This is what the INDEX-AUTO-MODE decision engine would pass. -2. **Git diff** between `--git-ref-base` and `HEAD`. Same logic as - INDEX-AUTO-MODE §"Change Detection Strategy". -3. **Hash-based detection** using `.deps.json` `ext_hash` fields. Walk - the source tree, compute hashes, compare against cached. Slowest; - used as last-resort fallback. +2. **Hash-based detection** using `.deps.json` `ext_hash` fields. Walk + the source tree, compute SHA-256 hashes, compare against cached. + New files = added, hash changed = modified, in index but not on + disk = deleted. -If all three fail or are ambiguous, fall back to full rebuild. +If both fail or are ambiguous, fall back to full rebuild. ## 3. Pass-by-pass incrementalisation notes @@ -391,33 +390,9 @@ Decision engine inside `cli.py`'s `_cmd_increment`: The `_emit_increment_kuzu_warning()` call in `cli.py` is removed once this ships. -## 5. MCP integration +## 5. 
Determinism + correctness -`refresh_code_index` (MCP tool in `server.py`) gains the ability to -call the incremental Kuzu path via the CLI: - -```python -# Pseudocode inside server.py:refresh_code_index -decision = decide_mode(changed_paths, git_ref_base) -if decision.lance_mode == "incremental": - cocoindex_update(incremental=True, paths=decision.lance_paths) -else: - cocoindex_update(full=True) - -if decision.kuzu_mode == "incremental": - subprocess.run(["java-codebase-rag", "increment"]) -else: - subprocess.run(["java-codebase-rag", "reprocess", "--graph-only"]) -``` - -Decision engine returns two independent mode choices — LanceDB and -Kuzu may incrementally update independently. See -[`INDEX-AUTO-MODE-PROPOSE.md`](INDEX-AUTO-MODE-PROPOSE.md) for the -full decision engine specification. - -## 6. Determinism + correctness - -### 6.1 Determinism test +### 5.1 Determinism test Create `tests/test_incremental_equivalence.py` as a prerequisite: @@ -444,7 +419,7 @@ This is the single most important test in the entire feature. Without it, every incremental rebuild is a potential silent-divergence risk. Run against every fixture. -### 6.2 Failure modes +### 5.2 Failure modes - **Half-applied delete:** transaction wraps the entire incremental pass; on any exception, ROLLBACK and fall back to full rebuild. @@ -453,7 +428,7 @@ risk. Run against every fixture. - **Schema-version mismatch:** if `graph_meta.ontology_version` ≠ current `ONTOLOGY_VERSION`, force full rebuild. -## 7. Rollout +## 6. Rollout 1. **PR-T1 (foundation + determinism test):** Create `tests/test_incremental_equivalence.py` with full-rebuild @@ -473,8 +448,8 @@ risk. Run against every fixture. cover all fixtures. 4. **PR-T4 (CLI + decision engine):** Integrate INDEX-AUTO-MODE decision engine into `cli.py`'s `_cmd_increment`. Remove - `_emit_increment_kuzu_warning()`. Wire `refresh_code_index` - MCP tool to dispatch. + `_emit_increment_kuzu_warning()`. 
Index building is + CLI-only — no MCP tools for index refresh. 5. **PR-T5 (brownfield closure refinement, optional):** Narrow the pessimistic "any brownfield-override change → full" rule once Layer-4/5 fanout is explicitly documented. @@ -482,18 +457,18 @@ risk. Run against every fixture. PR-T1 through PR-T4 are the headline. PR-T5 is an optimisation on top. -## 8. Risk assessment +## 7. Risk assessment | Risk | Severity | Mitigation | |------|----------|------------| -| Silent divergence between full and incremental | **High** | `test_incremental_matches_full` mandatory per fixture; covered by §6.1 | +| Silent divergence between full and incremental | **High** | `test_incremental_matches_full` mandatory per fixture; covered by §5.1 | | Pessimistic full-fallback hides real incremental wins | Medium | Track full-fallback rate via `graph_meta.last_rebuild_mode` + reason; review monthly | | `.deps.json` becomes stale | Medium | `ontology_version` field check; mismatch → full rebuild | | Decision engine bugs cause unsafe incremental | High | Default-to-full on ambiguity; conservative initial rules; expand only after burn-in | | Pass2/Pass3 closure misses an edge type | Medium | Determinism test surfaces this immediately | | Performance regression (incremental slower than full for small repos) | Low | Heuristic: skip incremental when dirty set is >50% of files | -## 9. Performance estimate +## 8. Performance estimate On a hypothetical 1000-file Java repo: @@ -509,7 +484,7 @@ These are estimates — actual numbers depend on closure breadth and Kuzu transaction overhead. PR-T1 establishes a measured baseline on `bank-chat-system` and other fixtures. -## 10. Resolved TBDs +## 9. Resolved TBDs ### TBD-1: Brownfield closure granularity — **pessimistic fallback** @@ -543,7 +518,7 @@ A follow-on proposal should layer a filesystem watcher on top of this incremental path. It needs debouncing, batched-change semantics, and its own state machine. -## 11. 
Done definition (proposal-level) +## 10. Done definition (proposal-level) This proposal is "ready for plan derivation" when: diff --git a/refresh_decision.py b/refresh_decision.py new file mode 100644 index 00000000..9f10e4e9 --- /dev/null +++ b/refresh_decision.py @@ -0,0 +1,253 @@ +"""Decision engine for incremental vs full refresh of Lance + Kuzu indexes.""" +from __future__ import annotations + +import hashlib +import json +from dataclasses import dataclass, field +from pathlib import Path +from typing import Literal + + +@dataclass(frozen=True) +class ChangeSet: + added: tuple[str, ...] = () + modified: tuple[str, ...] = () + deleted: tuple[str, ...] = () + renamed: tuple[str, ...] = () + config_changed: bool = False + pipeline_changed: bool = False + meta_annotation_changed: bool = False + + +@dataclass(frozen=True) +class RefreshDecision: + lance_mode: Literal["incremental", "full"] + kuzu_mode: Literal["incremental", "full"] + reasons: tuple[str, ...] = () + detected_changes: ChangeSet = field(default_factory=ChangeSet) + + +_CONFIG_FILES = { + ".java-codebase-rag.yml", + ".lancedb-mcp.yml", +} + +_PIPELINE_FILES = { + "java_index_flow_lancedb.py", + "build_ast_graph.py", + "graph_enrich.py", +} + + +def _any_match(paths: tuple[str, ...], names: set[str]) -> bool: + return any(Path(p).name in names for p in paths) + + +def _all_java_paths(changes: ChangeSet) -> tuple[str, ...]: + return changes.added + changes.modified + changes.deleted + changes.renamed + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + h.update(path.read_bytes()) + return f"sha256:{h.hexdigest()}" + + +def _detect_repo_changes( + source_root: Path, + *, + changed_paths: list[str] | None = None, + deps_index: dict | None = None, +) -> ChangeSet: + """Detect repository changes via changed_paths or hash-based diff against .deps.json.""" + added: list[str] = [] + modified: list[str] = [] + deleted: list[str] = [] + all_paths: list[str] = [] + + if changed_paths is not None: + for p 
in changed_paths: + all_paths.append(p) + modified.append(p) + elif deps_index is not None: + files_index = deps_index.get("files", {}) + cached_paths = set(files_index.keys()) + + # Walk source tree for .java files and compare hashes + on_disk: set[str] = set() + for java_file in source_root.rglob("*.java"): + rel = str(java_file.relative_to(source_root)) + on_disk.add(rel) + current_hash = _sha256(java_file) + cached = files_index.get(rel) + if cached is None: + added.append(rel) + all_paths.append(rel) + elif cached.get("ext_hash") != current_hash: + modified.append(rel) + all_paths.append(rel) + + # Files in index but no longer on disk + for rel in cached_paths - on_disk: + deleted.append(rel) + all_paths.append(rel) + + all_t = tuple(all_paths) + config_changed = _any_match(all_t, _CONFIG_FILES) + pipeline_changed = _any_match(all_t, _PIPELINE_FILES) + meta_annotation_changed = False # deferred to PR-T5 brownfield closure refinement + + return ChangeSet( + added=tuple(added), + modified=tuple(modified), + deleted=tuple(deleted), + renamed=(), + config_changed=config_changed, + pipeline_changed=pipeline_changed, + meta_annotation_changed=meta_annotation_changed, + ) + + +def _read_deps_ontology_version(kuzu_path: Path) -> int | None: + """Read ontology_version from .deps.json sidecar. 
Returns None if missing/stale.""" + deps_path = kuzu_path.parent / ".deps.json" + if not deps_path.is_file(): + return None + try: + raw = json.loads(deps_path.read_text()) + return int(raw.get("ontology_version", 0)) + except (json.JSONDecodeError, OSError, ValueError): + return None + + +def _count_deps_files(kuzu_path: Path) -> int: + """Count total files tracked in .deps.json.""" + deps_path = kuzu_path.parent / ".deps.json" + if not deps_path.is_file(): + return 0 + try: + raw = json.loads(deps_path.read_text()) + return len(raw.get("files", {})) + except (json.JSONDecodeError, OSError): + return 0 + + +def _read_deps_index(kuzu_path: Path) -> dict | None: + """Read full .deps.json content. Returns None if missing or corrupt.""" + deps_path = kuzu_path.parent / ".deps.json" + if not deps_path.is_file(): + return None + try: + return json.loads(deps_path.read_text()) + except (json.JSONDecodeError, OSError): + return None + + +def _current_ontology_version() -> int: + """Import and return the current ontology version from ast_java.""" + from ast_java import ONTOLOGY_VERSION + + return ONTOLOGY_VERSION + + +def _choose_refresh_mode( + changes: ChangeSet, + *, + kuzu_path: Path, + mode: Literal["auto", "incremental", "full"] = "auto", +) -> RefreshDecision: + """Choose refresh mode for Lance and Kuzu based on detected changes.""" + reasons: list[str] = [] + + # Explicit full overrides everything + if mode == "full": + return RefreshDecision( + lance_mode="full", + kuzu_mode="full", + reasons=("explicit full mode requested",), + detected_changes=changes, + ) + + # --- Kuzu mode --- + kuzu_mode: Literal["incremental", "full"] = "incremental" + + if changes.deleted: + kuzu_mode = "full" + reasons.append(f"deleted files detected ({len(changes.deleted)})") + elif changes.renamed: + kuzu_mode = "full" + reasons.append(f"renamed files detected ({len(changes.renamed)})") + + if changes.config_changed: + kuzu_mode = "full" + reasons.append("config file changed") + + if 
changes.pipeline_changed: + kuzu_mode = "full" + reasons.append("indexing pipeline file changed") + + if changes.meta_annotation_changed and kuzu_mode != "full": + kuzu_mode = "full" + reasons.append("meta-annotation file changed") + + # .deps.json checks + deps_ov = _read_deps_ontology_version(kuzu_path) + current_ov = _current_ontology_version() + if deps_ov is None: + kuzu_mode = "full" + reasons.append(".deps.json missing or corrupt") + elif deps_ov != current_ov: + kuzu_mode = "full" + reasons.append( + f".deps.json ontology_version {deps_ov} != current {current_ov}" + ) + + # >50% dirty heuristic + if kuzu_mode == "incremental": + total = _count_deps_files(kuzu_path) + dirty_count = len(changes.added) + len(changes.modified) + len(changes.deleted) + len(changes.renamed) + if total and dirty_count > 0.5 * total: + kuzu_mode = "full" + reasons.append(f"dirty set {dirty_count}/{total} > 50%") + + # --- Lance mode --- + lance_mode: Literal["incremental", "full"] = "incremental" + if changes.config_changed: + lance_mode = "full" + if "config file changed" not in reasons: + reasons.append("config file changed") + if changes.pipeline_changed: + lance_mode = "full" + if "indexing pipeline file changed" not in reasons: + reasons.append("indexing pipeline file changed") + + return RefreshDecision( + lance_mode=lance_mode, + kuzu_mode=kuzu_mode, + reasons=tuple(reasons), + detected_changes=changes, + ) + + +def choose_refresh_mode( + source_root: Path, + kuzu_path: Path, + *, + mode: Literal["auto", "incremental", "full"] = "auto", + changed_paths: list[str] | None = None, +) -> RefreshDecision: + """Public API: detect changes and choose refresh mode. + + Change detection order: + 1. Explicit ``changed_paths`` if provided. + 2. Hash-based diff against ``.deps.json`` (walk source tree, compare + SHA-256 hashes). + 3. Empty change set if neither is available. 
+ """ + deps_index = _read_deps_index(kuzu_path) if changed_paths is None else None + changes = _detect_repo_changes( + source_root, + changed_paths=changed_paths, + deps_index=deps_index, + ) + return _choose_refresh_mode(changes, kuzu_path=kuzu_path, mode=mode) diff --git a/server.py b/server.py index 31f67306..088d6312 100644 --- a/server.py +++ b/server.py @@ -32,7 +32,7 @@ "Unknown filter keys and populated fields not applicable to the effective node kind fail with success=false and message. " "Edge labels: EXTENDS, IMPLEMENTS, INJECTS, OVERRIDES, DECLARES, DECLARES_CLIENT, DECLARES_PRODUCER, CALLS, EXPOSES, HTTP_CALLS, ASYNC_CALLS; " "type Symbols may also use composed neighbors edge_types DECLARES.DECLARES_CLIENT, DECLARES.DECLARES_PRODUCER, DECLARES.EXPOSES (out only). " - "Reprocess/init, meta, tables, diagnose-ignore, analyze-pr: use java-codebase-rag CLI — not MCP." + "Reprocess/init, increment, meta, tables, diagnose-ignore, analyze-pr: use java-codebase-rag CLI — not MCP." ) diff --git a/tests/test_cli_increment.py b/tests/test_cli_increment.py new file mode 100644 index 00000000..128a3e79 --- /dev/null +++ b/tests/test_cli_increment.py @@ -0,0 +1,135 @@ +"""Tests for CLI increment command Kuzu integration.""" +from __future__ import annotations + +import io +from pathlib import Path +from unittest.mock import MagicMock, patch + +from java_codebase_rag import cli as cli_mod +from java_codebase_rag.config import ResolvedOperatorConfig +from refresh_decision import ChangeSet, RefreshDecision + + +def _make_cfg(tmp_path: Path) -> ResolvedOperatorConfig: + return ResolvedOperatorConfig( + source_root=tmp_path / "src", + index_dir=tmp_path / "idx", + kuzu_path=tmp_path / "idx" / "code_graph.kuzu", + cocoindex_db=tmp_path / "idx" / "cocoindex.db", + embedding_model="test-model", + embedding_device=None, + hints_enabled=False, + index_dir_source="default", + embedding_model_source="default", + embedding_device_source="default", + 
hints_enabled_source="default", + ) + + +def _make_args(**overrides) -> object: + defaults = { + "source_root": None, + "index_dir": None, + "embedding_model": None, + "embedding_device": None, + "quiet": True, + "verbose": False, + } + defaults.update(overrides) + return MagicMock(**defaults) + + +def _mock_completed_process(returncode: int = 0, stdout: str = "", stderr: str = "") -> MagicMock: + p = MagicMock() + p.returncode = returncode + p.stdout = stdout + p.stderr = stderr + p.args = ["build_ast_graph.py"] + return p + + +def test_increment_dispatches_kuzu_incremental(tmp_path: Path) -> None: + """When decision engine returns incremental, CLI should call run_build_ast_graph_incremental.""" + cfg = _make_cfg(tmp_path) + cfg.source_root.mkdir(parents=True, exist_ok=True) + cfg.index_dir.mkdir(parents=True, exist_ok=True) + + decision = RefreshDecision( + lance_mode="incremental", + kuzu_mode="incremental", + reasons=(), + detected_changes=ChangeSet(modified=("src/Foo.java",)), + ) + + with ( + patch("java_codebase_rag.cli._resolved_from_ns", return_value=cfg), + patch("java_codebase_rag.cli._startup_hints"), + patch("java_codebase_rag.cli.run_cocoindex_update", return_value=_mock_completed_process()), + patch("refresh_decision.choose_refresh_mode", return_value=decision), + patch("java_codebase_rag.cli.run_build_ast_graph_incremental", return_value=_mock_completed_process()) as mock_incr, + patch("java_codebase_rag.cli.run_build_ast_graph", return_value=_mock_completed_process()) as mock_full, + patch("sys.stdout", new_callable=io.StringIO), + ): + args = _make_args() + code = cli_mod._cmd_increment(args) + assert code == 0 + mock_incr.assert_called_once() + mock_full.assert_not_called() + + +def test_increment_dispatches_kuzu_full_fallback(tmp_path: Path) -> None: + """When decision engine returns full, CLI should call run_build_ast_graph (full).""" + cfg = _make_cfg(tmp_path) + cfg.source_root.mkdir(parents=True, exist_ok=True) + 
cfg.index_dir.mkdir(parents=True, exist_ok=True) + + decision = RefreshDecision( + lance_mode="full", + kuzu_mode="full", + reasons=(".deps.json missing or corrupt",), + detected_changes=ChangeSet(deleted=("src/Foo.java",)), + ) + + with ( + patch("java_codebase_rag.cli._resolved_from_ns", return_value=cfg), + patch("java_codebase_rag.cli._startup_hints"), + patch("java_codebase_rag.cli.run_cocoindex_update", return_value=_mock_completed_process()), + patch("refresh_decision.choose_refresh_mode", return_value=decision), + patch("java_codebase_rag.cli.run_build_ast_graph_incremental", return_value=_mock_completed_process()) as mock_incr, + patch("java_codebase_rag.cli.run_build_ast_graph", return_value=_mock_completed_process()) as mock_full, + patch("sys.stdout", new_callable=io.StringIO), + ): + args = _make_args() + code = cli_mod._cmd_increment(args) + assert code == 0 + mock_incr.assert_not_called() + mock_full.assert_called_once() + + +def test_increment_removes_kuzu_warning(tmp_path: Path) -> None: + """Verify no Kuzu incremental warning is emitted on stderr.""" + cfg = _make_cfg(tmp_path) + cfg.source_root.mkdir(parents=True, exist_ok=True) + cfg.index_dir.mkdir(parents=True, exist_ok=True) + + decision = RefreshDecision( + lance_mode="incremental", + kuzu_mode="incremental", + reasons=(), + detected_changes=ChangeSet(modified=("src/Foo.java",)), + ) + + with ( + patch("java_codebase_rag.cli._resolved_from_ns", return_value=cfg), + patch("java_codebase_rag.cli._startup_hints"), + patch("java_codebase_rag.cli.run_cocoindex_update", return_value=_mock_completed_process()), + patch("refresh_decision.choose_refresh_mode", return_value=decision), + patch("java_codebase_rag.cli.run_build_ast_graph_incremental", return_value=_mock_completed_process()), + patch("sys.stdout", new_callable=io.StringIO), + patch("sys.stderr", new_callable=io.StringIO) as mock_stderr, + ): + args = _make_args() + cli_mod._cmd_increment(args) + stderr = mock_stderr.getvalue() + assert 
"not yet implemented" not in stderr + assert "graph may be stale" not in stderr diff --git a/tests/test_java_codebase_rag_cli.py b/tests/test_java_codebase_rag_cli.py index 1d67cb77..625a2dde 100644 --- a/tests/test_java_codebase_rag_cli.py +++ b/tests/test_java_codebase_rag_cli.py @@ -34,6 +34,13 @@ def _cocoindex_available() -> bool: return (Path(sys.executable).parent / "cocoindex").is_file() +def _heavy_ok() -> bool: + """Integration tests that need cocoindex + SentenceTransformer model download.""" + if os.environ.get("JAVA_CODEBASE_RAG_RUN_HEAVY", "1") == "0": + return False + return _cocoindex_available() + + def _base_env(corpus_root: Path, kuzu_db_path: Path | None = None) -> dict[str, str]: env = os.environ.copy() env["JAVA_CODEBASE_RAG_SOURCE_ROOT"] = str(corpus_root) @@ -320,7 +327,7 @@ def test_refresh_hidden_alias_deprecates_on_stderr(tmp_path: Path) -> None: assert "reprocess" in err.lower() -@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") +@pytest.mark.skipif(not _heavy_ok(), reason="needs cocoindex + model download (RUN_HEAVY=0 skips)") def test_increment_emits_kuzu_stale_warning_block( corpus_root: Path, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -369,7 +376,7 @@ def test_legacy_env_var_set_emits_stderr_hint(monkeypatch: pytest.MonkeyPatch, t assert err.count("LANCEDB_URI") == 1 -@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") +@pytest.mark.skipif(not _heavy_ok(), reason="needs cocoindex + model download (RUN_HEAVY=0 skips)") def test_init_after_erase_succeeds(corpus_root: Path, tmp_path: Path) -> None: idx = tmp_path / "lifecycle_idx" idx.mkdir(parents=True) @@ -388,7 +395,7 @@ def test_init_after_erase_succeeds(corpus_root: Path, tmp_path: Path) -> None: assert init.returncode == 0, init.stdout + init.stderr -@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") +@pytest.mark.skipif(not _heavy_ok(), 
reason="needs cocoindex + model download (RUN_HEAVY=0 skips)") def test_cli_lifecycle_round_trip_init_increment_meta_erase( corpus_root: Path, tmp_path: Path, ) -> None: @@ -421,7 +428,7 @@ def test_cli_lifecycle_round_trip_init_increment_meta_erase( assert er.returncode == 0, er.stderr -@pytest.mark.skipif(not _cocoindex_available(), reason="cocoindex not installed in venv") +@pytest.mark.skipif(not _heavy_ok(), reason="needs cocoindex + model download (RUN_HEAVY=0 skips)") def test_increment_updates_lance_after_touch_java_file(corpus_root: Path, tmp_path: Path) -> None: import lancedb # noqa: PLC0415 @@ -897,8 +904,8 @@ def isatty(self) -> bool: def test_cli_reprocess_builds_kuzu_path(corpus_root, tmp_path) -> None: - if not _cocoindex_available(): - pytest.skip("cocoindex CLI missing") + if not _heavy_ok(): + pytest.skip("cocoindex + model download not available") idx = tmp_path / "rep_idx" env = os.environ.copy() env["JAVA_CODEBASE_RAG_INDEX_DIR"] = str(idx) diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index 1d80ea43..0299f980 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -117,6 +117,7 @@ def _fake_search_rows() -> list[dict[str, Any]]: def test_search_basic_returns_hits_with_symbol_id(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", graph=kuzu_graph) assert out.success is True @@ -125,6 +126,7 @@ def test_search_basic_returns_hits_with_symbol_id(monkeypatch, kuzu_graph) -> No def test_search_filter_microservice(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", filter={"microservice": "chat-assign"}, graph=kuzu_graph) assert out.success is True @@ -133,6 +135,7 @@ def 
test_search_filter_microservice(monkeypatch, kuzu_graph) -> None: def test_search_path_contains_filter(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatAssign", path_contains="ChatAssign", graph=kuzu_graph) assert out.success is True @@ -566,6 +569,7 @@ async def test_search_invalid_table_rejected(mcp_server) -> None: def test_search_filter_accepts_json_string(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) want = {"microservice": "chat-assign"} out_dict = search_v2("ChatService", filter=want, graph=kuzu_graph) @@ -576,6 +580,7 @@ def test_search_filter_accepts_json_string(monkeypatch, kuzu_graph) -> None: def test_search_unknown_filter_key_returns_failure(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", filter={"typo_key": "x"}, graph=kuzu_graph) assert out.success is False @@ -585,6 +590,7 @@ def test_search_unknown_filter_key_returns_failure(monkeypatch, kuzu_graph) -> N def test_search_cross_kind_filter_returns_failure(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", filter={"path_prefix": "/api"}, graph=kuzu_graph) assert out.success is False @@ -594,6 +600,7 @@ def test_search_cross_kind_filter_returns_failure(monkeypatch, kuzu_graph) -> No def test_search_filter_empty_string_treated_as_none(monkeypatch, kuzu_graph) -> None: + 
monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) baseline = search_v2("ChatService", graph=kuzu_graph) empty = search_v2("ChatService", filter="", graph=kuzu_graph) @@ -605,6 +612,7 @@ def test_search_filter_empty_string_treated_as_none(monkeypatch, kuzu_graph) -> def test_search_filter_json_null_treated_as_none(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) baseline = search_v2("ChatService", graph=kuzu_graph) out = search_v2("ChatService", filter="null", graph=kuzu_graph) @@ -672,6 +680,7 @@ def test_neighbors_validate_call_still_raises(kuzu_graph) -> None: def test_filter_invalid_json_returns_failure(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", filter="{not json", graph=kuzu_graph) assert out.success is False @@ -711,6 +720,7 @@ def test_wildcard_question_mark_in_fqn_prefix_rejected(kuzu_graph) -> None: def test_search_wildcard_in_fqn_prefix_rejected_without_run_search(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) calls: list[int] = [] def boom(*_a, **_k): diff --git a/tests/test_mcp_v2_compose.py b/tests/test_mcp_v2_compose.py index 6c060924..a9b6d9d8 100644 --- a/tests/test_mcp_v2_compose.py +++ b/tests/test_mcp_v2_compose.py @@ -188,6 +188,7 @@ def test_describe_edge_summary_for_route(kuzu_graph) -> None: def test_search_populates_symbol_id_when_chunk_rooted_in_symbol(monkeypatch, kuzu_graph) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) rows: list[dict[str, Any]] = [ { "filename": "A.java", @@ 
-232,14 +233,16 @@ def test_search_populates_symbol_id_when_chunk_rooted_in_symbol(monkeypatch, kuz assert all(hit.symbol_id is not None for hit in rooted) -def test_meta_returns_per_edge_type_counts() -> None: - out = _graph_meta_output() - assert out.success is True - assert set(out.edge_counts.keys()) == set(_EDGE_TYPES) - assert all(int(v) >= 0 for v in out.edge_counts.values()) +def test_meta_returns_per_edge_type_counts(kuzu_graph) -> None: + meta = kuzu_graph.meta() + assert "error" not in meta + edge_counts = meta.get("edge_counts", {}) + assert set(edge_counts.keys()) == set(_EDGE_TYPES) + assert all(int(v) >= 0 for v in edge_counts.values()) def test_search_describe_neighbors_chain_end_to_end(kuzu_graph, monkeypatch) -> None: + monkeypatch.setattr("mcp_v2._get_sentence_transformer", lambda *a, **kw: None) node_id, _ = _method_with_incoming_calls(kuzu_graph) rows = kuzu_graph._rows( # noqa: SLF001 "MATCH (m:Symbol {id: $id}) RETURN m.fqn AS fqn, m.role AS role, m.module AS module, " diff --git a/tests/test_refresh_decision.py b/tests/test_refresh_decision.py new file mode 100644 index 00000000..feb760e1 --- /dev/null +++ b/tests/test_refresh_decision.py @@ -0,0 +1,175 @@ +"""Tests for refresh_decision.py decision engine.""" +from __future__ import annotations + +import json +from pathlib import Path + +import pytest + +from refresh_decision import ( + ChangeSet, + _choose_refresh_mode, + _current_ontology_version, + choose_refresh_mode, +) + + +def _make_deps_json( + path: Path, + ontology_version: int, + file_count: int = 5, + file_hashes: dict[str, str] | None = None, +) -> None: + """Write a valid .deps.json to the kuzu_path's parent directory.""" + deps_path = path.parent / ".deps.json" + deps_path.parent.mkdir(parents=True, exist_ok=True) + files = {} + for i in range(file_count): + name = f"src/File{i}.java" + files[name] = { + "ext_hash": file_hashes.get(name, "sha256:abc") if file_hashes else "sha256:abc", + "declares": [], + "injects": [], + 
"extends": [], + "calls": [], + "uses_anno": [], + "overrides": [], + "declares_clients": [], + "declares_producers": [], + } + deps_path.write_text(json.dumps({ + "version": 1, + "ontology_version": ontology_version, + "files": files, + })) + + +@pytest.fixture +def tmp_kuzu_path(tmp_path: Path) -> Path: + """Provide a temp kuzu path with valid .deps.json.""" + kuzu = tmp_path / "code_graph.kuzu" + _make_deps_json(kuzu, _current_ontology_version()) + return kuzu + + +def test_auto_modified_only_incremental(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(modified=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "incremental" + assert decision.lance_mode == "incremental" + + +def test_auto_deleted_file_full_kuzu(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(deleted=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "full" + assert decision.lance_mode == "incremental" + assert any("deleted" in r for r in decision.reasons) + + +def test_auto_renamed_file_full_kuzu(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(renamed=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "full" + assert decision.lance_mode == "incremental" + assert any("renamed" in r for r in decision.reasons) + + +def test_auto_config_change_full(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(modified=(".java-codebase-rag.yml",), config_changed=True) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "full" + assert decision.lance_mode == "full" + assert any("config" in r for r in decision.reasons) + + +def test_auto_empty_changes_incremental(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + kuzu.parent.mkdir(parents=True, exist_ok=True) + _make_deps_json(kuzu, 
_current_ontology_version()) + changes = ChangeSet() + decision = _choose_refresh_mode(changes, kuzu_path=kuzu, mode="auto") + assert decision.kuzu_mode == "incremental" + + +def test_explicit_full_overrides(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(modified=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="full") + assert decision.kuzu_mode == "full" + assert decision.lance_mode == "full" + assert any("explicit" in r for r in decision.reasons) + + +def test_deps_missing_full_kuzu(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + kuzu.parent.mkdir(parents=True, exist_ok=True) + changes = ChangeSet(modified=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=kuzu, mode="auto") + assert decision.kuzu_mode == "full" + assert any("deps" in r for r in decision.reasons) + + +def test_deps_stale_ontology_full_kuzu(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + _make_deps_json(kuzu, ontology_version=0) + changes = ChangeSet(modified=("src/Foo.java",)) + decision = _choose_refresh_mode(changes, kuzu_path=kuzu, mode="auto") + assert decision.kuzu_mode == "full" + assert any("ontology" in r for r in decision.reasons) + + +def test_hash_based_detects_new_file(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + _make_deps_json(kuzu, _current_ontology_version(), file_count=0) + # Create a .java file on disk not in the index + src = tmp_path / "src" + src.mkdir() + (src / "NewFile.java").write_text("class NewFile {}") + decision = choose_refresh_mode(tmp_path, kuzu, mode="auto") + assert "src/NewFile.java" in decision.detected_changes.added + + +def test_hash_based_detects_modified_file(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + src = tmp_path / "src" + src.mkdir() + f = src / "File0.java" + f.write_text("original") + _make_deps_json(kuzu, _current_ontology_version(), file_count=1) + # Change the file content (hash will differ from cached 
"sha256:abc") + f.write_text("modified") + decision = choose_refresh_mode(tmp_path, kuzu, mode="auto") + assert "src/File0.java" in decision.detected_changes.modified + + +def test_hash_based_detects_deleted_file(tmp_path: Path) -> None: + kuzu = tmp_path / "code_graph.kuzu" + _make_deps_json(kuzu, _current_ontology_version(), file_count=1) + # File in index but not on disk → deleted + decision = choose_refresh_mode(tmp_path, kuzu, mode="auto") + assert "src/File0.java" in decision.detected_changes.deleted + + +def test_pipeline_changed_full(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(modified=("build_ast_graph.py",), pipeline_changed=True) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "full" + assert decision.lance_mode == "full" + assert any("pipeline" in r for r in decision.reasons) + + +def test_meta_annotation_changed_full(tmp_kuzu_path: Path) -> None: + changes = ChangeSet(modified=("src/CustomAnnotation.java",), meta_annotation_changed=True) + decision = _choose_refresh_mode(changes, kuzu_path=tmp_kuzu_path, mode="auto") + assert decision.kuzu_mode == "full" + assert any("meta-annotation" in r for r in decision.reasons) + + +def test_large_dirty_set_full(tmp_path: Path) -> None: + """When >50% files are dirty, fall back to full rebuild.""" + kuzu = tmp_path / "code_graph.kuzu" + _make_deps_json(kuzu, _current_ontology_version(), file_count=3) + changes = ChangeSet(modified=("src/File0.java", "src/File1.java")) + decision = _choose_refresh_mode(changes, kuzu_path=kuzu, mode="auto") + assert decision.kuzu_mode == "full" + assert any("50%" in r for r in decision.reasons)