diff --git a/ast_java.py b/ast_java.py index c3dc2395..8fbdf33b 100644 --- a/ast_java.py +++ b/ast_java.py @@ -1868,6 +1868,11 @@ def _collect_outgoing_calls( file_rel: str, ) -> list[OutgoingCallDecl]: del project_root + from java_ontology import ( # deferred: java_ontology imports ast_java + CLIENT_KIND_FEIGN_METHOD, + CLIENT_KIND_REST_TEMPLATE, + CLIENT_KIND_WEB_CLIENT, + ) out: list[OutgoingCallDecl] = [] method_fqn = f"{type_fqn}#{method_decl.signature}" type_mods = _find_modifiers_child(type_node) if type_node is not None else None @@ -1899,7 +1904,7 @@ def _collect_outgoing_calls( OutgoingCallDecl( method_fqn=method_fqn, method_sig=method_decl.signature, - client_kind="feign_method", + client_kind=CLIENT_KIND_FEIGN_METHOD, channel="http", feign_target_name=feign_target_name, feign_target_url=feign_target_url, @@ -1998,7 +2003,7 @@ def visit(n: Node) -> None: OutgoingCallDecl( method_fqn=method_fqn, method_sig=method_decl.signature, - client_kind="rest_template", + client_kind=CLIENT_KIND_REST_TEMPLATE, channel="http", feign_target_name="", feign_target_url="", @@ -2051,7 +2056,7 @@ def visit(n: Node) -> None: OutgoingCallDecl( method_fqn=method_fqn, method_sig=method_decl.signature, - client_kind="web_client", + client_kind=CLIENT_KIND_WEB_CLIENT, channel="http", feign_target_name="", feign_target_url="", diff --git a/build_ast_graph.py b/build_ast_graph.py index 3e75a32c..980f6809 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -66,7 +66,13 @@ symbol_id, ) from path_filtering import LayeredIgnore, iter_java_source_files -from java_ontology import VALID_CLIENT_KINDS, VALID_HTTP_CALL_MATCHES, VALID_PRODUCER_KINDS +from java_ontology import ( + CLIENT_KIND_FEIGN_METHOD, + CLIENT_KIND_REST_TEMPLATE, + VALID_CLIENT_KINDS, + VALID_HTTP_CALL_MATCHES, + VALID_PRODUCER_KINDS, +) log = logging.getLogger(__name__) @@ -2382,7 +2388,7 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str: ) rid = "" strategy = call.resolution_strategy - if call.client_kind == "feign_method": + if call.client_kind == CLIENT_KIND_FEIGN_METHOD: exposing = next((e for e in tables.exposes_rows if e.symbol_id == member.node_id), None) if exposing is not None: rid = exposing.route_id @@ -2585,7 +2591,7 @@ def _match_call_edge( return "unresolved", [] candidates: list[RouteRow] = [] - if call.client_kind == "feign_method": + if call.client_kind == CLIENT_KIND_FEIGN_METHOD: # Prefer endpoint matching by target service + path/method for Feign declarations. path_value = call.path_template_call method_value = call.method_call @@ -2714,7 +2720,7 @@ def _micro_factor(member: MemberEntry | None) -> float: if src_route is None and member is not None: # Recover feign caller hints from persisted caller-side Client declarations. for client in client_hints_by_member.get(member.node_id, ()): - if client.client_kind != "feign_method": + if client.client_kind != CLIENT_KIND_FEIGN_METHOD: continue path_template, path_regex = _normalize_path(client.path) src_route = RouteRow( @@ -2750,7 +2756,7 @@ def _micro_factor(member: MemberEntry | None) -> float: call = OutgoingCallDecl( method_fqn=f"{member.parent_fqn}#{member.decl.signature}" if member else "", method_sig=member.decl.signature if member else "", - client_kind="feign_method" if _feign_like else "rest_template", + client_kind=CLIENT_KIND_FEIGN_METHOD if _feign_like else CLIENT_KIND_REST_TEMPLATE, channel="http", feign_target_name=src_route.feign_name if src_route else "", feign_target_url=src_route.feign_url if src_route else "", @@ -3424,13 +3430,15 @@ def _write_edges(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id _bulk_copy(conn, "OVERRIDES", _REL_OVERRIDES_COLUMNS, overrides_rows) # Stage CALLS rows with dedup and callee_declaring_role materialization - seen_calls: set[tuple[str, str, int, int]] = set() + seen_calls: set[tuple[str, str, int, int, int]] = set() calls_rows: list[dict] = [] member_by_id = {m.node_id: m for m in tables.members} for row in tables.calls_rows: if row.src_id not in valid_ids or row.dst_id not in valid_ids: continue - key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line) + # Include call_site_byte so two call sites of the same method on the same + # source line (same arg_count) are kept as distinct edges (issue #359). + key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line, row.call_site_byte) if key in seen_calls: continue seen_calls.add(key) @@ -3606,10 +3614,15 @@ def _write_routes_and_exposes(conn: ladybug.Connection, tables: GraphTables, _fi def _write_meta(conn: ladybug.Connection, tables: GraphTables, source_root: Path) -> None: - seen_calls: set[tuple[str, str, int, int]] = set() + # Dedup key MUST match _write_edges (build_ast_graph.py, _REL_CALLS writer): the + # 5-tuple includes call_site_byte so two call sites of the same method on the + # same source line are counted separately. A previous version used the 4-tuple + # here, which made counts['calls'] (678) diverge from the real CALLS edge count + # (684) that _write_edges actually persisted — describe/stats then undercounted. + seen_calls: set[tuple[str, str, int, int, int]] = set() calls_unique = 0 for row in tables.calls_rows: - key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line) + key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line, row.call_site_byte) if key not in seen_calls: seen_calls.add(key) calls_unique += 1 diff --git a/graph_enrich.py b/graph_enrich.py index 97e54e36..23ec51c7 100644 --- a/graph_enrich.py +++ b/graph_enrich.py @@ -43,6 +43,7 @@ _TYPE_ANN_TO_CAPABILITY, ) from java_ontology import ( + CLIENT_KIND_REST_TEMPLATE, VALID_CAPABILITIES, VALID_CLIENT_KINDS, VALID_PRODUCER_KINDS, @@ -1301,7 +1302,7 @@ def resolve_http_client_for_method( hint = overrides.annotation_to_http_client_hint.get("CodebaseHttpClient") if hint is None: hint = HttpClientHint( - client_kind=anchor.client_kind if anchor else "rest_template", + client_kind=anchor.client_kind if anchor else CLIENT_KIND_REST_TEMPLATE, target_service=anchor.feign_target_name if anchor else "", path=anchor.path_template_call if anchor else "", method=anchor.method_call if anchor else "", diff --git a/java_codebase_rag/cli.py b/java_codebase_rag/cli.py index 47cf3887..4a7c6f6d 100644 --- a/java_codebase_rag/cli.py +++ b/java_codebase_rag/cli.py @@ -612,7 +612,7 @@ def _cmd_erase(args: argparse.Namespace) -> int: import lancedb db = lancedb.connect(str(cfg.index_dir.resolve())) - for name in db.table_names(): + for name in db.list_tables(): to_describe.append(cfg.index_dir / name) except Exception: pass @@ -657,7 +657,7 @@ def work(progress: "PipelineProgress | None") -> int: import lancedb db = lancedb.connect(str(cfg.index_dir.resolve())) - for name in list(db.table_names()): + for name in list(db.list_tables()): try: db.drop_table(name) except Exception as exc: diff --git a/java_codebase_rag/config.py b/java_codebase_rag/config.py index 0b7a2de1..98570110 100644 --- a/java_codebase_rag/config.py +++ b/java_codebase_rag/config.py @@ -237,7 +237,17 @@ def load_yaml_mapping(source_root: Path) -> dict[str, Any]: return {} try: data = yaml.safe_load(path.read_text(encoding="utf-8")) - except Exception: + except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc: + # Best-effort loader: a missing/unreadable/malformed config must NOT abort + # startup — return {} and proceed with defaults. Narrowing this to + # ``yaml.YAMLError`` alone let OSError (chmod 000, stat/read TOCTOU) and + # UnicodeDecodeError (non-UTF-8 config) propagate to the caller; the broader + # tuple restores the graceful-degradation contract while still surfacing the + # problem on stderr. + print( + f"java-codebase-rag: could not load config {path}: {exc}; ignoring config.", + file=sys.stderr, + ) return {} return data if isinstance(data, dict) else {} @@ -476,7 +486,7 @@ def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]: import lancedb db = lancedb.connect(str(index_dir.resolve())) - for name in db.table_names(): + for name in db.list_tables(): paths.append(str((index_dir / name).resolve()) + " (Lance table)") except Exception: pass diff --git a/java_ontology.py b/java_ontology.py index 3df78769..b7e598d4 100644 --- a/java_ontology.py +++ b/java_ontology.py @@ -49,6 +49,11 @@ "rest_template", "web_client", )) +# Named members of VALID_CLIENT_KINDS — reference these at emit/match sites so a +# rename in the set above cannot silently desync callers (issue #359). +CLIENT_KIND_FEIGN_METHOD = "feign_method" +CLIENT_KIND_REST_TEMPLATE = "rest_template" +CLIENT_KIND_WEB_CLIENT = "web_client" VALID_PRODUCER_KINDS: frozenset[str] = frozenset(( "kafka_send", @@ -433,6 +438,9 @@ class EdgeSpec: "VALID_ROUTE_FRAMEWORKS", "VALID_ROUTE_KINDS", "VALID_CLIENT_KINDS", + "CLIENT_KIND_FEIGN_METHOD", + "CLIENT_KIND_REST_TEMPLATE", + "CLIENT_KIND_WEB_CLIENT", "VALID_PRODUCER_KINDS", "VALID_HTTP_CALL_STRATEGIES", "VALID_ASYNC_CALL_STRATEGIES", diff --git a/ladybug_queries.py b/ladybug_queries.py index 66204b07..e3ceea97 100644 --- a/ladybug_queries.py +++ b/ladybug_queries.py @@ -30,21 +30,27 @@ def _parse_ladybug_json(raw: str | None) -> dict[str, Any]: - """Parse JSON from LadybugDB which returns unquoted keys like {key: value}.""" + """Parse JSON from LadybugDB which returns unquoted keys like {key: value}. + + Only quote keys at key positions (after ``{``, ``,`` or ``[``) so values + containing word-colon patterns (e.g. a URL ``https://...`` inside a quoted + string) are not corrupted. The previous ``(\\w+):`` regex matched ``word:`` + anywhere, including inside values (issue #359). + """ if not raw: return {} - # LadybugDB returns JSON without quotes around keys: {packages: 1, files: 2} - # Convert to standard JSON: {"packages": 1, "files": 2} - # This regex matches word characters followed by ':' at the start of a key - quoted = re.sub(r'(\w+):', r'"\1":', raw) + # Quote unquoted keys only where a key is expected: preceded by '{', ',' or + # '[' (with optional whitespace). This leaves word-colon runs inside values + # untouched. + quoted = re.sub(r'([,{\[]\s*)(\w+):', lambda m: f'{m.group(1)}"{m.group(2)}":', raw) try: return json.loads(quoted) except Exception: try: - # Fallback: try parsing as-is (for standard JSON) + # Fallback: try parsing as-is (for standard JSON). return json.loads(raw) except Exception: - log.warning("Failed to parse counts_json: %s", raw[:100]) + log.warning("Failed to parse graph_meta JSON blob: %s", raw[:100]) return {} # Composed describe / neighbors dot-keys (not stored graph edge labels). diff --git a/mcp_v2.py b/mcp_v2.py index 86807ed3..902688d0 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -1504,7 +1504,18 @@ def resolve_v2( assert trimmed is not None if "*" in trimmed or "?" in trimmed: - return _resolve_finalize_success(trimmed, hint_kind, []) + out = ResolveOutput( + success=False, + status="none", + message=( + "Wildcards (* and ?) are not supported in resolve; " + "use search(query=...) for ranked text search." + ), + advisories=[], + resolved_identifier=trimmed, + ) + _resolve_assert_invariants(out) + return out g = graph or LadybugGraph.get() raw: list[tuple[NodeRef, ResolveReason, int]] = [] diff --git a/search_lancedb.py b/search_lancedb.py index d6172681..bc1543eb 100644 --- a/search_lancedb.py +++ b/search_lancedb.py @@ -853,7 +853,7 @@ def run_search( capability=capability, capability_in=capability_in, ) if "java" in table_keys else [] - skip_role_weight = bool(role or role_in) + skip_role_weight = bool(role or role_in or exclude_roles) query_toks = _query_tokens(query) if len(table_keys) == 1: diff --git a/tests/fixtures/graph_baseline_bank_chat.json b/tests/fixtures/graph_baseline_bank_chat.json index a42224af..2002e992 100644 --- a/tests/fixtures/graph_baseline_bank_chat.json +++ b/tests/fixtures/graph_baseline_bank_chat.json @@ -6,14 +6,14 @@ "INJECTS": 94, "DECLARES": 606, "OVERRIDES": 38, - "CALLS": 678, + "CALLS": 684, "UNRESOLVED_AT": 227 }, "graph_meta": { "ontology_version": 17, "built_at": 1782110216, "source_root": "/Users/dmitry/Desktop/CursorProjects/java-enterprise-codebase-rag/tests/bank-chat-system", - "counts_json": "{packages: 29, files: 130, types: 140, members: 606, phantoms: 54, extends: 18, implements: 21, injects: 94, declares: 606, overrides: 38, calls: 678, routes: 29, exposes: 15, clients: 8, declares_client: 8, producers: 9, declares_producer: 9, http_calls: 8, async_calls: 9}" + "counts_json": "{packages: 29, files: 130, types: 140, members: 606, phantoms: 54, extends: 18, implements: 21, injects: 94, declares: 606, overrides: 38, calls: 684, routes: 29, exposes: 15, clients: 8, declares_client: 8, producers: 9, declares_producer: 9, http_calls: 8, async_calls: 9}" }, "sampled_edges": { "EXTENDS": [ diff --git a/tests/test_ast_graph_build.py b/tests/test_ast_graph_build.py index 344d16dd..844ac095 100644 --- a/tests/test_ast_graph_build.py +++ b/tests/test_ast_graph_build.py @@ -646,17 +646,19 @@ def test_bulk_write_is_deterministic_double_build(corpus_root: Path, tmp_path: P def test_bulk_write_preserves_calls_dedup_and_callee_declaring_role(ladybug_db_path: Path) -> None: - """Bulk COPY FROM preserves CALLS dedup by (src, dst, argc, line) and callee_declaring_role. + """Bulk COPY FROM preserves CALLS dedup by (src, dst, argc, line, byte) and callee_declaring_role. Reuses the @Service callee assertion against a bulk build to verify the materialization at staging time produces the same results as the per-row path. """ conn = _connect(ladybug_db_path) - # Verify CALLS dedup: count unique (src_id, dst_id, arg_count, call_site_line) tuples + # Verify CALLS dedup: count unique (src_id, dst_id, arg_count, call_site_line, + # call_site_byte) tuples — byte is included so two call sites of the same + # method on the same source line are kept distinct (issue #359). result = conn.execute( "MATCH (a)-[c:CALLS]->(b) " - "RETURN COUNT(DISTINCT {src: a.id, dst: b.id, argc: c.arg_count, line: c.call_site_line})" + "RETURN COUNT(DISTINCT {src: a.id, dst: b.id, argc: c.arg_count, line: c.call_site_line, byte: c.call_site_byte})" ) unique_call_keys = int(result.get_next()[0]) diff --git a/tests/test_ladybug_queries.py b/tests/test_ladybug_queries.py index 260bcc6f..19d01c41 100644 --- a/tests/test_ladybug_queries.py +++ b/tests/test_ladybug_queries.py @@ -516,3 +516,22 @@ def test_trace_request_flow_inbound_includes_caller_node_id(ladybug_db_path_cros inbound = flow.get("inbound") or [] assert inbound assert any(row.get("caller_node_id") for row in inbound) + + +def test_parse_ladybug_json_handles_colon_in_values() -> None: + """_parse_ladybug_json quotes only key positions, so a value containing a + word-colon run (e.g. a URL) is not corrupted (issue #359). The prior regex + matched ``(\\w+):`` anywhere, which turned {url: "https://x"} into junk and + fell back to {}.""" + from ladybug_queries import _parse_ladybug_json + + # Standard unquoted keys (LadybugDB style). + assert _parse_ladybug_json("{packages: 1, files: 2}") == {"packages": 1, "files": 2} + # A quoted-string value containing https:// must survive intact. + parsed = _parse_ladybug_json('{base_url: "https://example.com", n: 3}') + assert parsed == {"base_url": "https://example.com", "n": 3} + # Nested unquoted keys are quoted at both levels. + assert _parse_ladybug_json("{outer: {inner: 1}}") == {"outer": {"inner": 1}} + # Empty / None are safe. + assert _parse_ladybug_json("") == {} + assert _parse_ladybug_json(None) == {} diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index 4ceb829a..c9fa481f 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -1149,10 +1149,14 @@ def test_resolve_natural_language_sentence_returns_none(ladybug_graph) -> None: assert out.status == "none" -def test_resolve_wildcard_identifier_returns_none(ladybug_graph) -> None: +def test_resolve_wildcard_identifier_rejected(ladybug_graph) -> None: + """resolve rejects wildcards (* and ?) consistently with search/find/neighbors + (issue #359): previously it silently returned status='none', hiding a likely + user mistake. Now it returns success=False with a message pointing to search.""" out = resolve_v2("com.foo.*Service", hint_kind="symbol", graph=ladybug_graph) - assert out.success is True + assert out.success is False assert out.status == "none" + assert out.message and "search" in out.message.lower() def test_resolve_every_reason_in_closed_set_appears() -> None: