Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions ast_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -1868,6 +1868,11 @@ def _collect_outgoing_calls(
file_rel: str,
) -> list[OutgoingCallDecl]:
del project_root
from java_ontology import ( # deferred: java_ontology imports ast_java
CLIENT_KIND_FEIGN_METHOD,
CLIENT_KIND_REST_TEMPLATE,
CLIENT_KIND_WEB_CLIENT,
)
out: list[OutgoingCallDecl] = []
method_fqn = f"{type_fqn}#{method_decl.signature}"
type_mods = _find_modifiers_child(type_node) if type_node is not None else None
Expand Down Expand Up @@ -1899,7 +1904,7 @@ def _collect_outgoing_calls(
OutgoingCallDecl(
method_fqn=method_fqn,
method_sig=method_decl.signature,
client_kind="feign_method",
client_kind=CLIENT_KIND_FEIGN_METHOD,
channel="http",
feign_target_name=feign_target_name,
feign_target_url=feign_target_url,
Expand Down Expand Up @@ -1998,7 +2003,7 @@ def visit(n: Node) -> None:
OutgoingCallDecl(
method_fqn=method_fqn,
method_sig=method_decl.signature,
client_kind="rest_template",
client_kind=CLIENT_KIND_REST_TEMPLATE,
channel="http",
feign_target_name="",
feign_target_url="",
Expand Down Expand Up @@ -2051,7 +2056,7 @@ def visit(n: Node) -> None:
OutgoingCallDecl(
method_fqn=method_fqn,
method_sig=method_decl.signature,
client_kind="web_client",
client_kind=CLIENT_KIND_WEB_CLIENT,
channel="http",
feign_target_name="",
feign_target_url="",
Expand Down
31 changes: 22 additions & 9 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,13 @@
symbol_id,
)
from path_filtering import LayeredIgnore, iter_java_source_files
from java_ontology import VALID_CLIENT_KINDS, VALID_HTTP_CALL_MATCHES, VALID_PRODUCER_KINDS
from java_ontology import (
CLIENT_KIND_FEIGN_METHOD,
CLIENT_KIND_REST_TEMPLATE,
VALID_CLIENT_KINDS,
VALID_HTTP_CALL_MATCHES,
VALID_PRODUCER_KINDS,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -2382,7 +2388,7 @@ def _phantom_async_route_id(call: OutgoingCallDecl) -> str:
)
rid = ""
strategy = call.resolution_strategy
if call.client_kind == "feign_method":
if call.client_kind == CLIENT_KIND_FEIGN_METHOD:
exposing = next((e for e in tables.exposes_rows if e.symbol_id == member.node_id), None)
if exposing is not None:
rid = exposing.route_id
Expand Down Expand Up @@ -2585,7 +2591,7 @@ def _match_call_edge(
return "unresolved", []

candidates: list[RouteRow] = []
if call.client_kind == "feign_method":
if call.client_kind == CLIENT_KIND_FEIGN_METHOD:
# Prefer endpoint matching by target service + path/method for Feign declarations.
path_value = call.path_template_call
method_value = call.method_call
Expand Down Expand Up @@ -2714,7 +2720,7 @@ def _micro_factor(member: MemberEntry | None) -> float:
if src_route is None and member is not None:
# Recover feign caller hints from persisted caller-side Client declarations.
for client in client_hints_by_member.get(member.node_id, ()):
if client.client_kind != "feign_method":
if client.client_kind != CLIENT_KIND_FEIGN_METHOD:
continue
path_template, path_regex = _normalize_path(client.path)
src_route = RouteRow(
Expand Down Expand Up @@ -2750,7 +2756,7 @@ def _micro_factor(member: MemberEntry | None) -> float:
call = OutgoingCallDecl(
method_fqn=f"{member.parent_fqn}#{member.decl.signature}" if member else "",
method_sig=member.decl.signature if member else "",
client_kind="feign_method" if _feign_like else "rest_template",
client_kind=CLIENT_KIND_FEIGN_METHOD if _feign_like else CLIENT_KIND_REST_TEMPLATE,
channel="http",
feign_target_name=src_route.feign_name if src_route else "",
feign_target_url=src_route.feign_url if src_route else "",
Expand Down Expand Up @@ -3424,13 +3430,15 @@ def _write_edges(conn: ladybug.Connection, tables: GraphTables, _file_by_node_id
_bulk_copy(conn, "OVERRIDES", _REL_OVERRIDES_COLUMNS, overrides_rows)

# Stage CALLS rows with dedup and callee_declaring_role materialization
seen_calls: set[tuple[str, str, int, int]] = set()
seen_calls: set[tuple[str, str, int, int, int]] = set()
calls_rows: list[dict] = []
member_by_id = {m.node_id: m for m in tables.members}
for row in tables.calls_rows:
if row.src_id not in valid_ids or row.dst_id not in valid_ids:
continue
key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line)
# Include call_site_byte so two call sites of the same method on the same
# source line (same arg_count) are kept as distinct edges (issue #359).
key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line, row.call_site_byte)
if key in seen_calls:
continue
seen_calls.add(key)
Expand Down Expand Up @@ -3606,10 +3614,15 @@ def _write_routes_and_exposes(conn: ladybug.Connection, tables: GraphTables, _fi


def _write_meta(conn: ladybug.Connection, tables: GraphTables, source_root: Path) -> None:
seen_calls: set[tuple[str, str, int, int]] = set()
# Dedup key MUST match _write_edges (build_ast_graph.py, _REL_CALLS writer): the
# 5-tuple includes call_site_byte so two call sites of the same method on the
# same source line are counted separately. A previous version used the 4-tuple
# here, which made counts['calls'] (678) diverge from the real CALLS edge count
# (684) that _write_edges actually persisted — describe/stats then undercounted.
seen_calls: set[tuple[str, str, int, int, int]] = set()
calls_unique = 0
for row in tables.calls_rows:
key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line)
key = (row.src_id, row.dst_id, row.arg_count, row.call_site_line, row.call_site_byte)
if key not in seen_calls:
seen_calls.add(key)
calls_unique += 1
Expand Down
3 changes: 2 additions & 1 deletion graph_enrich.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
_TYPE_ANN_TO_CAPABILITY,
)
from java_ontology import (
CLIENT_KIND_REST_TEMPLATE,
VALID_CAPABILITIES,
VALID_CLIENT_KINDS,
VALID_PRODUCER_KINDS,
Expand Down Expand Up @@ -1301,7 +1302,7 @@ def resolve_http_client_for_method(
hint = overrides.annotation_to_http_client_hint.get("CodebaseHttpClient")
if hint is None:
hint = HttpClientHint(
client_kind=anchor.client_kind if anchor else "rest_template",
client_kind=anchor.client_kind if anchor else CLIENT_KIND_REST_TEMPLATE,
target_service=anchor.feign_target_name if anchor else "",
path=anchor.path_template_call if anchor else "",
method=anchor.method_call if anchor else "",
Expand Down
4 changes: 2 additions & 2 deletions java_codebase_rag/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,7 @@ def _cmd_erase(args: argparse.Namespace) -> int:
import lancedb

db = lancedb.connect(str(cfg.index_dir.resolve()))
for name in db.table_names():
for name in db.list_tables():
to_describe.append(cfg.index_dir / name)
except Exception:
pass
Expand Down Expand Up @@ -657,7 +657,7 @@ def work(progress: "PipelineProgress | None") -> int:
import lancedb

db = lancedb.connect(str(cfg.index_dir.resolve()))
for name in list(db.table_names()):
for name in list(db.list_tables()):
try:
db.drop_table(name)
except Exception as exc:
Expand Down
14 changes: 12 additions & 2 deletions java_codebase_rag/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,17 @@ def load_yaml_mapping(source_root: Path) -> dict[str, Any]:
return {}
try:
data = yaml.safe_load(path.read_text(encoding="utf-8"))
except Exception:
except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
# Best-effort loader: a missing/unreadable/malformed config must NOT abort
# startup — return {} and proceed with defaults. Narrowing this to
# ``yaml.YAMLError`` alone let OSError (chmod 000, stat/read TOCTOU) and
# UnicodeDecodeError (non-UTF-8 config) propagate to the caller; the broader
# tuple restores the graceful-degradation contract while still surfacing the
# problem on stderr.
print(
f"java-codebase-rag: could not load config {path}: {exc}; ignoring config.",
file=sys.stderr,
)
return {}
return data if isinstance(data, dict) else {}

Expand Down Expand Up @@ -476,7 +486,7 @@ def index_dir_has_existing_artifacts(index_dir: Path) -> tuple[bool, list[str]]:
import lancedb

db = lancedb.connect(str(index_dir.resolve()))
for name in db.table_names():
for name in db.list_tables():
paths.append(str((index_dir / name).resolve()) + " (Lance table)")
except Exception:
pass
Expand Down
8 changes: 8 additions & 0 deletions java_ontology.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@
"rest_template",
"web_client",
))
# Named members of VALID_CLIENT_KINDS — reference these at emit/match sites so a
# rename in the set above cannot silently desync callers (issue #359).
CLIENT_KIND_FEIGN_METHOD = "feign_method"
CLIENT_KIND_REST_TEMPLATE = "rest_template"
CLIENT_KIND_WEB_CLIENT = "web_client"

VALID_PRODUCER_KINDS: frozenset[str] = frozenset((
"kafka_send",
Expand Down Expand Up @@ -433,6 +438,9 @@ class EdgeSpec:
"VALID_ROUTE_FRAMEWORKS",
"VALID_ROUTE_KINDS",
"VALID_CLIENT_KINDS",
"CLIENT_KIND_FEIGN_METHOD",
"CLIENT_KIND_REST_TEMPLATE",
"CLIENT_KIND_WEB_CLIENT",
"VALID_PRODUCER_KINDS",
"VALID_HTTP_CALL_STRATEGIES",
"VALID_ASYNC_CALL_STRATEGIES",
Expand Down
20 changes: 13 additions & 7 deletions ladybug_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,27 @@


def _parse_ladybug_json(raw: str | None) -> dict[str, Any]:
"""Parse JSON from LadybugDB which returns unquoted keys like {key: value}."""
"""Parse JSON from LadybugDB which returns unquoted keys like {key: value}.

Only quote keys at key positions (after ``{``, ``,`` or ``[``) so values
containing word-colon patterns (e.g. a URL ``https://...`` inside a quoted
string) are not corrupted. The previous ``(\\w+):`` regex matched ``word:``
anywhere, including inside values (issue #359).
"""
if not raw:
return {}
# LadybugDB returns JSON without quotes around keys: {packages: 1, files: 2}
# Convert to standard JSON: {"packages": 1, "files": 2}
# This regex matches word characters followed by ':' at the start of a key
quoted = re.sub(r'(\w+):', r'"\1":', raw)
# Quote unquoted keys only where a key is expected: preceded by '{', ',' or
# '[' (with optional whitespace). This leaves word-colon runs inside values
# untouched.
quoted = re.sub(r'([,{\[]\s*)(\w+):', lambda m: f'{m.group(1)}"{m.group(2)}":', raw)
try:
return json.loads(quoted)
except Exception:
try:
# Fallback: try parsing as-is (for standard JSON)
# Fallback: try parsing as-is (for standard JSON).
return json.loads(raw)
except Exception:
log.warning("Failed to parse counts_json: %s", raw[:100])
log.warning("Failed to parse graph_meta JSON blob: %s", raw[:100])
return {}

# Composed describe / neighbors dot-keys (not stored graph edge labels).
Expand Down
13 changes: 12 additions & 1 deletion mcp_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1504,7 +1504,18 @@ def resolve_v2(

assert trimmed is not None
if "*" in trimmed or "?" in trimmed:
return _resolve_finalize_success(trimmed, hint_kind, [])
out = ResolveOutput(
success=False,
status="none",
message=(
"Wildcards (* and ?) are not supported in resolve; "
"use search(query=...) for ranked text search."
),
advisories=[],
resolved_identifier=trimmed,
)
_resolve_assert_invariants(out)
return out

g = graph or LadybugGraph.get()
raw: list[tuple[NodeRef, ResolveReason, int]] = []
Expand Down
2 changes: 1 addition & 1 deletion search_lancedb.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,7 +853,7 @@ def run_search(
capability=capability, capability_in=capability_in,
) if "java" in table_keys else []

skip_role_weight = bool(role or role_in)
skip_role_weight = bool(role or role_in or exclude_roles)
query_toks = _query_tokens(query)

if len(table_keys) == 1:
Expand Down
4 changes: 2 additions & 2 deletions tests/fixtures/graph_baseline_bank_chat.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,14 @@
"INJECTS": 94,
"DECLARES": 606,
"OVERRIDES": 38,
"CALLS": 678,
"CALLS": 684,
"UNRESOLVED_AT": 227
},
"graph_meta": {
"ontology_version": 17,
"built_at": 1782110216,
"source_root": "/Users/dmitry/Desktop/CursorProjects/java-enterprise-codebase-rag/tests/bank-chat-system",
"counts_json": "{packages: 29, files: 130, types: 140, members: 606, phantoms: 54, extends: 18, implements: 21, injects: 94, declares: 606, overrides: 38, calls: 678, routes: 29, exposes: 15, clients: 8, declares_client: 8, producers: 9, declares_producer: 9, http_calls: 8, async_calls: 9}"
"counts_json": "{packages: 29, files: 130, types: 140, members: 606, phantoms: 54, extends: 18, implements: 21, injects: 94, declares: 606, overrides: 38, calls: 684, routes: 29, exposes: 15, clients: 8, declares_client: 8, producers: 9, declares_producer: 9, http_calls: 8, async_calls: 9}"
},
"sampled_edges": {
"EXTENDS": [
Expand Down
8 changes: 5 additions & 3 deletions tests/test_ast_graph_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,17 +646,19 @@ def test_bulk_write_is_deterministic_double_build(corpus_root: Path, tmp_path: P


def test_bulk_write_preserves_calls_dedup_and_callee_declaring_role(ladybug_db_path: Path) -> None:
"""Bulk COPY FROM preserves CALLS dedup by (src, dst, argc, line) and callee_declaring_role.
"""Bulk COPY FROM preserves CALLS dedup by (src, dst, argc, line, byte) and callee_declaring_role.

Reuses the @Service callee assertion against a bulk build to verify the materialization
at staging time produces the same results as the per-row path.
"""
conn = _connect(ladybug_db_path)

# Verify CALLS dedup: count unique (src_id, dst_id, arg_count, call_site_line) tuples
# Verify CALLS dedup: count unique (src_id, dst_id, arg_count, call_site_line,
# call_site_byte) tuples — byte is included so two call sites of the same
# method on the same source line are kept distinct (issue #359).
result = conn.execute(
"MATCH (a)-[c:CALLS]->(b) "
"RETURN COUNT(DISTINCT {src: a.id, dst: b.id, argc: c.arg_count, line: c.call_site_line})"
"RETURN COUNT(DISTINCT {src: a.id, dst: b.id, argc: c.arg_count, line: c.call_site_line, byte: c.call_site_byte})"
)
unique_call_keys = int(result.get_next()[0])

Expand Down
19 changes: 19 additions & 0 deletions tests/test_ladybug_queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,3 +516,22 @@ def test_trace_request_flow_inbound_includes_caller_node_id(ladybug_db_path_cros
inbound = flow.get("inbound") or []
assert inbound
assert any(row.get("caller_node_id") for row in inbound)


def test_parse_ladybug_json_handles_colon_in_values() -> None:
    """Check that only key positions are quoted by ``_parse_ladybug_json``.

    A word-colon run inside a value (for example the ``https:`` of a URL in a
    quoted string) must come back intact instead of being rewritten as a key
    (issue #359). Empty and ``None`` input must yield an empty dict.
    """
    from ladybug_queries import _parse_ladybug_json

    # (raw input, expected mapping) pairs, checked in order.
    expectations = (
        # Standard unquoted keys (LadybugDB style).
        ("{packages: 1, files: 2}", {"packages": 1, "files": 2}),
        # A quoted-string value containing https:// must survive intact.
        (
            '{base_url: "https://example.com", n: 3}',
            {"base_url": "https://example.com", "n": 3},
        ),
        # Nested unquoted keys are quoted at both levels.
        ("{outer: {inner: 1}}", {"outer": {"inner": 1}}),
        # Empty / None are safe.
        ("", {}),
        (None, {}),
    )
    for raw_blob, expected in expectations:
        assert _parse_ladybug_json(raw_blob) == expected
8 changes: 6 additions & 2 deletions tests/test_mcp_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1149,10 +1149,14 @@ def test_resolve_natural_language_sentence_returns_none(ladybug_graph) -> None:
assert out.status == "none"


def test_resolve_wildcard_identifier_rejected(ladybug_graph) -> None:
    """resolve rejects identifiers containing wildcards (``*`` or ``?``).

    A wildcard identifier is reported as a failure (``success=False`` with
    ``status='none'``) and the message points the caller to ``search`` rather
    than silently returning an empty success (issue #359).
    """
    out = resolve_v2("com.foo.*Service", hint_kind="symbol", graph=ladybug_graph)
    assert out.success is False
    assert out.status == "none"
    # The rejection message must steer the user towards search.
    assert out.message and "search" in out.message.lower()


def test_resolve_every_reason_in_closed_set_appears() -> None:
Expand Down
Loading