From 42c5c024c641074042155d59a04a498216b43157 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Fri, 3 Jul 2026 23:31:17 +0300 Subject: [PATCH 1/3] fix(mcp): search filter pushdown, NodeRef.name, has_more_results, per-type neighbor columns (#353-356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four correctness/contract fixes in the search/find/neighbors family: #353 — search now forwards role/module/microservice/capability/exclude_roles from NodeFilter into run_search so the filter applies BEFORE pagination, not as a post-filter on the already-paginated page (which could empty a filtered page even when many matches existed deeper in the ranking). #354 — NodeRef gains `name`; the unresolved-call-site ref (other=NodeRef(..., name=callee)) no longer silently drops the callee identifier (pydantic extra='ignore' discarded it, leaving fqn='' and no readable callee). #355 — FindOutput and NeighborsOutput gain `has_more_results`. find_v2 computed it for the hint payload but never put it on the model; clients had to probe to tell if a next page existed. neighbors_v2 computes it where it paginates in Python (None on the single-origin CALLS SQL-paginated path, where calls_row_count is the signal). #356 — the generic flat-label neighbors query no longer RETURNs a fixed column superset regardless of matched edge type (typed-union RETURN anti-pattern that errors on stricter binders). It now issues one single-label query per edge type, RETURNing only that type's columns (map aligned with build_ast_graph REL schemas). Co-Authored-By: Claude --- mcp_v2.py | 102 +++++++++++++++++++++++++++++----------- tests/test_mcp_v2.py | 108 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 28 deletions(-) diff --git a/mcp_v2.py b/mcp_v2.py index 86807ed3..90058ddb 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -450,6 +450,7 @@ class NodeRef(BaseModel): id: str kind: Literal["symbol", "route", "client", "producer", "unresolved_call_site"] fqn: str + name: str | None = None symbol_kind: str | None = None microservice: str | None = None module: str | None = None @@ -517,6 +518,11 @@ class FindOutput(BaseModel): default=None, description="Echoed from the request — the page offset the server applied. None on success=False.", ) + has_more_results: bool | None = Field( + default=None, + description="True when additional pages remain beyond offset+limit (more matches exist). " + "None when unset (e.g. success=False).", + ) advisories: list[str] = Field(default_factory=list, description="Pure informational text with no tool call suggestion") hints_structured: list[StructuredHint] = Field(default_factory=list, description=MCP_HINTS_STRUCTURED_FIELD_DESCRIPTION) @@ -537,6 +543,12 @@ class NeighborsOutput(BaseModel): default_factory=list, description="Echo of neighbors(edge_types=...) from the request; empty when success=False.", ) + has_more_results: bool | None = Field( + default=None, + description="True when additional pages remain beyond offset+limit. None when unset or " + "when the single-origin CALLS path paginated in SQL (use unfiltered_calls_count / " + "calls_row_count there).", + ) advisories: list[str] = Field(default_factory=list, description="Pure informational text with no tool call suggestion") hints_structured: list[StructuredHint] = Field(default_factory=list, description=MCP_HINTS_STRUCTURED_FIELD_DESCRIPTION) @@ -958,6 +970,17 @@ def search_v2( model_name=model_name, device=device, model=model, + # Push the NodeFilter structural predicates into the LanceDB query so + # they apply BEFORE pagination (issue #353) — previously they were only + # a post-filter on the already-paginated page, which could shrink or + # empty filtered pages even when many matches existed deeper in the + # ranking. _node_matches_filter below still re-checks every row (it + # covers the non-pushdownable fields and is the contract guarantee). + role=nf.role if nf else None, + module=nf.module if nf else None, + microservice=nf.microservice if nf else None, + capability=nf.capability if nf else None, + exclude_roles=nf.exclude_roles if nf else None, ) hits: list[SearchHit] = [] for row in rows: @@ -1071,6 +1094,7 @@ def find_v2( results=refs, limit=limit, offset=offset, + has_more_results=has_more_results, advisories=raw_advisories, hints_structured=_to_structured_hints(raw_struct), ) @@ -1534,6 +1558,28 @@ def resolve_v2( return out +# Per-edge-type attribute columns selected by the generic (flat-label) neighbors +# query (issue #356). RETURNing a fixed superset of columns regardless of which +# edge type matched is the typed-union RETURN anti-pattern: a stricter binder +# (e.g. Kùzu) errors when a RETURNed column does not exist on the matched type. +# Selecting columns per edge type keeps the query portable; _neighbor_edge_attrs +# still drops None/"" so each edge exposes only the attrs that exist for its type. +# Aligned with the REL TABLE schemas in build_ast_graph.py. +_FLAT_EDGE_ATTR_COLUMNS: dict[str, tuple[str, ...]] = { + "CALLS": ("confidence", "strategy", "source", "call_site_line", "call_site_byte", "arg_count", "resolved"), + "HTTP_CALLS": ("confidence", "strategy", "match"), + "ASYNC_CALLS": ("confidence", "strategy", "match"), + "EXPOSES": ("confidence", "strategy"), + "DECLARES_CLIENT": ("confidence", "strategy"), + "DECLARES_PRODUCER": ("confidence", "strategy"), + "INJECTS": ("mechanism", "annotation", "field_or_param", "resolved"), + "EXTENDS": ("resolved",), + "IMPLEMENTS": ("resolved",), + "DECLARES": (), + "OVERRIDES": (), +} + + def _neighbor_edge_attrs(row: dict[str, Any]) -> dict[str, Any]: attrs = { k: v @@ -1896,33 +1942,25 @@ def neighbors_v2( results.extend(origin_edges) continue if flat_labels: - # Some Cypher binders can drop `label(e) IN $list` in WHERE; use OR of scalar equalities. - label_params = [f"l{i}" for i in range(len(flat_labels))] - label_predicate = "(" + " OR ".join(f"label(e) = ${name}" for name in label_params) + ")" - q_params = {"id": origin_id, **dict(zip(label_params, flat_labels, strict=True))} - if direction == "out": - rows = g._rows( # noqa: SLF001 - "MATCH (a)-[e]->(b) WHERE a.id = $id AND " - f"{label_predicate} " - "RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, " - "e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, " - "e.annotation AS annotation, e.field_or_param AS field_or_param, " - "e.source AS source, e.call_site_line AS call_site_line, " - "e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, " - "e.resolved AS resolved", - q_params, - ) - else: - rows = g._rows( # noqa: SLF001 - "MATCH (a)<-[e]-(b) WHERE a.id = $id AND " - f"{label_predicate} " - "RETURN b.id AS other_id, label(e) AS edge_type, e.confidence AS confidence, " - "e.strategy AS strategy, e.match AS match, e.mechanism AS mechanism, " - "e.annotation AS annotation, e.field_or_param AS field_or_param, " - "e.source AS source, e.call_site_line AS call_site_line, " - "e.call_site_byte AS call_site_byte, e.arg_count AS arg_count, " - "e.resolved AS resolved", - q_params, + # Select attribute columns per edge type (issue #356). A single + # multi-label query RETURNing a fixed column superset references + # columns that don't exist on every matched type — the typed-union + # RETURN anti-pattern, which errors on stricter binders (e.g. Kùzu). + # Run one single-label query per type, RETURNing only that type's + # columns, and merge the rows. `label(e) = $label` scalar equality + # (not `label(e) IN [...]`) per the AGENTS.md Cypher note. + rows: list[dict[str, Any]] = [] + match_clause = "MATCH (a)-[e]->(b)" if direction == "out" else "MATCH (a)<-[e]-(b)" + for label in flat_labels: + cols = _FLAT_EDGE_ATTR_COLUMNS.get(label, ()) + select = "b.id AS other_id, label(e) AS edge_type" + if cols: + select += ", " + ", ".join(f"e.{c} AS {c}" for c in cols) + rows.extend( + g._rows( # noqa: SLF001 + f"{match_clause} WHERE a.id = $id AND label(e) = $label RETURN {select}", + {"id": origin_id, "label": label}, + ) ) for row in rows: other_id = str(row.get("other_id") or "") @@ -1979,8 +2017,15 @@ def neighbors_v2( ) if use_calls_path and len(origins) > 1: sliced = results[offset : offset + limit] + neighbors_has_more = len(results) > offset + limit + elif use_calls_path: + # Single-origin CALLS path already paginated in SQL; the row/unfiltered + # counts are the pagination signal there, so has_more stays None. + sliced = results + neighbors_has_more = None else: - sliced = results if use_calls_path else results[offset : offset + limit] + sliced = results[offset : offset + limit] + neighbors_has_more = len(results) > offset + limit first_origin = origins[0] origin_kind = _resolve_node_kind(g, first_origin) subject_record = _load_node_record(g, first_origin, origin_kind) @@ -2006,6 +2051,7 @@ def neighbors_v2( success=True, results=sliced, requested_edge_types=requested_edge_types, + has_more_results=neighbors_has_more, advisories=raw_advisories, hints_structured=_to_structured_hints(raw_struct), ) diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index 4ceb829a..7a729710 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -451,10 +451,14 @@ def test_neighbors_route_in_exposes_returns_handler(ladybug_graph) -> None: def test_neighbors_route_in_http_calls_returns_callers(ladybug_graph) -> None: class FakeGraph: def _rows(self, query: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]: + # The generic flat-label path now queries one edge type at a time and + # passes label(e) = $label (issue #356); model that only HTTP_CALLS + # callers exist in this fixture so the ASYNC_CALLS label returns none. if ( "MATCH (a)<-[e]-(b)" in query and "WHERE a.id" in query and "RETURN b.id AS other_id" in query + and (params or {}).get("label") == "HTTP_CALLS" ): return [{"other_id": "client:caller", "edge_type": "HTTP_CALLS", "confidence": 0.8, "match": "cross_service"}] if "MATCH (n:Client)" in query: @@ -584,6 +588,110 @@ def test_search_unknown_filter_key_returns_failure(monkeypatch, ladybug_graph) - assert "typo_key" in out.message +def test_search_pushes_nodefilter_into_run_search(monkeypatch, ladybug_graph) -> None: + """search forwards NodeFilter structural fields into run_search so the filter + applies BEFORE pagination, not as a post-filter on the already-paginated page + (issue #353) — previously a filtered page could shrink to 0-2 results even + when many matches existed deeper in the ranking.""" + captured: dict[str, Any] = {} + + def fake_run_search(query, **kwargs): + captured.update(kwargs) + return _fake_search_rows() + + monkeypatch.setattr("mcp_v2.run_search", fake_run_search) + out = search_v2( + "ChatService", + filter={ + "role": "SERVICE", + "module": "chat-assign", + "microservice": "chat-assign", + "capability": "c", + "exclude_roles": ["CONTROLLER"], + }, + graph=ladybug_graph, + ) + assert out.success is True + assert captured.get("role") == "SERVICE" + assert captured.get("module") == "chat-assign" + assert captured.get("microservice") == "chat-assign" + assert captured.get("capability") == "c" + assert captured.get("exclude_roles") == ["CONTROLLER"] + + +def test_unresolved_call_site_noderef_carries_callee_name() -> None: + """The unresolved-call-site NodeRef must carry the callee identifier in `name` + (issue #354) — NodeRef previously had no `name` field, so pydantic's default + extra='ignore' silently dropped `name=callee`, leaving the structured ref with + fqn='' and no human-readable callee (clients had to dig into attrs).""" + from mcp_v2 import _unresolved_site_to_edge + + edge = _unresolved_site_to_edge( + "origin:1", + { + "id": "ucs:1", + "callee_simple": "Foo.bar", + "call_site_line": 42, + "call_site_byte": 7, + "arg_count": 2, + "reason": "phantom", + "receiver_expr": "x", + }, + ) + assert edge.other.kind == "unresolved_call_site" + assert edge.other.name == "Foo.bar" + # callee is also still carried in attrs for clients that read attrs + assert edge.attrs["callee_simple"] == "Foo.bar" + + +def test_find_exposes_has_more_results(ladybug_graph) -> None: + """find surfaces has_more_results on FindOutput so a paging client can tell + whether another page exists without a probe call (issue #355). The value was + computed and placed in the hint payload but absent from the output model.""" + out = find_v2("symbol", {"symbol_kind": "method"}, limit=1, offset=0, graph=ladybug_graph) + assert out.success is True + assert out.has_more_results is True # bank-chat has more than one method + + # Past the end: no rows remain, so has_more is False. + out_last = find_v2("symbol", {"symbol_kind": "method"}, limit=1, offset=1_000_000, graph=ladybug_graph) + assert out_last.success is True + assert out_last.has_more_results is False + + +def test_neighbors_flat_labels_select_columns_per_edge_type() -> None: + """The generic flat-label neighbors query issues one Cypher per edge type and + RETURNs only that type's columns (issue #356) — never a fixed superset that + references columns absent on some matched type (the typed-union RETURN + anti-pattern that errors on stricter binders like Kuzu).""" + from mcp_v2 import _FLAT_EDGE_ATTR_COLUMNS + + issued: list[tuple[str, dict[str, Any]]] = [] + + class FakeGraph: + def _rows(self, query: str, params: dict[str, Any] | None = None) -> list[dict[str, Any]]: + if "RETURN b.id AS other_id" in query: + issued.append((query, params or {})) + return [] + + out = neighbors_v2( + "sym:origin", + direction="out", + edge_types=["CALLS", "DECLARES", "INJECTS", "EXPOSES"], + graph=FakeGraph(), # type: ignore[arg-type] + ) + assert out.success is True + # One flat-label query per edge type, each tagged with its label param. + labels = [p.get("label") for _, p in issued] + assert set(labels) == {"CALLS", "DECLARES", "INJECTS", "EXPOSES"} + for query, params in issued: + label = params["label"] + allowed = set(_FLAT_EDGE_ATTR_COLUMNS.get(label, ())) + referenced = set(re.findall(r"e\.(\w+) AS ", query)) + assert referenced <= allowed, ( + f"label {label}: RETURN references {referenced}, only {allowed} are valid" + ) + + def test_search_cross_kind_filter_returns_failure(monkeypatch, ladybug_graph) -> None: monkeypatch.setattr("mcp_v2.run_search", lambda *args, **kwargs: _fake_search_rows()) out = search_v2("ChatService", filter={"path_prefix": "/api"}, graph=ladybug_graph) From b9818ba6051675510ab2012f81483c312686bee0 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sat, 4 Jul 2026 08:21:45 +0300 Subject: [PATCH 2/3] fix(mcp): set has_more_results=False when the CALLS path returns the full set On the single-origin CALLS neighbors path, has_more_results was None unconditionally. When paginate_in_sql is False (a node_filter is set, include_unresolved, or dedup_calls), _neighbors_calls_for_origin loads the FULL matching set (offset=0, limit=None), so the client already has every edge -- has_more must be False, not None ("unknown"), so a paging client need not issue a redundant probe (#355). Keep None only for the SQL-paginated case where the row/unfiltered counts carry the signal. Adds a regression test covering both modes (filtered -> False, unfiltered single-origin -> None). Verified red->green. Co-Authored-By: Claude --- mcp_v2.py | 10 +++++++--- tests/test_mcp_v2.py | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/mcp_v2.py b/mcp_v2.py index 90058ddb..400d914c 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -2019,10 +2019,14 @@ def neighbors_v2( sliced = results[offset : offset + limit] neighbors_has_more = len(results) > offset + limit elif use_calls_path: - # Single-origin CALLS path already paginated in SQL; the row/unfiltered - # counts are the pagination signal there, so has_more stays None. + # Single-origin CALLS path. When paginate_in_sql is True the SQL did + # the OFFSET/LIMIT and the row/unfiltered counts carry the has-more + # signal, so this field stays None (unknown). When paginate_in_sql is + # False (a node_filter is set, include_unresolved, or dedup_calls) we + # loaded the FULL matching set with no pushdown, so the client already + # has every edge -> False (not None), so a paging client need not probe. sliced = results - neighbors_has_more = None + neighbors_has_more = None if paginate_in_sql else False else: sliced = results[offset : offset + limit] neighbors_has_more = len(results) > offset + limit diff --git a/tests/test_mcp_v2.py b/tests/test_mcp_v2.py index 7a729710..c9eb7187 100644 --- a/tests/test_mcp_v2.py +++ b/tests/test_mcp_v2.py @@ -755,6 +755,32 @@ def test_neighbors_filter_accepts_json_string(ladybug_graph) -> None: assert out_dict.results == out_str.results +def test_neighbors_calls_has_more_results_reflects_pagination_mode(ladybug_graph) -> None: + """Single-origin CALLS has_more_results depends on whether SQL paginated. + + Regression for the #355 has_more_results field on NeighborsOutput. When a + node_filter forces the in-memory (non-SQL-paginated) path, the full filtered + CALLS set is returned, so has_more_results must be False (the client has + everything and need not probe) -- not None ("unknown"). With no filter the + single-origin SQL path paginates and the row/unfiltered counts carry the + signal, so the field stays None. + """ + mid = _method_id_with_calls(ladybug_graph, "out") + # node_filter set -> paginate_in_sql False -> full set returned -> has_more False + filtered = neighbors_v2( + mid, direction="out", edge_types=["CALLS"], filter={"role": "SERVICE"}, graph=ladybug_graph + ) + assert filtered.success is True + assert filtered.has_more_results is False, ( + "non-SQL-paginated CALLS returned the full set; has_more_results must be " + "False so a paging client does not issue a redundant probe (#355)" + ) + # No filter, single origin -> SQL-paginated -> has-more signal is the row count. + paginated = neighbors_v2(mid, direction="out", edge_types=["CALLS"], graph=ladybug_graph) + assert paginated.success is True + assert paginated.has_more_results is None + + def test_neighbors_filter_unknown_key_returns_failure(ladybug_graph) -> None: mid = _method_id_with_calls(ladybug_graph, "out") out = neighbors_v2(mid, direction="out", edge_types=["CALLS"], filter={"typo_key": "x"}, graph=ladybug_graph) From 5ec979a8bcc1b1e0d97a642085261bbfee3f1d40 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sat, 4 Jul 2026 08:55:45 +0300 Subject: [PATCH 3/3] fix(mcp): fail-loud on missing edge-type attrs; drop null fields from hint input - _FLAT_EDGE_ATTR_COLUMNS now asserts at import that it covers every EdgeType exactly. A future edge type added to EdgeType + EDGE_SCHEMA + a REL TABLE but missing here would otherwise silently SELECT only (id, label) and yield edges with empty attrs via the .get(label, ()) fallback; the assert trips on both a missing and a stale entry. - The find/neighbors hint payloads used bare model_dump(), so every structured NodeRef carried name: null (and other null fields) into generate_hints. Switch to exclude_none=True -- these dicts feed generate_hints (which reads fields defensively via .get), not the tool result, and it matches the adjacent filter dumps. Verified against test_mcp_v2 / test_mcp_hints / test_client_hint_recovery. Co-Authored-By: Claude --- mcp_v2.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mcp_v2.py b/mcp_v2.py index 400d914c..60761da9 100644 --- a/mcp_v2.py +++ b/mcp_v2.py @@ -1082,7 +1082,12 @@ def find_v2( hint_payload: dict[str, Any] = { "success": True, "kind": kind, - "results": [r.model_dump() for r in refs], + # exclude_none: this dict feeds generate_hints (which reads fields + # defensively via .get), not the tool result (FindOutput below holds the + # pydantic objects). Drop null fields -- including the NodeRef.name field + # that is None for every structured ref -- to match filter_dump above and + # avoid spurious "name": null noise in the hint input. + "results": [r.model_dump(exclude_none=True) for r in refs], "limit": limit, "offset": offset, "filter": filter_dump, @@ -2035,7 +2040,7 @@ def neighbors_v2( subject_record = _load_node_record(g, first_origin, origin_kind) neigh_payload = { "success": True, - "results": [e.model_dump() for e in sliced], + "results": [e.model_dump(exclude_none=True) for e in sliced], "requested_edge_types": requested_edge_types, "requested_direction": direction, "offset": offset,