From 87a2ccfb8c9c9db5f04e9632f7ba8980057c4a00 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Mon, 22 Jun 2026 22:22:45 +0300 Subject: [PATCH] fix(graph): incremental correctness + perf follow-ups (PR-P5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three follow-ups to the init/increment-perf plan (PR-P1..P4): PR-P5b (correctness): pull annotation-definition users into incremental scope. Annotation usage is a node property (annotations STRING[]), not a Symbol->Symbol edge, so _find_dependents never pulled annotation users in — when an @interface definition changed (e.g. gained a meta-annotation shifting the Layer-A chain in resolve_role_and_capabilities), types carrying it went stale until the next full rebuild. Add _find_annotation_dependents; merge its results into dependent_files so users are re-parsed and role-re-SET (the expansion cap bounds scope). Direct usage only; transitive composition documented as a known limitation. PR-P5c (perf): convert the last per-row graph write — the global pass 5-6 Route MERGE upsert — to bulk, mirroring _write_nodes_impl's Symbol pattern (_SET_ROUTE_BY_ID + COPY-new/SET-existing). Behavior-preserving; routes_rows already carries unique ids. PR-P5a (investigation, no code): confirmed OVERRIDES does NOT duplicate on increment — loaded stub types carry no supertype edges, so _populate_overrides_rows can't re-derive their pairs (unlike DECLARES, fixed in PR-P4, which is derived from a member's own fields). No guard shipped; added an invariant guard test pinning the behavior. Co-Authored-By: Claude --- build_ast_graph.py | 98 ++++++++++++++++--- tests/test_incremental_graph.py | 166 ++++++++++++++++++++++++++++++-- 2 files changed, 242 insertions(+), 22 deletions(-) diff --git a/build_ast_graph.py b/build_ast_graph.py index 3e1857ca..3e75a32c 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -684,6 +684,54 @@ def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> se return dependent_files +def _find_annotation_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> set[str]: + """Find files that USE an annotation whose DEFINITION is among the changed nodes. + + Annotation usage is a node property (``annotations`` STRING[]), not a + Symbol->Symbol edge, so `_find_dependents` — which walks edges — never pulls + annotation users into scope. When an annotation definition changes (e.g. + ``@interface Foo`` gains a meta-annotation that shifts the Layer-A chain in + `resolve_role_and_capabilities`), every type carrying ``@Foo`` may need its + ``role``/``capabilities`` recomputed or it goes stale until the next full + rebuild. Return those users' files so the orchestrator treats them as + dependents (re-parsed, role re-SET); the expansion cap bounds the scope. + + Scope is direct usage only: a user of an annotation that transitively + composes the changed one (e.g. ``@A`` where ``@A`` is meta-annotated with the + changed ``@B``) is NOT pulled in — that reverse-chain walk is left to a + future hardening pass. The direct case covers the dominant real-world shape + (a stereotype annotation applied directly to many types). + """ + if not changed_node_ids: + return set() + # Changed annotation definitions → the simple names users reference them by. + # Runs before `_delete_file_scope`, so the def nodes still exist. + name_result = conn.execute( + "MATCH (s:Symbol) WHERE s.id IN $ids AND s.kind = 'annotation' RETURN s.name", + {"ids": list(changed_node_ids)}, + ) + names: list[str] = [] + while name_result.has_next(): + nm = name_result.get_next()[0] + if nm: + names.append(nm) + if not names: + return set() + dependent_files: set[str] = set() + for nm in names: + user_result = conn.execute( + "MATCH (s:Symbol) " + "WHERE list_contains(s.annotations, $nm) AND s.filename <> '' " + "RETURN DISTINCT s.filename", + {"nm": nm}, + ) + while user_result.has_next(): + fn = user_result.get_next()[0] + if fn: + dependent_files.add(fn) + return dependent_files + + def _delete_file_scope( conn: ladybug.Connection, changed_files: set[str], @@ -3089,6 +3137,19 @@ def _existing_node_ids(conn: ladybug.Connection) -> set[str]: "n.signature = $signature, n.parent_id = $parent_id, n.resolved = $resolved" ) +# Refresh every mutable Route field on an existing Route node by id. Mirrors the +# `_write_nodes_impl` Symbol pattern (bulk COPY new rows + per-row SET existing +# ones) so the global pass 5-6 step no longer needs a per-row MERGE upsert. +# Field list mirrors `_ROUTE_COLUMNS` minus `id`. +_SET_ROUTE_BY_ID = ( + "MATCH (r:Route {id: $id}) " + "SET r.kind = $kind, r.framework = $framework, r.method = $method, " + "r.path = $path, r.path_template = $path_template, r.path_regex = $path_regex, " + "r.topic = $topic, r.broker = $broker, r.feign_name = $feign_name, r.feign_url = $feign_url, " + "r.microservice = $microservice, r.module = $module, r.filename = $filename, " + "r.start_line = $start_line, r.end_line = $end_line, r.resolved = $resolved" +) + _REL_EXTENDS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"] _REL_IMPLEMENTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"] _REL_INJECTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved", "mechanism", "annotation", "field_or_param"] @@ -3776,6 +3837,12 @@ def incremental_rebuild( # Find dependents dependent_files = _find_dependents(conn, changed_node_ids) + # Annotation-definition change: also pull in files that USE the changed + # annotation. Annotation usage is a node property, not a Symbol->Symbol + # edge, so `_find_dependents` misses them and their role (derived from + # the project-wide meta-chain) would go stale. See PR-P5b. + dependent_files |= _find_annotation_dependents(conn, changed_node_ids) + # Union changed files with dependents scope_files = changed_files | dependent_files @@ -3940,22 +4007,21 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa Route nodes (created by pass5 for cross-service calls) that wouldn't otherwise exist in LadybugDB. """ - # Upsert every route via MERGE. `tables.routes_rows` is the full route set - # (pass4 routes + pass5 phantom routes), not just phantoms; MERGE is - # idempotent against routes already written during the scoped step, so it - # neither duplicates nor drops them. This is the one remaining per-row graph - # write — converting it to bulk COPY requires filtering against existing - # route ids to reproduce the dedup, and is left as a future optimization. - for row in tables.routes_rows: - conn.execute( - "MERGE (r:Route {id: $id}) " - "SET r.kind = $kind, r.framework = $framework, r.method = $method, " - "r.path = $path, r.path_template = $path_template, r.path_regex = $path_regex, " - "r.topic = $topic, r.broker = $broker, r.feign_name = $feign_name, r.feign_url = $feign_url, " - "r.microservice = $microservice, r.module = $module, r.filename = $filename, " - "r.start_line = $start_line, r.end_line = $end_line, r.resolved = $resolved", - asdict(row), - ) + # Bulk-write routes, mirroring `_write_nodes_impl`: COPY the rows that are new + # to the DB, and SET every mutable field on the routes already present (the + # caller does NOT delete routes, only Clients/Producers — see the global + # pass 5-6 block). `tables.routes_rows` is the full route set (pass4 routes + + # pass5 phantom routes), not just phantoms, so the SET keeps existing routes' + # properties in sync with pass5's cross-service enrichment while the COPY + # materializes phantoms that have no node yet. Replaces the last per-row + # graph write (MERGE upsert). + route_rows = [asdict(row) for row in tables.routes_rows] + existing_route_ids = _existing_node_ids(conn) + new_route_rows = [r for r in route_rows if r["id"] not in existing_route_ids] + _bulk_copy(conn, "Route", _ROUTE_COLUMNS, new_route_rows) + for r in route_rows: + if r["id"] in existing_route_ids: + conn.execute(_SET_ROUTE_BY_ID, r) # Build node_id lookup for members and types member_by_id = {m.node_id: m for m in tables.members} diff --git a/tests/test_incremental_graph.py b/tests/test_incremental_graph.py index 8e10466a..675b8232 100644 --- a/tests/test_incremental_graph.py +++ b/tests/test_incremental_graph.py @@ -1148,12 +1148,166 @@ def role_of(fqn: str) -> str: "(PR-P4 regression: skip-if-exists dropped the upsert)" ) - def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None: - """Pass5/6 Route MERGE dedup is preserved after bulk conversion. + def test_incremental_pulls_in_annotation_users_on_def_change( + self, tmp_path: Path + ) -> None: + """Editing an annotation definition refreshes its direct users' roles. + + Regression guard for the PR-P5b fix. Annotation usage is a node property + (``annotations`` STRING[]), not a Symbol->Symbol edge, so `_find_dependents` + — which walks edges — never pulls annotation users into scope. When an + annotation definition changes (e.g. ``@interface MyService`` gains a + ``@Service`` meta-annotation that shifts the Layer-A chain), types + carrying ``@MyService`` need their ``role`` recomputed or they go stale. + + Unlike `test_incremental_refreshes_dependent_role_on_meta_chain_change` + (PR-P4), the user here has NO edge to any changed node: it is pulled in + purely by the annotation-usage expansion (`_find_annotation_dependents`), + so this isolates the scope-tracking fix from the preserved-dependent + property refresh. + """ + from build_ast_graph import incremental_rebuild + from graph_enrich import collect_annotation_meta_chain + from _builders import build_ladybug_full_into + + source_root = tmp_path / "src" + source_root.mkdir() + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + java = source_root / "pkg" + java.mkdir(parents=True) + + # Svc carries @MyService but calls/extends nothing — no edge to any other + # node, so `_find_dependents` cannot pull it in. Only annotation usage can. + (java / "MyService.java").write_text( + "package pkg; public @interface MyService {}\n", encoding="utf-8" + ) + (java / "Svc.java").write_text( + "package pkg;\n@MyService\npublic class Svc {}\n", encoding="utf-8" + ) + build_ladybug_full_into(source_root, ladybug_path) + + # Edit ONLY the annotation definition: add a @Service meta-annotation so + # the chain maps MyService → Service and Svc's role flips OTHER → SERVICE. + (java / "MyService.java").write_text( + "package pkg;\nimport org.springframework.stereotype.Service;\n" + "@Service\npublic @interface MyService {}\n", + encoding="utf-8", + ) + collect_annotation_meta_chain.cache_clear() + result = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert result.mode == "incremental", f"expected incremental, got {result.mode}" + assert result.dependents_reprocessed >= 1, ( + "Svc should be pulled in as an annotation-dependent, not just the def file" + ) + + def role_of(fqn: str) -> str: + db = ladybug.Database(str(ladybug_path)) + conn = ladybug.Connection(db) + r = conn.execute("MATCH (n:Symbol {fqn: $fqn}) RETURN n.role", {"fqn": fqn}) + v = r.get_next()[0] if r.has_next() else None + conn.close() + db.close() + return v + + # Svc was pulled in by annotation usage (no edge to the changed def); its + # role must refresh to SERVICE to match a full rebuild of this state. + assert role_of("pkg.Svc") == "SERVICE", ( + "annotation user's role not refreshed after annotation-def change " + "(PR-P5b regression: annotation users not pulled into incremental scope)" + ) - Creates a corpus where pass5/6 re-emits an existing route and verifies - no duplicate Route rows after incremental rebuild (the retained MERGE - dedups against routes written during the scoped step). + def test_incremental_overrides_not_duplicated_for_non_scope_subtype( + self, tmp_path: Path + ) -> None: + """A non-scope subtype's OVERRIDES edge is not duplicated on increment. + + Invariant guard for the OVERRIDES path. Unlike DECLARES (derived purely + from a member's own ``parent_id``/``node_id``, which is why a loaded + non-scope member could duplicate it — fixed in PR-P4), an OVERRIDES pair + needs the subtype's supertype via `_direct_supertype_ids`, which reads + `tables.extends_rows`/`implements_rows`. Those edge tables are populated + only by `pass2_edges` over *parsed* files; cross-file resolution stubs + (`loaded_from_db`) load nodes but NOT edges, so a loaded subtype has no + derivable supertype and `_populate_overrides_rows` never re-emits its + OVERRIDES — the edge (``source_file`` = its out-of-scope file) survives + untouched, matching a full rebuild. + + This test pins that behavior down. If stub edge-loading is ever added as + an optimization, this guard will catch the resulting duplication (the + same class of bug PR-P4 fixed for DECLARES). + + Corpus: `TImpl implements T` overrides `foo()` in non-scope files; an + unrelated `Other.java` change triggers the increment so `TImpl`/`T` are + loaded as stubs. The increment must keep exactly one OVERRIDES edge. + """ + from build_ast_graph import incremental_rebuild + from _builders import build_ladybug_full_into + + source_root = tmp_path / "src" + source_root.mkdir() + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + java = source_root / "pkg" + java.mkdir(parents=True) + + (java / "T.java").write_text( + "package pkg; public interface T { void foo(); }\n", encoding="utf-8" + ) + (java / "TImpl.java").write_text( + "package pkg; public class TImpl implements T { public void foo() {} }\n", + encoding="utf-8", + ) + (java / "Other.java").write_text( + "package pkg; public class Other { public void go() {} }\n", encoding="utf-8" + ) + build_ladybug_full_into(source_root, ladybug_path) + + # Change Other.java only. Nothing references Other, so the scope is just + # {Other.java}; TImpl/T are non-scope and loaded as resolution stubs. + (java / "Other.java").write_text( + "package pkg; public class Other { public void go() {} public void more() {} }\n", + encoding="utf-8", + ) + result = incremental_rebuild(source_root, ladybug_path, verbose=False) + assert result.mode == "incremental", f"expected incremental, got {result.mode}" + + def overrides_count(path: Path) -> int: + db = ladybug.Database(str(path)) + conn = ladybug.Connection(db) + r = conn.execute("MATCH ()-[o:OVERRIDES]->() RETURN count(o)") + n = r.get_next()[0] if r.has_next() else 0 + conn.close() + db.close() + return n + + # Exactly one override relationship exists (TImpl.foo → T.foo); a + # duplicate (2) is the bug. Cross-check against a fresh full rebuild of + # the identical final state — the increment must match it. + full_dir = tmp_path / "full" + full_dir.mkdir() + full_path = full_dir / "code_graph.lbug" + build_ladybug_full_into(source_root, full_path) + + assert overrides_count(ladybug_path) == 1, ( + "non-scope OVERRIDES edge duplicated on increment " + "(PR-P5a regression: loaded subtype method re-emitted)" + ) + assert overrides_count(ladybug_path) == overrides_count(full_path), ( + "incremental OVERRIDES count diverged from full rebuild" + ) + + def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None: + """Pass5/6 Route write does not duplicate existing routes. + + Creates a corpus where pass5/6 re-emits an existing route and verifies + no duplicate Route rows after incremental rebuild. The global step now + bulk-writes routes (COPY new ids + SET existing ids — see PR-P5c), + replacing the per-row MERGE upsert this test was originally written for; + the name is kept for plan-reference continuity. The SET branch is what + dedups against routes written during the scoped step here. """ from build_ast_graph import incremental_rebuild, write_ladybug @@ -1216,7 +1370,7 @@ def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None: route_count = route_count_result.get_next()[0] # Should be exactly 1 route (no duplicates) - assert route_count == 1, f"Expected 1 route, found {route_count} (MERGE dedup failed)" + assert route_count == 1, f"Expected 1 route, found {route_count} (route dedup failed)" conn.close()