From 87a2ccfb8c9c9db5f04e9632f7ba8980057c4a00 Mon Sep 17 00:00:00 2001
From: Dmitry Teryaev <doudmitry@gmail.com>
Date: Mon, 22 Jun 2026 22:22:45 +0300
Subject: [PATCH] fix(graph): incremental correctness + perf follow-ups (PR-P5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three follow-ups to the init/increment-perf plan (PR-P1..P4):

PR-P5b (correctness): pull annotation-definition users into incremental
scope. Annotation usage is a node property (annotations STRING[]), not a
Symbol->Symbol edge, so _find_dependents never pulled annotation users in
— when an @interface definition changed (e.g. gained a meta-annotation
shifting the Layer-A chain in resolve_role_and_capabilities), types
carrying it went stale until the next full rebuild. Add
_find_annotation_dependents; merge its results into dependent_files so
users are re-parsed and role-re-SET (the expansion cap bounds scope).
Direct usage only; transitive composition documented as a known limitation.

PR-P5c (perf): convert the last per-row graph write — the global pass 5-6
Route MERGE upsert — to bulk, mirroring _write_nodes_impl's Symbol pattern
(_SET_ROUTE_BY_ID + COPY-new/SET-existing). Behavior-preserving; routes_rows
already carries unique ids.

PR-P5a (investigation, no code): confirmed OVERRIDES does NOT duplicate on
increment — loaded stub types carry no supertype edges, so
_populate_overrides_rows can't re-derive their pairs (unlike DECLARES,
fixed in PR-P4, which is derived from a member's own fields). No guard
shipped; added an invariant guard test pinning the behavior.

Co-Authored-By: Claude <noreply@anthropic.com>
---
 build_ast_graph.py              |  98 ++++++++++++++++---
 tests/test_incremental_graph.py | 166 ++++++++++++++++++++++++++++++--
 2 files changed, 242 insertions(+), 22 deletions(-)

diff --git a/build_ast_graph.py b/build_ast_graph.py
index 3e1857ca..3e75a32c 100644
--- a/build_ast_graph.py
+++ b/build_ast_graph.py
@@ -684,6 +684,54 @@ def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> se
     return dependent_files
 
 
+def _find_annotation_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> set[str]:
+    """Find files that USE an annotation whose DEFINITION is among the changed nodes.
+
+    Annotation usage is a node property (``annotations`` STRING[]), not a
+    Symbol->Symbol edge, so `_find_dependents` — which walks edges — never pulls
+    annotation users into scope. When an annotation definition changes (e.g.
+    ``@interface Foo`` gains a meta-annotation that shifts the Layer-A chain in
+    `resolve_role_and_capabilities`), every type carrying ``@Foo`` may need its
+    ``role``/``capabilities`` recomputed or it goes stale until the next full
+    rebuild. Return those users' files so the orchestrator treats them as
+    dependents (re-parsed, role re-SET); the expansion cap bounds the scope.
+
+    Scope is direct usage only: a user of an annotation that transitively
+    composes the changed one (e.g. ``@A`` where ``@A`` is meta-annotated with the
+    changed ``@B``) is NOT pulled in — that reverse-chain walk is left to a
+    future hardening pass. The direct case covers the dominant real-world shape
+    (a stereotype annotation applied directly to many types).
+    """
+    if not changed_node_ids:
+        return set()
+    # Changed annotation definitions → the simple names users reference them by.
+    # Runs before `_delete_file_scope`, so the def nodes still exist.
+    name_result = conn.execute(
+        "MATCH (s:Symbol) WHERE s.id IN $ids AND s.kind = 'annotation' RETURN s.name",
+        {"ids": list(changed_node_ids)},
+    )
+    names: list[str] = []
+    while name_result.has_next():
+        nm = name_result.get_next()[0]
+        if nm:
+            names.append(nm)
+    if not names:
+        return set()
+    dependent_files: set[str] = set()
+    for nm in names:
+        user_result = conn.execute(
+            "MATCH (s:Symbol) "
+            "WHERE list_contains(s.annotations, $nm) AND s.filename <> '' "
+            "RETURN DISTINCT s.filename",
+            {"nm": nm},
+        )
+        while user_result.has_next():
+            fn = user_result.get_next()[0]
+            if fn:
+                dependent_files.add(fn)
+    return dependent_files
+
+
 def _delete_file_scope(
     conn: ladybug.Connection,
     changed_files: set[str],
@@ -3089,6 +3137,19 @@ def _existing_node_ids(conn: ladybug.Connection) -> set[str]:
     "n.signature = $signature, n.parent_id = $parent_id, n.resolved = $resolved"
 )
 
+# Refresh every mutable Route field on an existing Route node by id. Mirrors the
+# `_write_nodes_impl` Symbol pattern (bulk COPY new rows + per-row SET existing
+# ones) so the global pass 5-6 step no longer needs a per-row MERGE upsert.
+# Field list mirrors `_ROUTE_COLUMNS` minus `id`.
+_SET_ROUTE_BY_ID = (
+    "MATCH (r:Route {id: $id}) "
+    "SET r.kind = $kind, r.framework = $framework, r.method = $method, "
+    "r.path = $path, r.path_template = $path_template, r.path_regex = $path_regex, "
+    "r.topic = $topic, r.broker = $broker, r.feign_name = $feign_name, r.feign_url = $feign_url, "
+    "r.microservice = $microservice, r.module = $module, r.filename = $filename, "
+    "r.start_line = $start_line, r.end_line = $end_line, r.resolved = $resolved"
+)
+
 _REL_EXTENDS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
 _REL_IMPLEMENTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved"]
 _REL_INJECTS_COLUMNS = ["FROM", "TO", "source_file", "dst_name", "dst_fqn", "resolved", "mechanism", "annotation", "field_or_param"]
@@ -3776,6 +3837,12 @@ def incremental_rebuild(
         # Find dependents
         dependent_files = _find_dependents(conn, changed_node_ids)
 
+        # Annotation-definition change: also pull in files that USE the changed
+        # annotation. Annotation usage is a node property, not a Symbol->Symbol
+        # edge, so `_find_dependents` misses them and their role (derived from
+        # the project-wide meta-chain) would go stale. See PR-P5b.
+        dependent_files |= _find_annotation_dependents(conn, changed_node_ids)
+
         # Union changed files with dependents
         scope_files = changed_files | dependent_files
 
@@ -3940,22 +4007,21 @@ def _write_clients_producers_and_calls(conn: ladybug.Connection, tables: GraphTa
     Route nodes (created by pass5 for cross-service calls) that wouldn't
     otherwise exist in LadybugDB.
     """
-    # Upsert every route via MERGE. `tables.routes_rows` is the full route set
-    # (pass4 routes + pass5 phantom routes), not just phantoms; MERGE is
-    # idempotent against routes already written during the scoped step, so it
-    # neither duplicates nor drops them. This is the one remaining per-row graph
-    # write — converting it to bulk COPY requires filtering against existing
-    # route ids to reproduce the dedup, and is left as a future optimization.
-    for row in tables.routes_rows:
-        conn.execute(
-            "MERGE (r:Route {id: $id}) "
-            "SET r.kind = $kind, r.framework = $framework, r.method = $method, "
-            "r.path = $path, r.path_template = $path_template, r.path_regex = $path_regex, "
-            "r.topic = $topic, r.broker = $broker, r.feign_name = $feign_name, r.feign_url = $feign_url, "
-            "r.microservice = $microservice, r.module = $module, r.filename = $filename, "
-            "r.start_line = $start_line, r.end_line = $end_line, r.resolved = $resolved",
-            asdict(row),
-        )
+    # Bulk-write routes, mirroring `_write_nodes_impl`: COPY the rows that are new
+    # to the DB, and SET every mutable field on the routes already present (the
+    # caller does NOT delete routes, only Clients/Producers — see the global
+    # pass 5-6 block). `tables.routes_rows` is the full route set (pass4 routes +
+    # pass5 phantom routes), not just phantoms, so the SET keeps existing routes'
+    # properties in sync with pass5's cross-service enrichment while the COPY
+    # materializes phantoms that have no node yet. Replaces the last per-row
+    # graph write (MERGE upsert).
+    route_rows = [asdict(row) for row in tables.routes_rows]
+    existing_route_ids = _existing_node_ids(conn)
+    new_route_rows = [r for r in route_rows if r["id"] not in existing_route_ids]
+    _bulk_copy(conn, "Route", _ROUTE_COLUMNS, new_route_rows)
+    for r in route_rows:
+        if r["id"] in existing_route_ids:
+            conn.execute(_SET_ROUTE_BY_ID, r)
 
     # Build node_id lookup for members and types
     member_by_id = {m.node_id: m for m in tables.members}
diff --git a/tests/test_incremental_graph.py b/tests/test_incremental_graph.py
index 8e10466a..675b8232 100644
--- a/tests/test_incremental_graph.py
+++ b/tests/test_incremental_graph.py
@@ -1148,12 +1148,166 @@ def role_of(fqn: str) -> str:
             "(PR-P4 regression: skip-if-exists dropped the upsert)"
         )
 
-    def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None:
-        """Pass5/6 Route MERGE dedup is preserved after bulk conversion.
+    def test_incremental_pulls_in_annotation_users_on_def_change(
+        self, tmp_path: Path
+    ) -> None:
+        """Editing an annotation definition refreshes its direct users' roles.
+
+        Regression guard for the PR-P5b fix. Annotation usage is a node property
+        (``annotations`` STRING[]), not a Symbol->Symbol edge, so `_find_dependents`
+        — which walks edges — never pulls annotation users into scope. When an
+        annotation definition changes (e.g. ``@interface MyService`` gains a
+        ``@Service`` meta-annotation that shifts the Layer-A chain), types
+        carrying ``@MyService`` need their ``role`` recomputed or they go stale.
+
+        Unlike `test_incremental_refreshes_dependent_role_on_meta_chain_change`
+        (PR-P4), the user here has NO edge to any changed node: it is pulled in
+        purely by the annotation-usage expansion (`_find_annotation_dependents`),
+        so this isolates the scope-tracking fix from the preserved-dependent
+        property refresh.
+        """
+        from build_ast_graph import incremental_rebuild
+        from graph_enrich import collect_annotation_meta_chain
+        from _builders import build_ladybug_full_into
+
+        source_root = tmp_path / "src"
+        source_root.mkdir()
+        index_dir = tmp_path / "index"
+        index_dir.mkdir()
+        ladybug_path = index_dir / "code_graph.lbug"
+        java = source_root / "pkg"
+        java.mkdir(parents=True)
+
+        # Svc carries @MyService but calls/extends nothing — no edge to any other
+        # node, so `_find_dependents` cannot pull it in. Only annotation usage can.
+        (java / "MyService.java").write_text(
+            "package pkg; public @interface MyService {}\n", encoding="utf-8"
+        )
+        (java / "Svc.java").write_text(
+            "package pkg;\n@MyService\npublic class Svc {}\n", encoding="utf-8"
+        )
+        build_ladybug_full_into(source_root, ladybug_path)
+
+        # Edit ONLY the annotation definition: add a @Service meta-annotation so
+        # the chain maps MyService → Service and Svc's role flips OTHER → SERVICE.
+        (java / "MyService.java").write_text(
+            "package pkg;\nimport org.springframework.stereotype.Service;\n"
+            "@Service\npublic @interface MyService {}\n",
+            encoding="utf-8",
+        )
+        collect_annotation_meta_chain.cache_clear()
+        result = incremental_rebuild(source_root, ladybug_path, verbose=False)
+        assert result.mode == "incremental", f"expected incremental, got {result.mode}"
+        assert result.dependents_reprocessed >= 1, (
+            "Svc should be pulled in as an annotation-dependent, not just the def file"
+        )
+
+        def role_of(fqn: str) -> str:
+            db = ladybug.Database(str(ladybug_path))
+            conn = ladybug.Connection(db)
+            r = conn.execute("MATCH (n:Symbol {fqn: $fqn}) RETURN n.role", {"fqn": fqn})
+            v = r.get_next()[0] if r.has_next() else None
+            conn.close()
+            db.close()
+            return v
+
+        # Svc was pulled in by annotation usage (no edge to the changed def); its
+        # role must refresh to SERVICE to match a full rebuild of this state.
+        assert role_of("pkg.Svc") == "SERVICE", (
+            "annotation user's role not refreshed after annotation-def change "
+            "(PR-P5b regression: annotation users not pulled into incremental scope)"
+        )
 
-       Creates a corpus where pass5/6 re-emits an existing route and verifies
-        no duplicate Route rows after incremental rebuild (the retained MERGE
-        dedups against routes written during the scoped step).
+    def test_incremental_overrides_not_duplicated_for_non_scope_subtype(
+        self, tmp_path: Path
+    ) -> None:
+        """A non-scope subtype's OVERRIDES edge is not duplicated on increment.
+
+        Invariant guard for the OVERRIDES path. Unlike DECLARES (derived purely
+        from a member's own ``parent_id``/``node_id``, which is why a loaded
+        non-scope member could duplicate it — fixed in PR-P4), an OVERRIDES pair
+        needs the subtype's supertype via `_direct_supertype_ids`, which reads
+        `tables.extends_rows`/`implements_rows`. Those edge tables are populated
+        only by `pass2_edges` over *parsed* files; cross-file resolution stubs
+        (`loaded_from_db`) load nodes but NOT edges, so a loaded subtype has no
+        derivable supertype and `_populate_overrides_rows` never re-emits its
+        OVERRIDES — the edge (``source_file`` = its out-of-scope file) survives
+        untouched, matching a full rebuild.
+
+        This test pins that behavior down. If stub edge-loading is ever added as
+        an optimization, this guard will catch the resulting duplication (the
+        same class of bug PR-P4 fixed for DECLARES).
+
+        Corpus: `TImpl implements T` overrides `foo()` in non-scope files; an
+        unrelated `Other.java` change triggers the increment so `TImpl`/`T` are
+        loaded as stubs. The increment must keep exactly one OVERRIDES edge.
+        """
+        from build_ast_graph import incremental_rebuild
+        from _builders import build_ladybug_full_into
+
+        source_root = tmp_path / "src"
+        source_root.mkdir()
+        index_dir = tmp_path / "index"
+        index_dir.mkdir()
+        ladybug_path = index_dir / "code_graph.lbug"
+        java = source_root / "pkg"
+        java.mkdir(parents=True)
+
+        (java / "T.java").write_text(
+            "package pkg; public interface T { void foo(); }\n", encoding="utf-8"
+        )
+        (java / "TImpl.java").write_text(
+            "package pkg; public class TImpl implements T { public void foo() {} }\n",
+            encoding="utf-8",
+        )
+        (java / "Other.java").write_text(
+            "package pkg; public class Other { public void go() {} }\n", encoding="utf-8"
+        )
+        build_ladybug_full_into(source_root, ladybug_path)
+
+        # Change Other.java only. Nothing references Other, so the scope is just
+        # {Other.java}; TImpl/T are non-scope and loaded as resolution stubs.
+        (java / "Other.java").write_text(
+            "package pkg; public class Other { public void go() {} public void more() {} }\n",
+            encoding="utf-8",
+        )
+        result = incremental_rebuild(source_root, ladybug_path, verbose=False)
+        assert result.mode == "incremental", f"expected incremental, got {result.mode}"
+
+        def overrides_count(path: Path) -> int:
+            db = ladybug.Database(str(path))
+            conn = ladybug.Connection(db)
+            r = conn.execute("MATCH ()-[o:OVERRIDES]->() RETURN count(o)")
+            n = r.get_next()[0] if r.has_next() else 0
+            conn.close()
+            db.close()
+            return n
+
+        # Exactly one override relationship exists (TImpl.foo → T.foo); a
+        # duplicate (2) is the bug. Cross-check against a fresh full rebuild of
+        # the identical final state — the increment must match it.
+        full_dir = tmp_path / "full"
+        full_dir.mkdir()
+        full_path = full_dir / "code_graph.lbug"
+        build_ladybug_full_into(source_root, full_path)
+
+        assert overrides_count(ladybug_path) == 1, (
+            "non-scope OVERRIDES edge duplicated on increment "
+            "(PR-P5a regression: loaded subtype method re-emitted)"
+        )
+        assert overrides_count(ladybug_path) == overrides_count(full_path), (
+            "incremental OVERRIDES count diverged from full rebuild"
+        )
+
+    def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None:
+        """Pass5/6 Route write does not duplicate existing routes.
+
+        Creates a corpus where pass5/6 re-emits an existing route and verifies
+        no duplicate Route rows after incremental rebuild. The global step now
+        bulk-writes routes (COPY new ids + SET existing ids — see PR-P5c),
+        replacing the per-row MERGE upsert this test was originally written for;
+        the name is kept for plan-reference continuity. The SET branch is what
+        dedups against routes written during the scoped step here.
         """
         from build_ast_graph import incremental_rebuild, write_ladybug
 
@@ -1216,7 +1370,7 @@ def test_incremental_route_merge_dedup_preserved(self, tmp_path: Path) -> None:
             route_count = route_count_result.get_next()[0]
 
         # Should be exactly 1 route (no duplicates)
-        assert route_count == 1, f"Expected 1 route, found {route_count} (MERGE dedup failed)"
+        assert route_count == 1, f"Expected 1 route, found {route_count} (route dedup failed)"
 
         conn.close()