From 846818a30e1871ba8e1158d1f890d427e0a94216 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sat, 13 Jun 2026 19:13:56 +0300 Subject: [PATCH 1/2] fix: preserve dependent nodes in incremental graph rebuild (#305) _delete_file_scope deleted Symbol nodes for both changed and dependent files during incremental rebuild. Dependent nodes can have incoming CALLS edges from out-of-scope callers: the orchestrator expands dependents from changed nodes only, so callers OF dependent nodes are never pulled into scope, and source_file on an edge is the caller's file, so Phase 1's outgoing-edge delete left those incoming edges in place. The plain DELETE then crashed ('Node ... has connected edges in table CALLS in the bwd direction') and the rebuild fell back to full every time. Delete Symbol nodes only for changed files. Dependent nodes are kept and re-MERGEd in place on their deterministic symbol_id, so their incoming edges from out-of-scope callers survive (no crash, no silent edge loss). Phase 3 uses DETACH DELETE for changed nodes as a safety net. No schema or ontology change, so no re-index burden. Fixes #305 Co-Authored-By: Claude --- build_ast_graph.py | 81 +++++++++---- tests/test_incremental_graph.py | 205 +++++++++++++++++++++++++++++++- 2 files changed, 261 insertions(+), 25 deletions(-) diff --git a/build_ast_graph.py b/build_ast_graph.py index 8ba103c5..cc7cd84e 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -612,23 +612,49 @@ def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> se return dependent_files -def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None: - """Delete all nodes and edges originating from the given files. - - Skip phantom nodes (filename=""). Deletes ALL edge types in Phase 1, - then nodes in subsequent phases. Route/Client/Producer nodes use - DETACH DELETE as a safety net for any edges missed in Phase 1. - - Edges are deleted in batch across all filenames first to avoid LadybugDB - "has connected edges" errors when edges from one file point to nodes - in another file within the same scope. +def _delete_file_scope( + conn: ladybug.Connection, + changed_files: set[str], + dependent_files: set[str], +) -> None: + """Delete nodes and edges for a scope split into changed vs dependent files. + + ``changed_files`` are files whose content actually changed (added/modified/ + removed): their Symbol nodes are deleted (and re-created by ``_scoped_write``). + ``dependent_files`` are files pulled in only to re-resolve their OUTGOING + edges against the changed nodes; their node definitions did not change, so + their nodes are deliberately PRESERVED (they re-MERGE in place on the same + deterministic ``symbol_id``). Skipping phantom nodes (filename=""). + + Why dependents are preserved (issue #305): the orchestrator computes + dependents from the *changed* nodes only, so a dependent file's node can + have an incoming CALLS edge from an out-of-scope caller. The ``source_file`` + on every Symbol->Symbol edge is the CALLER's file (pinned by + ``test_source_file_value_matches_symbol_filename``), so Phase 1 below only + deletes edges ORIGINATING in scope; incoming edges from out-of-scope callers + survive. If we then tried to DELETE the dependent node, LadybugDB rejects it + ("Node ... has connected edges in table CALLS in the bwd direction, ... + Please delete the edges first or try DETACH DELETE") and the rebuild falls + back to a full rebuild. A naive fix (DETACH DELETE on dependents, or an + extra incoming-edge pass) would silence the crash but permanently drop those + out-of-scope edges, corrupting the graph. Preserving dependent nodes keeps + both the nodes and their incoming edges intact. + + Phase 1 deletes ALL edge types across the whole scope (changed + dependent) + first to avoid LadybugDB "has connected edges" errors when edges from one + file point to nodes in another file within the same scope. Route/Client/ + Producer nodes use DETACH DELETE as a safety net for any edges missed in + Phase 1. """ - filename_list = list(filenames) - - # Phase 1: Delete ALL edges from ALL scope files at once. - # This avoids ordering issues where file A has an edge from file B - # pointing into it; if we delete A's nodes before B's edges, LadybugDB - # raises "has connected edges" errors. + scope_files = changed_files | dependent_files + scope_list = list(scope_files) + changed_list = list(changed_files) + + # Phase 1: Delete ALL edges ORIGINATING from any scope file (changed + + # dependent). Because `source_file` is the caller's file, this deletes edges + # whose source is in scope (including dependents' outgoing edges to changed + # nodes) while intentionally leaving incoming edges from out-of-scope callers + # intact — those must survive so the dependent nodes below can be preserved. edge_tables = [ "EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES", "UNRESOLVED_AT", "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER", @@ -640,7 +666,7 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None: WHERE e.source_file IN $filenames DELETE e """ - conn.execute(query, {"filenames": filename_list}) + conn.execute(query, {"filenames": scope_list}) # Phase 2: Collect all Symbol node IDs for UnresolvedCallSite cleanup. symbol_ids: list[str] = [] @@ -649,7 +675,7 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None: WHERE s.filename IN $filenames RETURN s.id """ - result = conn.execute(symbol_ids_query, {"filenames": filename_list}) + result = conn.execute(symbol_ids_query, {"filenames": scope_list}) while result.has_next(): row = result.get_next() symbol_ids.append(row[0]) @@ -663,20 +689,29 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None: """ conn.execute(unresolved_query, {"symbol_ids": symbol_ids}) - # Phase 3: Delete Symbol nodes. + # Phase 3: Delete Symbol nodes ONLY for changed files (not dependents). + # Dependent-file nodes are deliberately PRESERVED so their incoming edges + # from out-of-scope callers survive; the dependents are re-MERGEd in place + # by `_scoped_write` on the same deterministic node id. A changed node's + # real incoming edges all come from dependent files (callers pulled into + # scope by `_find_dependents`), so Phase 1 already removed them and the + # dependents re-emit them when reprocessed. DETACH DELETE is only a safety + # net for the rare surviving edge whose source was NOT pulled into scope + # (e.g. a phantom caller with filename="", which `_find_dependents` skips); + # such an edge is stale once the node is recreated, so dropping it is fine. delete_symbols_query = """ MATCH (s:Symbol) WHERE s.filename IN $filenames - DELETE s + DETACH DELETE s """ - conn.execute(delete_symbols_query, {"filenames": filename_list}) + conn.execute(delete_symbols_query, {"filenames": changed_list}) # Phase 4: Delete Route, Client, Producer nodes. # Use DETACH DELETE as a safety net in case any edges were missed in Phase 1. for label in ["Route", "Client", "Producer"]: conn.execute( f"MATCH (n:{label}) WHERE n.filename IN $filenames DETACH DELETE n", - {"filenames": filename_list}, + {"filenames": scope_list}, ) @@ -3524,7 +3559,7 @@ def incremental_rebuild( # Step 4: Scoped deletion if verbose: _verbose_stderr_line("[increment] deleting outdated nodes and edges") - _delete_file_scope(conn, scope_files) + _delete_file_scope(conn, changed_files, dependent_files) # Force deletion to be applied by running a dummy query conn.execute("MATCH (s:Symbol) RETURN count(*)") diff --git a/tests/test_incremental_graph.py b/tests/test_incremental_graph.py index 894212c1..7a4121fe 100644 --- a/tests/test_incremental_graph.py +++ b/tests/test_incremental_graph.py @@ -738,7 +738,11 @@ def test_find_dependents_returns_incoming_edge_sources(self, tmp_path: Path) -> conn.close() def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None: - """Delete scope for one file, verify other files' nodes/edges untouched.""" + """Delete scope for one file (changed), verify other files' nodes/edges untouched. + + Uses the new (changed_files, dependent_files) signature with an empty + dependent set so behavior matches the legacy single-file case. + """ from build_ast_graph import _delete_file_scope source_root = tmp_path / "src" @@ -761,7 +765,7 @@ def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None: conn.execute("MATCH (s:Symbol) RETURN count(*)") # Delete only A.java's scope - _delete_file_scope(conn, {"A.java"}) + _delete_file_scope(conn, changed_files={"A.java"}, dependent_files=set()) # Verify A's nodes are gone but B's remain a_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.A' RETURN count(*)") @@ -778,3 +782,200 @@ def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None: assert b_count > 0 conn.close() + + def test_delete_file_scope_preserves_dependent_nodes(self, tmp_path: Path) -> None: + """Direct unit test for the #305 fix. + + Build a C -> B -> A call chain (only B is a dependent of changed A; C is + out of scope because it has no edge into A). Then call + _delete_file_scope(changed_files={A}, dependent_files={B}) and assert: + no exception, A's node is gone, B and C nodes are preserved, and the + out-of-scope C->B CALLS edge survives. + """ + from build_ast_graph import _delete_file_scope, write_ladybug + + source_root = tmp_path / "src" + source_root.mkdir() + ladybug_path = tmp_path / "code_graph.lbug" + + # C calls B.b; B calls A.a. (pass1-3 needed to produce CALLS edges.) + (source_root / "A.java").write_text( + "package pkg; class A { void a() {} }", encoding="utf-8" + ) + (source_root / "B.java").write_text( + "package pkg; class B {\n" + " void b() {\n" + " A a = new A();\n" + " a.a();\n" + " }\n" + "}", + encoding="utf-8", + ) + (source_root / "C.java").write_text( + "package pkg; class C {\n" + " void c() {\n" + " B b = new B();\n" + " b.b();\n" + " }\n" + "}", + encoding="utf-8", + ) + + tables = GraphTables() + asts = pass1_parse(source_root, tables, verbose=False) + pass2_edges(tables, asts, verbose=False) + from build_ast_graph import pass3_calls + pass3_calls(tables, asts, verbose=False) + write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False) + + db = ladybug.Database(str(ladybug_path)) + conn = ladybug.Connection(db) + + # Sanity: the C->B CALLS edge must exist for this test to be meaningful. + cb_result = conn.execute( + "MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) " + "RETURN count(*)" + ) + cb_count = 0 + if cb_result.has_next(): + cb_count = cb_result.get_next()[0] + assert cb_count > 0, "seeded graph must contain a C->B CALLS edge" + + # A is changed; B is its dependent; C is out of scope. + _delete_file_scope( + conn, changed_files={"A.java"}, dependent_files={"B.java"} + ) + + # A's node must be gone. + a_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.A' RETURN count(*)") + a_count = 0 + if a_result.has_next(): + a_count = a_result.get_next()[0] + assert a_count == 0 + + # B and C nodes must survive. + b_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.B' RETURN count(*)") + c_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.C' RETURN count(*)") + b_count = 0 + c_count = 0 + if b_result.has_next(): + b_count = b_result.get_next()[0] + if c_result.has_next(): + c_count = c_result.get_next()[0] + assert b_count > 0 + assert c_count > 0 + + # The out-of-scope C->B CALLS edge must survive. + cb_after_result = conn.execute( + "MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) " + "RETURN count(*)" + ) + cb_after_count = 0 + if cb_after_result.has_next(): + cb_after_count = cb_after_result.get_next()[0] + assert cb_after_count > 0 + + conn.close() + + def test_incremental_preserves_incoming_edges_to_dependent(self, tmp_path: Path) -> None: + """End-to-end repro for GitHub issue #305. + + Topology C -> B -> A (C.c calls B.b, B.b calls A.a). Change A.java and run + incremental_rebuild. On the unfixed code the dependent B is pulled into + scope but its out-of-scope caller C is not; the surviving C->B CALLS edge + crashes the dependent node delete and the rebuild falls back to full. + + After the fix: mode is "incremental", B's node survives, and the C->B + CALLS edge is preserved. + """ + from build_ast_graph import incremental_rebuild, write_ladybug + + source_root = tmp_path / "src" + source_root.mkdir() + index_dir = tmp_path / "index" + index_dir.mkdir() + ladybug_path = index_dir / "code_graph.lbug" + + (source_root / "A.java").write_text( + "package pkg; class A { void a() {} }", encoding="utf-8" + ) + (source_root / "B.java").write_text( + "package pkg; class B {\n" + " void b() {\n" + " A a = new A();\n" + " a.a();\n" + " }\n" + "}", + encoding="utf-8", + ) + (source_root / "C.java").write_text( + "package pkg; class C {\n" + " void c() {\n" + " B b = new B();\n" + " b.b();\n" + " }\n" + "}", + encoding="utf-8", + ) + + # Initial build (pass1-3 for CALLS edges). + tables = GraphTables() + asts = pass1_parse(source_root, tables, verbose=False) + pass2_edges(tables, asts, verbose=False) + from build_ast_graph import pass3_calls + pass3_calls(tables, asts, verbose=False) + write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False) + + # Sanity: C->B CALLS edge must exist in the seeded graph. + db = ladybug.Database(str(ladybug_path)) + conn = ladybug.Connection(db) + cb_result = conn.execute( + "MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) " + "RETURN count(*)" + ) + cb_count = 0 + if cb_result.has_next(): + cb_count = cb_result.get_next()[0] + assert cb_count > 0, "seeded graph must contain a C->B CALLS edge" + conn.close() + + # Initialize hash tracker for all files. + tracker = FileHashTracker(index_dir) + ignore = LayeredIgnore(source_root, use_gitignore=False, builtin_patterns=[]) + tracker.detect_changes(source_root, ignore) + for rel_path in ["A.java", "B.java", "C.java"]: + tracker.update({rel_path}, source_root) + tracker.save() + + # Change A.java. + (source_root / "A.java").write_text( + "package pkg; class A { void a() {} void a2() {} }", encoding="utf-8" + ) + + result = incremental_rebuild(source_root, ladybug_path, verbose=False) + + assert result.mode == "incremental", ( + f"expected incremental, got {result.mode!r} (likely the bwd-edge crash)" + ) + + # B's node and the C->B CALLS edge must survive. + db = ladybug.Database(str(ladybug_path)) + conn = ladybug.Connection(db) + b_result = conn.execute( + "MATCH (s:Symbol) WHERE s.fqn = 'pkg.B' RETURN count(*)" + ) + b_count = 0 + if b_result.has_next(): + b_count = b_result.get_next()[0] + assert b_count > 0, "dependent node B must be preserved" + + cb_after_result = conn.execute( + "MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) " + "RETURN count(*)" + ) + cb_after_count = 0 + if cb_after_result.has_next(): + cb_after_count = cb_after_result.get_next()[0] + assert cb_after_count > 0, "out-of-scope C->B CALLS edge must be preserved" + + conn.close() From 6201b6ac08f2e8e85fda3d2bb13d6a47032ff068 Mon Sep 17 00:00:00 2001 From: Dmitry Teryaev Date: Sat, 13 Jun 2026 19:33:35 +0300 Subject: [PATCH 2/2] extract _SYMBOL_TO_SYMBOL_EDGE_TYPES; document edge-list invariant Address PR #311 review feedback (non-blocking follow-ups): - Extract the `_SYMBOL_TO_SYMBOL_EDGE_TYPES` module constant and derive `_find_dependents` from it. The #305 fix's safety rests on `_find_dependents` enumerating EVERY Symbol->Symbol edge type (so every real caller of a changed node enters scope and Phase 1 removes its edge before the node delete). The named constant plus cross-references in the Phase 1 and Phase 3 comments make that invariant enforced/documented by construction, instead of two parallel hand-maintained lists (6 vs 12 types) silently needing to stay in sync. - Phase 2: comment why deleting UnresolvedCallSite children of preserved dependents is safe (scope files, dependents included, are reprocessed and re-emit them in `_scoped_write`). No behavior change; tests unchanged and still green. Co-Authored-By: Claude --- build_ast_graph.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/build_ast_graph.py b/build_ast_graph.py index cc7cd84e..cce16631 100644 --- a/build_ast_graph.py +++ b/build_ast_graph.py @@ -588,15 +588,25 @@ def _load_existing_members(conn: ladybug.Connection, tables: GraphTables, exclud )) +# Every Symbol->Symbol REL TABLE type in the graph schema. A Symbol node can +# only have an INCOMING edge of one of these types, so `_find_dependents` MUST +# walk all of them: that completeness is what makes the changed-node DETACH +# DELETE in `_delete_file_scope` Phase 3 safe (every real caller of a changed +# node is pulled into scope, so Phase 1 removes the edge before the node delete). +# If you add a new Symbol->Symbol edge type to the schema, add it here too — +# otherwise changed-node deletion would silently drop its surviving edges. +_SYMBOL_TO_SYMBOL_EDGE_TYPES = ( + "EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES", +) + + def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> set[str]: """Find files whose nodes have edges pointing into changed nodes. Returns set of filenames.""" dependent_files: set[str] = set() - # Query each Symbol-to-Symbol edge table for incoming edges - edge_types = ["EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES"] params = {"changed_ids": list(changed_node_ids)} - for edge_type in edge_types: + for edge_type in _SYMBOL_TO_SYMBOL_EDGE_TYPES: query = f""" MATCH (src:Symbol)-[e:{edge_type}]->(dst:Symbol) WHERE dst.id IN $changed_ids @@ -655,6 +665,9 @@ def _delete_file_scope( # whose source is in scope (including dependents' outgoing edges to changed # nodes) while intentionally leaving incoming edges from out-of-scope callers # intact — those must survive so the dependent nodes below can be preserved. + # This list is a superset of `_SYMBOL_TO_SYMBOL_EDGE_TYPES` (it also covers + # Symbol->Route/Client/Producer/UCS and Client/Producer->Route edges); keep + # both lists in sync with the schema. edge_tables = [ "EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES", "UNRESOLVED_AT", "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER", @@ -680,7 +693,10 @@ def _delete_file_scope( row = result.get_next() symbol_ids.append(row[0]) - # Delete UnresolvedCallSite nodes whose caller_id is in the collected set + # Delete UnresolvedCallSite nodes whose caller_id is in the collected set. + # These are children of scope symbols (including preserved dependents); + # deleting them is safe because every scope file — dependents included — is + # reprocessed and re-emits its UnresolvedCallSite nodes in `_scoped_write`. if symbol_ids: unresolved_query = """ MATCH (u:UnresolvedCallSite) @@ -694,7 +710,8 @@ def _delete_file_scope( # from out-of-scope callers survive; the dependents are re-MERGEd in place # by `_scoped_write` on the same deterministic node id. A changed node's # real incoming edges all come from dependent files (callers pulled into - # scope by `_find_dependents`), so Phase 1 already removed them and the + # scope by `_find_dependents`, which walks every type in + # `_SYMBOL_TO_SYMBOL_EDGE_TYPES`), so Phase 1 already removed them and the # dependents re-emit them when reprocessed. DETACH DELETE is only a safety # net for the rare surviving edge whose source was NOT pulled into scope # (e.g. a phantom caller with filename="", which `_find_dependents` skips);