Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 79 additions & 27 deletions build_ast_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,15 +588,25 @@ def _load_existing_members(conn: ladybug.Connection, tables: GraphTables, exclud
))


# Every Symbol->Symbol REL TABLE type in the graph schema. A Symbol node can
# only have an INCOMING edge of one of these types, so `_find_dependents` MUST
# walk all of them: that completeness is what makes the changed-node DETACH
# DELETE in `_delete_file_scope` Phase 3 safe (every real caller of a changed
# node is pulled into scope, so Phase 1 removes the edge before the node delete).
# If you add a new Symbol->Symbol edge type to the schema, add it here too —
# otherwise changed-node deletion would silently drop its surviving edges.
_SYMBOL_TO_SYMBOL_EDGE_TYPES = (
"EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES",
)


def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> set[str]:
"""Find files whose nodes have edges pointing into changed nodes. Returns set of filenames."""
dependent_files: set[str] = set()

# Query each Symbol-to-Symbol edge table for incoming edges
edge_types = ["EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES"]
params = {"changed_ids": list(changed_node_ids)}

for edge_type in edge_types:
for edge_type in _SYMBOL_TO_SYMBOL_EDGE_TYPES:
query = f"""
MATCH (src:Symbol)-[e:{edge_type}]->(dst:Symbol)
WHERE dst.id IN $changed_ids
Expand All @@ -612,23 +622,52 @@ def _find_dependents(conn: ladybug.Connection, changed_node_ids: set[str]) -> se
return dependent_files


def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None:
"""Delete all nodes and edges originating from the given files.

Skip phantom nodes (filename=""). Deletes ALL edge types in Phase 1,
then nodes in subsequent phases. Route/Client/Producer nodes use
DETACH DELETE as a safety net for any edges missed in Phase 1.

Edges are deleted in batch across all filenames first to avoid LadybugDB
"has connected edges" errors when edges from one file point to nodes
in another file within the same scope.
def _delete_file_scope(
conn: ladybug.Connection,
changed_files: set[str],
dependent_files: set[str],
) -> None:
"""Delete nodes and edges for a scope split into changed vs dependent files.

``changed_files`` are files whose content actually changed (added/modified/
removed): their Symbol nodes are deleted (and re-created by ``_scoped_write``).
``dependent_files`` are files pulled in only to re-resolve their OUTGOING
edges against the changed nodes; their node definitions did not change, so
their nodes are deliberately PRESERVED (they re-MERGE in place on the same
deterministic ``symbol_id``). Skipping phantom nodes (filename="").

Why dependents are preserved (issue #305): the orchestrator computes
dependents from the *changed* nodes only, so a dependent file's node can
have an incoming CALLS edge from an out-of-scope caller. The ``source_file``
on every Symbol->Symbol edge is the CALLER's file (pinned by
``test_source_file_value_matches_symbol_filename``), so Phase 1 below only
deletes edges ORIGINATING in scope; incoming edges from out-of-scope callers
survive. If we then tried to DELETE the dependent node, LadybugDB rejects it
("Node ... has connected edges in table CALLS in the bwd direction, ...
Please delete the edges first or try DETACH DELETE") and the rebuild falls
back to a full rebuild. A naive fix (DETACH DELETE on dependents, or an
extra incoming-edge pass) would silence the crash but permanently drop those
out-of-scope edges, corrupting the graph. Preserving dependent nodes keeps
both the nodes and their incoming edges intact.

Phase 1 deletes ALL edge types across the whole scope (changed + dependent)
first to avoid LadybugDB "has connected edges" errors when edges from one
file point to nodes in another file within the same scope. Route/Client/
Producer nodes use DETACH DELETE as a safety net for any edges missed in
Phase 1.
"""
filename_list = list(filenames)

# Phase 1: Delete ALL edges from ALL scope files at once.
# This avoids ordering issues where file A has an edge from file B
# pointing into it; if we delete A's nodes before B's edges, LadybugDB
# raises "has connected edges" errors.
scope_files = changed_files | dependent_files
scope_list = list(scope_files)
changed_list = list(changed_files)

# Phase 1: Delete ALL edges ORIGINATING from any scope file (changed +
# dependent). Because `source_file` is the caller's file, this deletes edges
# whose source is in scope (including dependents' outgoing edges to changed
# nodes) while intentionally leaving incoming edges from out-of-scope callers
# intact — those must survive so the dependent nodes below can be preserved.
# This list is a superset of `_SYMBOL_TO_SYMBOL_EDGE_TYPES` (it also covers
# Symbol->Route/Client/Producer/UCS and Client/Producer->Route edges); keep
# both lists in sync with the schema.
edge_tables = [
"EXTENDS", "IMPLEMENTS", "INJECTS", "CALLS", "DECLARES", "OVERRIDES",
"UNRESOLVED_AT", "EXPOSES", "DECLARES_CLIENT", "DECLARES_PRODUCER",
Expand All @@ -640,7 +679,7 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None:
WHERE e.source_file IN $filenames
DELETE e
"""
conn.execute(query, {"filenames": filename_list})
conn.execute(query, {"filenames": scope_list})

# Phase 2: Collect all Symbol node IDs for UnresolvedCallSite cleanup.
symbol_ids: list[str] = []
Expand All @@ -649,12 +688,15 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None:
WHERE s.filename IN $filenames
RETURN s.id
"""
result = conn.execute(symbol_ids_query, {"filenames": filename_list})
result = conn.execute(symbol_ids_query, {"filenames": scope_list})
while result.has_next():
row = result.get_next()
symbol_ids.append(row[0])

# Delete UnresolvedCallSite nodes whose caller_id is in the collected set
# Delete UnresolvedCallSite nodes whose caller_id is in the collected set.
# These are children of scope symbols (including preserved dependents);
# deleting them is safe because every scope file — dependents included — is
# reprocessed and re-emits its UnresolvedCallSite nodes in `_scoped_write`.
if symbol_ids:
unresolved_query = """
MATCH (u:UnresolvedCallSite)
Expand All @@ -663,20 +705,30 @@ def _delete_file_scope(conn: ladybug.Connection, filenames: set[str]) -> None:
"""
conn.execute(unresolved_query, {"symbol_ids": symbol_ids})

# Phase 3: Delete Symbol nodes.
# Phase 3: Delete Symbol nodes ONLY for changed files (not dependents).
# Dependent-file nodes are deliberately PRESERVED so their incoming edges
# from out-of-scope callers survive; the dependents are re-MERGEd in place
# by `_scoped_write` on the same deterministic node id. A changed node's
# real incoming edges all come from dependent files (callers pulled into
# scope by `_find_dependents`, which walks every type in
# `_SYMBOL_TO_SYMBOL_EDGE_TYPES`), so Phase 1 already removed them and the
# dependents re-emit them when reprocessed. DETACH DELETE is only a safety
# net for the rare surviving edge whose source was NOT pulled into scope
# (e.g. a phantom caller with filename="", which `_find_dependents` skips);
# such an edge is stale once the node is recreated, so dropping it is fine.
delete_symbols_query = """
MATCH (s:Symbol)
WHERE s.filename IN $filenames
DELETE s
DETACH DELETE s
"""
conn.execute(delete_symbols_query, {"filenames": filename_list})
conn.execute(delete_symbols_query, {"filenames": changed_list})

# Phase 4: Delete Route, Client, Producer nodes.
# Use DETACH DELETE as a safety net in case any edges were missed in Phase 1.
for label in ["Route", "Client", "Producer"]:
conn.execute(
f"MATCH (n:{label}) WHERE n.filename IN $filenames DETACH DELETE n",
{"filenames": filename_list},
{"filenames": scope_list},
)


Expand Down Expand Up @@ -3524,7 +3576,7 @@ def incremental_rebuild(
# Step 4: Scoped deletion
if verbose:
_verbose_stderr_line("[increment] deleting outdated nodes and edges")
_delete_file_scope(conn, scope_files)
_delete_file_scope(conn, changed_files, dependent_files)

# Force deletion to be applied by running a dummy query
conn.execute("MATCH (s:Symbol) RETURN count(*)")
Expand Down
205 changes: 203 additions & 2 deletions tests/test_incremental_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,7 +738,11 @@ def test_find_dependents_returns_incoming_edge_sources(self, tmp_path: Path) ->
conn.close()

def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None:
"""Delete scope for one file, verify other files' nodes/edges untouched."""
"""Delete scope for one file (changed), verify other files' nodes/edges untouched.

Uses the new (changed_files, dependent_files) signature with an empty
dependent set so behavior matches the legacy single-file case.
"""
from build_ast_graph import _delete_file_scope

source_root = tmp_path / "src"
Expand All @@ -761,7 +765,7 @@ def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None:
conn.execute("MATCH (s:Symbol) RETURN count(*)")

# Delete only A.java's scope
_delete_file_scope(conn, {"A.java"})
_delete_file_scope(conn, changed_files={"A.java"}, dependent_files=set())

# Verify A's nodes are gone but B's remain
a_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.A' RETURN count(*)")
Expand All @@ -778,3 +782,200 @@ def test_delete_file_scope_removes_only_matching(self, tmp_path: Path) -> None:
assert b_count > 0

conn.close()

def test_delete_file_scope_preserves_dependent_nodes(self, tmp_path: Path) -> None:
"""Direct unit test for the #305 fix.

Build a C -> B -> A call chain (only B is a dependent of changed A; C is
out of scope because it has no edge into A). Then call
_delete_file_scope(changed_files={A}, dependent_files={B}) and assert:
no exception, A's node is gone, B and C nodes are preserved, and the
out-of-scope C->B CALLS edge survives.
"""
from build_ast_graph import _delete_file_scope, write_ladybug

source_root = tmp_path / "src"
source_root.mkdir()
ladybug_path = tmp_path / "code_graph.lbug"

# C calls B.b; B calls A.a. (pass1-3 needed to produce CALLS edges.)
(source_root / "A.java").write_text(
"package pkg; class A { void a() {} }", encoding="utf-8"
)
(source_root / "B.java").write_text(
"package pkg; class B {\n"
" void b() {\n"
" A a = new A();\n"
" a.a();\n"
" }\n"
"}",
encoding="utf-8",
)
(source_root / "C.java").write_text(
"package pkg; class C {\n"
" void c() {\n"
" B b = new B();\n"
" b.b();\n"
" }\n"
"}",
encoding="utf-8",
)

tables = GraphTables()
asts = pass1_parse(source_root, tables, verbose=False)
pass2_edges(tables, asts, verbose=False)
from build_ast_graph import pass3_calls
pass3_calls(tables, asts, verbose=False)
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False)

db = ladybug.Database(str(ladybug_path))
conn = ladybug.Connection(db)

# Sanity: the C->B CALLS edge must exist for this test to be meaningful.
cb_result = conn.execute(
"MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) "
"RETURN count(*)"
)
cb_count = 0
if cb_result.has_next():
cb_count = cb_result.get_next()[0]
assert cb_count > 0, "seeded graph must contain a C->B CALLS edge"

# A is changed; B is its dependent; C is out of scope.
_delete_file_scope(
conn, changed_files={"A.java"}, dependent_files={"B.java"}
)

# A's node must be gone.
a_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.A' RETURN count(*)")
a_count = 0
if a_result.has_next():
a_count = a_result.get_next()[0]
assert a_count == 0

# B and C nodes must survive.
b_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.B' RETURN count(*)")
c_result = conn.execute("MATCH (s:Symbol) WHERE s.fqn = 'pkg.C' RETURN count(*)")
b_count = 0
c_count = 0
if b_result.has_next():
b_count = b_result.get_next()[0]
if c_result.has_next():
c_count = c_result.get_next()[0]
assert b_count > 0
assert c_count > 0

# The out-of-scope C->B CALLS edge must survive.
cb_after_result = conn.execute(
"MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) "
"RETURN count(*)"
)
cb_after_count = 0
if cb_after_result.has_next():
cb_after_count = cb_after_result.get_next()[0]
assert cb_after_count > 0

conn.close()

def test_incremental_preserves_incoming_edges_to_dependent(self, tmp_path: Path) -> None:
"""End-to-end repro for GitHub issue #305.

Topology C -> B -> A (C.c calls B.b, B.b calls A.a). Change A.java and run
incremental_rebuild. On the unfixed code the dependent B is pulled into
scope but its out-of-scope caller C is not; the surviving C->B CALLS edge
crashes the dependent node delete and the rebuild falls back to full.

After the fix: mode is "incremental", B's node survives, and the C->B
CALLS edge is preserved.
"""
from build_ast_graph import incremental_rebuild, write_ladybug

source_root = tmp_path / "src"
source_root.mkdir()
index_dir = tmp_path / "index"
index_dir.mkdir()
ladybug_path = index_dir / "code_graph.lbug"

(source_root / "A.java").write_text(
"package pkg; class A { void a() {} }", encoding="utf-8"
)
(source_root / "B.java").write_text(
"package pkg; class B {\n"
" void b() {\n"
" A a = new A();\n"
" a.a();\n"
" }\n"
"}",
encoding="utf-8",
)
(source_root / "C.java").write_text(
"package pkg; class C {\n"
" void c() {\n"
" B b = new B();\n"
" b.b();\n"
" }\n"
"}",
encoding="utf-8",
)

# Initial build (pass1-3 for CALLS edges).
tables = GraphTables()
asts = pass1_parse(source_root, tables, verbose=False)
pass2_edges(tables, asts, verbose=False)
from build_ast_graph import pass3_calls
pass3_calls(tables, asts, verbose=False)
write_ladybug(ladybug_path, tables, source_root=source_root, verbose=False)

# Sanity: C->B CALLS edge must exist in the seeded graph.
db = ladybug.Database(str(ladybug_path))
conn = ladybug.Connection(db)
cb_result = conn.execute(
"MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) "
"RETURN count(*)"
)
cb_count = 0
if cb_result.has_next():
cb_count = cb_result.get_next()[0]
assert cb_count > 0, "seeded graph must contain a C->B CALLS edge"
conn.close()

# Initialize hash tracker for all files.
tracker = FileHashTracker(index_dir)
ignore = LayeredIgnore(source_root, use_gitignore=False, builtin_patterns=[])
tracker.detect_changes(source_root, ignore)
for rel_path in ["A.java", "B.java", "C.java"]:
tracker.update({rel_path}, source_root)
tracker.save()

# Change A.java.
(source_root / "A.java").write_text(
"package pkg; class A { void a() {} void a2() {} }", encoding="utf-8"
)

result = incremental_rebuild(source_root, ladybug_path, verbose=False)

assert result.mode == "incremental", (
f"expected incremental, got {result.mode!r} (likely the bwd-edge crash)"
)

# B's node and the C->B CALLS edge must survive.
db = ladybug.Database(str(ladybug_path))
conn = ladybug.Connection(db)
b_result = conn.execute(
"MATCH (s:Symbol) WHERE s.fqn = 'pkg.B' RETURN count(*)"
)
b_count = 0
if b_result.has_next():
b_count = b_result.get_next()[0]
assert b_count > 0, "dependent node B must be preserved"

cb_after_result = conn.execute(
"MATCH (src:Symbol {fqn: 'pkg.C#c()'})-[e:CALLS]->(dst:Symbol {fqn: 'pkg.B#b()'}) "
"RETURN count(*)"
)
cb_after_count = 0
if cb_after_result.has_next():
cb_after_count = cb_after_result.get_next()[0]
assert cb_after_count > 0, "out-of-scope C->B CALLS edge must be preserved"

conn.close()
Loading