From c62bcbed5fb6656489c33e6905e8e8731c7a6024 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 23 Jun 2026 23:07:36 +0300 Subject: [PATCH 1/3] feat(graph): edge confidence audit trail + architecture analytics Phase 1 of porting graphify's best ideas into codebase-index. - edges.confidence (extracted/inferred/ambiguous), SCHEMA_VERSION 2->3. Derived from how an edge resolved (exact / import-suffix heuristic / unresolved); never LLM-guessed. Surfaced in refs + impact for honesty. - graph/analysis.py: zero-dep, deterministic communities (Louvain local-move), god nodes, surprising bridges, auto-labels, suggested questions. Cached in meta['graph_analysis'] at build time. - Tests for confidence + analysis; refs/impact goldens regenerated. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 17 + docs/SCHEMA.md | 8 +- src/codebase_index/graph/analysis.py | 414 ++++++++++++++++++++ src/codebase_index/graph/builder.py | 22 +- src/codebase_index/graph/expand.py | 9 +- src/codebase_index/models.py | 4 + src/codebase_index/output/markdown.py | 21 +- src/codebase_index/retrieval/searchers.py | 8 +- src/codebase_index/storage/db.py | 3 +- src/codebase_index/storage/repo.py | 66 +++- src/codebase_index/storage/schema.sql | 8 +- tests/golden/impact_user_model.json | 2 + tests/golden/mcp_find_refs.json | 3 + tests/golden/mcp_impact_of.json | 2 + tests/golden/refs_refresh_access_token.json | 3 + tests/test_analysis.py | 177 +++++++++ tests/test_graph.py | 16 + 17 files changed, 759 insertions(+), 24 deletions(-) create mode 100644 src/codebase_index/graph/analysis.py create mode 100644 tests/test_analysis.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 347ee4e..b50d274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,23 @@ All notable changes to this project are documented here. 
The format is based on ## [Unreleased] +### Added — graph foundation: edge confidence + architecture analytics (requires a one-time reindex) +- **Edge confidence audit trail.** Every graph edge now carries a `confidence`: + `extracted` (exact — a same-file symbol or repo-unique name), `inferred` (a + heuristic resolved it, e.g. an import path-suffix match), or `ambiguous` (a named + target we could not pin to a unique node). `refs` and `impact` surface it so an + empty or short answer over `ambiguous`/`inferred` edges reads as inconclusive, + not as proof. Confidence is derived from *how* an edge resolved — never guessed by + an LLM; the index stays fully local. **Bumps `SCHEMA_VERSION` 2 → 3.** Older + indexes stay readable; `index`/`update` detect the mismatch and rebuild. +- **Architecture analytics (`graph/analysis.py`), zero new dependencies.** A pure, + deterministic pass over the resolved edge graph computes communities (greedy + modularity / Louvain local-move — does not collapse cliques joined by one bridge), + god nodes (most-connected symbols/files), surprising connections (edges bridging + weakly-linked communities), auto-labelled modules, and suggested questions. The + summary is cached in `meta['graph_analysis']` at build time for instant reads. + (Surfaced via the `architecture` command and HTML export in following changes.) 
+ ### Changed — retrieval ranking & fusion (requires a one-time reindex) - **RRF fusion rescaled and re-keyed.** Fused scores were ~`w/k` (≈0.017), an order of magnitude below the reranker's bounded bonuses, so rerank silently became the diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md index 21f2735..490dc85 100644 --- a/docs/SCHEMA.md +++ b/docs/SCHEMA.md @@ -75,7 +75,13 @@ CREATE TABLE edges ( dst_name TEXT, -- raw target text (for unresolved edges) file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, line INTEGER, - resolved INTEGER NOT NULL DEFAULT 0 + resolved INTEGER NOT NULL DEFAULT 0, + -- Honesty audit trail: how the edge's target was determined. + -- extracted = exact (same-file symbol, or a repo-unique name) + -- inferred = a heuristic resolved it (import path-suffix match) + -- ambiguous = a named target we could not pin to a unique node + -- Set by the global graph pass; never inferred by an LLM (the index is local). + confidence TEXT NOT NULL DEFAULT 'extracted' ); CREATE INDEX idx_edges_src ON edges(src_kind, src_id); CREATE INDEX idx_edges_dst ON edges(dst_kind, dst_id); diff --git a/src/codebase_index/graph/analysis.py b/src/codebase_index/graph/analysis.py new file mode 100644 index 0000000..a23c934 --- /dev/null +++ b/src/codebase_index/graph/analysis.py @@ -0,0 +1,414 @@ +"""Architecture analytics over the resolved edge graph — zero external deps. + +This is the codebase-index take on graphify's community detection / god nodes / +surprising connections, implemented in pure, deterministic Python so the core +install stays dependency-free and the results are stable across runs (which +matters for the golden-snapshot tests and CI). + +What it computes from the in-memory adjacency of resolved edges: + + * communities - greedy modularity (Louvain local-move) groups tightly-connected nodes into + "modules". Deterministic: nodes are visited in a fixed key + order and ties break to the smallest community id, so the same graph + always yields the same partition. 
+ * god nodes - the most-connected nodes (weighted degree). These are the + symbols/files most of the codebase leans on. + * surprising - edges that bridge two otherwise weakly-connected communities. + The cross-module links you would not think to look for. + * questions - template-generated starting questions seeded from the god + nodes and the bridges, mirroring graphify's GRAPH_REPORT. + +The summary is cached in meta['graph_analysis'] by refresh_analysis() at build +time; the `architecture` command and HTML export read it back instantly. +""" + +from __future__ import annotations + +import json +import sqlite3 +from collections import Counter, defaultdict +from typing import Optional + +from ..storage import repo + +# How many items to keep in the cached summary. Bounded so the meta JSON stays +# small even on very large repos. +MAX_GOD_NODES = 20 +MAX_SURPRISING = 12 +MAX_QUESTIONS = 8 +TOP_NODES_PER_COMMUNITY = 5 +MAX_COMMUNITIES_IN_SUMMARY = 40 +# A community smaller than this is noise for reporting (isolated/leaf nodes). +MIN_REPORTED_COMMUNITY = 2 +# A pair of communities joined by at most this many edges is a "bridge". +BRIDGE_MAX_EDGES = 2 +# Cap on local-move passes; the partition almost always settles in 2-4. +_LOCAL_MOVE_PASSES = 20 + +ANALYSIS_META_KEY = "graph_analysis" + +Node = tuple[str, int] # (kind, id) + + +# --------------------------------------------------------------------------- +# Graph construction +# --------------------------------------------------------------------------- + +def _node_key(kind: str, node_id: int) -> str: + return f"{kind}:{node_id}" + + +def build_adjacency( + edges: list[sqlite3.Row], +) -> tuple[dict[Node, Counter], dict[tuple[Node, Node], int]]: + """Undirected weighted adjacency + per-edge multiplicity, from resolved edges. + + Self-loops are dropped (they distort degree and never bridge communities). 
+ """ + adj: dict[Node, Counter] = defaultdict(Counter) + edge_weight: dict[tuple[Node, Node], int] = defaultdict(int) + for e in edges: + src: Node = (e["src_kind"], int(e["src_id"])) + dst: Node = (e["dst_kind"], int(e["dst_id"])) + if src == dst: + continue + adj[src][dst] += 1 + adj[dst][src] += 1 + edge_weight[_canonical_pair(src, dst)] += 1 + return adj, edge_weight + + +def _canonical_pair(a: Node, b: Node) -> tuple[Node, Node]: + return (a, b) if a <= b else (b, a) + + +def weighted_degree(adj: dict[Node, Counter]) -> dict[Node, int]: + return {node: sum(neighbors.values()) for node, neighbors in adj.items()} + + +# --------------------------------------------------------------------------- +# Community detection — deterministic greedy modularity (Louvain local-move) +# --------------------------------------------------------------------------- + +def detect_communities(adj: dict[Node, Counter]) -> dict[Node, int]: + """Partition nodes into communities by greedy modularity. Returns {node: id}. + + This is the local-moving phase of the Louvain method, made deterministic: + every node starts alone, then in a fixed key order each node moves to the + neighbouring community that yields the largest modularity gain (ties break to + the smallest community id). Passes repeat until no node moves. Unlike label + propagation it does not collapse two cliques joined by a single bridge — the + bridge's gain cannot beat the dense intra-clique structure. Labels are + renumbered to dense, size-ranked ids so community 0 is always the largest. + """ + nodes = sorted(adj.keys()) + if not nodes: + return {} + + deg = weighted_degree(adj) + two_m = sum(deg.values()) # = 2 * total edge weight + if two_m == 0: + return _renumber_by_size({node: idx for idx, node in enumerate(nodes)}) + + comm: dict[Node, int] = {node: idx for idx, node in enumerate(nodes)} + # Σ_tot per community: total weighted degree of its members. 
+ sigma_tot: dict[int, int] = {idx: deg[node] for idx, node in enumerate(nodes)} + + for _ in range(_LOCAL_MOVE_PASSES): + moved = False + for node in nodes: + ki = deg[node] + ci = comm[node] + # Detach node from its current community. + sigma_tot[ci] -= ki + + # Weight from node into each neighbouring community. + links: Counter = Counter() + for neighbor, w in adj[node].items(): + if neighbor != node: + links[comm[neighbor]] += w + + # Pick the community maximising w_in - Σ_tot * k_i / (2m). + # Baseline = rejoining its current community (gain 0 if it was alone there). + best_c = ci + best_gain = links.get(ci, 0) - sigma_tot[ci] * ki / two_m + for c, w_in in sorted(links.items()): + gain = w_in - sigma_tot[c] * ki / two_m + if gain > best_gain + 1e-12: + best_gain, best_c = gain, c + + comm[node] = best_c + sigma_tot[best_c] += ki + if best_c != ci: + moved = True + if not moved: + break + + return _renumber_by_size(comm) + + +def _renumber_by_size(label: dict[Node, int]) -> dict[Node, int]: + """Renumber raw labels to dense ids ordered by community size (desc), then by + smallest member key — so the mapping is stable run to run.""" + members: dict[int, list[Node]] = defaultdict(list) + for node, lbl in label.items(): + members[lbl].append(node) + order = sorted(members, key=lambda lbl: (-len(members[lbl]), min(members[lbl]))) + remap = {old: new for new, old in enumerate(order)} + return {node: remap[lbl] for node, lbl in label.items()} + + +def modularity(adj: dict[Node, Counter], communities: dict[Node, int]) -> float: + """Newman modularity Q of the partition — a quality score in roughly [-0.5, 1]. + + Higher means the communities capture more edge density than chance. Reported + so the user can judge how meaningful the module split is. 
+ """ + m2 = sum(sum(neighbors.values()) for neighbors in adj.values()) # = 2 * |E| + if m2 == 0: + return 0.0 + deg = weighted_degree(adj) + q = 0.0 + for node, neighbors in adj.items(): + ci = communities[node] + for neighbor, weight in neighbors.items(): + if communities[neighbor] == ci: + q += weight - deg[node] * deg[neighbor] / m2 + return round(q / m2, 4) + + +# --------------------------------------------------------------------------- +# Node labelling +# --------------------------------------------------------------------------- + +def _node_index(conn: sqlite3.Connection) -> dict[Node, dict]: + """(kind, id) -> display metadata {kind, name, path, degree fields}.""" + rows = repo.all_graph_nodes(conn) + index: dict[Node, dict] = {} + for f in rows["file"]: + index[("file", int(f["id"]))] = { + "kind": "file", + "name": f["path"].rsplit("/", 1)[-1], + "path": f["path"], + } + for s in rows["symbol"]: + index[("symbol", int(s["id"]))] = { + "kind": "symbol", + "name": s["name"], + "symbol_kind": s["kind"], + "path": s["path"], + "in_degree": int(s["in_degree"]), + "out_degree": int(s["out_degree"]), + } + return index + + +def _dir_of(path: str) -> str: + return path.rsplit("/", 1)[0] if "/" in path else "(root)" + + +def label_community(members: list[Node], node_index: dict[Node, dict]) -> str: + """Name a community by the directory most of its nodes live in. + + A 2-5 word, plain-language module name is what graphify asks an LLM for; here + we derive it deterministically from the dominant source directory, which for + code is a strong proxy for "what this module is". + """ + dirs: Counter = Counter() + for node in members: + meta = node_index.get(node) + if meta and meta.get("path"): + dirs[_dir_of(meta["path"])] += 1 + if not dirs: + return "module" + # Most common dir; tie -> shortest then lexicographically smallest (stable). 
+ top = min(dirs.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0])) + return top[0] + + +# --------------------------------------------------------------------------- +# God nodes / surprising connections / questions +# --------------------------------------------------------------------------- + +def god_nodes( + adj: dict[Node, Counter], + communities: dict[Node, int], + node_index: dict[Node, dict], + *, + limit: int = MAX_GOD_NODES, +) -> list[dict]: + """Most-connected nodes by weighted degree (the load-bearing ones).""" + deg = weighted_degree(adj) + ranked = sorted(deg, key=lambda n: (-deg[n], _node_key(*n))) + out: list[dict] = [] + for node in ranked[:limit]: + meta = node_index.get(node) + if meta is None: + continue + out.append( + { + "kind": meta["kind"], + "name": meta["name"], + "path": meta.get("path"), + "degree": deg[node], + "community": communities.get(node, -1), + } + ) + return out + + +def surprising_connections( + edge_weight: dict[tuple[Node, Node], int], + communities: dict[Node, int], + node_index: dict[Node, dict], + *, + limit: int = MAX_SURPRISING, +) -> list[dict]: + """Edges that bridge two communities barely connected to each other. + + For each unordered community pair we count how many edges cross between them; + a pair joined by only a handful of edges is a surprising structural link. We + surface the actual endpoint pair for each such bridge. + """ + pair_edges: dict[tuple[int, int], list[tuple[Node, Node]]] = defaultdict(list) + for (a, b), _w in edge_weight.items(): + ca, cb = communities.get(a, -1), communities.get(b, -1) + if ca == cb or ca < 0 or cb < 0: + continue + key = (ca, cb) if ca < cb else (cb, ca) + pair_edges[key].append((a, b)) + + bridges = [ + (pair, endpoints) + for pair, endpoints in pair_edges.items() + if len(endpoints) <= BRIDGE_MAX_EDGES + ] + # Rarest bridges first (a single edge between modules is the most surprising), + # then by community-pair id for stability. 
+ bridges.sort(key=lambda item: (len(item[1]), item[0])) + + out: list[dict] = [] + for (ca, cb), endpoints in bridges[:limit]: + a, b = sorted(endpoints)[0] + ma, mb = node_index.get(a), node_index.get(b) + if ma is None or mb is None: + continue + out.append( + { + "from": {"kind": ma["kind"], "name": ma["name"], "path": ma.get("path")}, + "to": {"kind": mb["kind"], "name": mb["name"], "path": mb.get("path")}, + "from_community": ca, + "to_community": cb, + "edge_count": len(endpoints), + } + ) + return out + + +def suggest_questions( + gods: list[dict], + surprising: list[dict], + community_labels: dict[int, str], + *, + limit: int = MAX_QUESTIONS, +) -> list[str]: + """Starter questions seeded from the structure, like graphify's report.""" + questions: list[str] = [] + for g in gods[:3]: + if g["kind"] == "symbol": + questions.append(f"How does `{g['name']}` work?") + questions.append(f"What breaks if `{g['name']}` changes?") + else: + questions.append(f"What is the role of `{g['name']}` in the architecture?") + for s in surprising[:3]: + la = community_labels.get(s["from_community"], f"community {s['from_community']}") + lb = community_labels.get(s["to_community"], f"community {s['to_community']}") + if la != lb: + questions.append(f"How is `{la}` connected to `{lb}`?") + # De-dup, preserve order. 
+ seen: set[str] = set() + deduped: list[str] = [] + for q in questions: + if q not in seen: + seen.add(q) + deduped.append(q) + return deduped[:limit] + + +# --------------------------------------------------------------------------- +# Top-level entry points +# --------------------------------------------------------------------------- + +def analyze(conn: sqlite3.Connection) -> dict: + """Compute the full architecture-analytics summary (does not persist it).""" + edges = repo.all_resolved_edges(conn) + adj, edge_weight = build_adjacency(edges) + node_index = _node_index(conn) + + communities = detect_communities(adj) + members: dict[int, list[Node]] = defaultdict(list) + for node, cid in communities.items(): + members[cid].append(node) + + community_labels = {cid: label_community(nodes, node_index) for cid, nodes in members.items()} + deg = weighted_degree(adj) + + community_summaries: list[dict] = [] + reported = sorted(members, key=lambda cid: (-len(members[cid]), cid)) + for cid in reported: + nodes = members[cid] + if len(nodes) < MIN_REPORTED_COMMUNITY: + continue + top = sorted(nodes, key=lambda n: (-deg.get(n, 0), _node_key(*n)))[:TOP_NODES_PER_COMMUNITY] + community_summaries.append( + { + "id": cid, + "label": community_labels[cid], + "size": len(nodes), + "top_nodes": [ + { + "kind": node_index[n]["kind"], + "name": node_index[n]["name"], + "path": node_index[n].get("path"), + "degree": deg.get(n, 0), + } + for n in top + if n in node_index + ], + } + ) + if len(community_summaries) >= MAX_COMMUNITIES_IN_SUMMARY: + break + + gods = god_nodes(adj, communities, node_index) + surprising = surprising_connections(edge_weight, communities, node_index) + questions = suggest_questions(gods, surprising, community_labels) + + return { + "node_count": len(adj), + "edge_count": sum(edge_weight.values()), + "community_count": sum(1 for nodes in members.values() if len(nodes) >= MIN_REPORTED_COMMUNITY), + "modularity": modularity(adj, communities), + "communities": 
community_summaries, + "god_nodes": gods, + "surprising": surprising, + "questions": questions, + } + + +def refresh_analysis(conn: sqlite3.Connection) -> dict: + """Compute and cache the analysis summary into meta['graph_analysis'].""" + summary = analyze(conn) + repo.set_meta(conn, ANALYSIS_META_KEY, json.dumps(summary, ensure_ascii=False)) + return summary + + +def load_analysis(conn: sqlite3.Connection) -> Optional[dict]: + """Read the cached analysis summary, or None if the build never produced one.""" + raw = repo.get_meta(conn, ANALYSIS_META_KEY) + if not raw: + return None + try: + return json.loads(raw) + except (ValueError, TypeError): + return None diff --git a/src/codebase_index/graph/builder.py b/src/codebase_index/graph/builder.py index f2e342b..4547883 100644 --- a/src/codebase_index/graph/builder.py +++ b/src/codebase_index/graph/builder.py @@ -26,7 +26,20 @@ def build_graph(conn: sqlite3.Connection) -> dict[str, int]: resolved = resolve_edges(conn) repo.recompute_degrees(conn) + # Everything still unresolved that names a target is, by definition, a target we + # could not pin to a unique node — record it as 'ambiguous' for the honesty trail. + repo.mark_ambiguous_edges(conn) total_unresolved = len(repo.unresolved_edges(conn)) + # Architecture analytics (communities / god nodes / surprising bridges) are a + # derived view of the graph. Compute once per build and cache the JSON in meta so + # the `architecture` command and the HTML export read it instantly. Never let an + # analysis failure fail the build — the graph itself is already written. + try: + from . 
import analysis + + analysis.refresh_analysis(conn) + except Exception: # pragma: no cover - defensive; analytics are best-effort + pass return {"resolved": resolved, "unresolved": total_unresolved} @@ -38,17 +51,20 @@ def resolve_edges(conn: sqlite3.Connection) -> int: unique_symbols = repo.unique_symbol_ids_by_name(conn) suffix_map = _path_suffix_map(repo.all_file_ids_with_paths(conn)) - resolutions: list[tuple[str, int, int]] = [] + # (dst_kind, dst_id, edge_id, confidence). A repo-unique symbol name is an exact + # hit -> 'extracted'; an import resolved only by path-suffix matching is a best- + # effort heuristic -> 'inferred'. + resolutions: list[tuple[str, int, int, str]] = [] for edge in edges: name = edge["dst_name"] if edge["edge_type"] == "import": file_id = _module_to_file_id(suffix_map, name, lang=edge["lang"]) if file_id is not None: - resolutions.append(("file", file_id, edge["id"])) + resolutions.append(("file", file_id, edge["id"], "inferred")) elif edge["edge_type"] in _SYMBOL_EDGE_TYPES: sym_id = unique_symbols.get(name) if sym_id is not None: - resolutions.append(("symbol", sym_id, edge["id"])) + resolutions.append(("symbol", sym_id, edge["id"], "extracted")) repo.resolve_edges_bulk(conn, resolutions) return len(resolutions) diff --git a/src/codebase_index/graph/expand.py b/src/codebase_index/graph/expand.py index deebed1..02ee989 100644 --- a/src/codebase_index/graph/expand.py +++ b/src/codebase_index/graph/expand.py @@ -51,14 +51,14 @@ def _seed_nodes(conn: sqlite3.Connection, target: str) -> list[tuple[str, int]]: def _neighbors(conn, kind, node_id, direction): - """Yield (next_kind, next_id, edge_type) for the requested direction(s).""" + """Yield (next_kind, next_id, edge_type, confidence) for the requested direction(s).""" if direction in ("up", "both"): for e in repo.incoming_edges(conn, kind, node_id): - yield e["src_kind"], int(e["src_id"]), e["edge_type"] + yield e["src_kind"], int(e["src_id"]), e["edge_type"], e["confidence"] if 
direction in ("down", "both"): for e in repo.outgoing_edges(conn, kind, node_id): if e["dst_id"] is not None: - yield e["dst_kind"], int(e["dst_id"]), e["edge_type"] + yield e["dst_kind"], int(e["dst_id"]), e["edge_type"], e["confidence"] def _node_meta(conn, kind, node_id) -> Optional[ImpactNode]: @@ -92,7 +92,7 @@ def walk_impact( kind, node_id, dist = queue.popleft() if dist >= depth: continue - for nk, nid, etype in _neighbors(conn, kind, node_id, direction): + for nk, nid, etype, conf in _neighbors(conn, kind, node_id, direction): if (nk, nid) in visited: continue visited.add((nk, nid)) @@ -101,6 +101,7 @@ def walk_impact( continue meta.distance = dist + 1 meta.via_edge = etype + meta.via_confidence = conf out.append(meta) queue.append((nk, nid, dist + 1)) return out diff --git a/src/codebase_index/models.py b/src/codebase_index/models.py index b35d959..b87ec79 100644 --- a/src/codebase_index/models.py +++ b/src/codebase_index/models.py @@ -113,6 +113,9 @@ class RefSite(BaseModel): path: str line: int kind: str + # Audit trail (see edges.confidence): 'extracted' = exact match, 'inferred' = + # heuristic, 'ambiguous' = unresolved/non-unique. Defaults keep older callers valid. + confidence: str = "extracted" class RefsResponse(BaseModel): @@ -129,6 +132,7 @@ class ImpactNode(BaseModel): line_start: Optional[int] = None distance: int # BFS hops from the target (1 = direct) via_edge: Optional[str] = None # edge_type that linked it (import|call|extends|...) 
+ via_confidence: Optional[str] = None # confidence of the linking edge (audit trail) class ImpactResponse(BaseModel): diff --git a/src/codebase_index/output/markdown.py b/src/codebase_index/output/markdown.py index afd6a76..f7b4151 100644 --- a/src/codebase_index/output/markdown.py +++ b/src/codebase_index/output/markdown.py @@ -130,6 +130,15 @@ def _coverage_line(coverage) -> Optional[str]: return None +# Audit-trail glyphs: an exact edge needs no annotation; inferred/ambiguous ones +# warn the reader that the link is a heuristic or could not be pinned down. +_CONF_MARK = {"extracted": "", "inferred": "~ inferred", "ambiguous": "? ambiguous"} + + +def _conf_mark(confidence: Optional[str]) -> str: + return _CONF_MARK.get(confidence or "extracted", confidence or "") + + def render_refs(resp: RefsResponse) -> str: lines = [_header(resp.query, resp.index.exists, resp.index.stale)] lines.append("") @@ -140,10 +149,12 @@ def render_refs(resp: RefsResponse) -> str: lines.append(note) return "\n".join(lines).rstrip() + "\n" - lines.append("| kind | path | line |") - lines.append("|------|------|------|") + lines.append("| kind | path | line | confidence |") + lines.append("|------|------|------|------------|") for site in resp.sites: - lines.append(f"| {site.kind} | `{site.path}` | {site.line} |") + lines.append( + f"| {site.kind} | `{site.path}` | {site.line} | {_conf_mark(site.confidence) or 'exact'} |" + ) if note: lines.append(note) return "\n".join(lines).rstrip() + "\n" @@ -171,7 +182,9 @@ def render_impact(resp: ImpactResponse) -> str: for n in sorted(resp.nodes, key=lambda x: (x.distance, x.path, x.line_start or 0)): loc = f"{n.path}:{n.line_start}" if n.line_start else n.path node_name = f"`{n.name}`" if n.name else "—" - lines.append(f"| {n.distance} | {n.via_edge or ''} | {n.kind} | {node_name} | `{loc}` |") + mark = _conf_mark(n.via_confidence) + via = f"{n.via_edge or ''} {mark}".strip() + lines.append(f"| {n.distance} | {via} | {n.kind} | {node_name} | 
`{loc}` |") if note: lines.append(note) return "\n".join(lines).rstrip() + "\n" diff --git a/src/codebase_index/retrieval/searchers.py b/src/codebase_index/retrieval/searchers.py index 138bf80..d82aca5 100644 --- a/src/codebase_index/retrieval/searchers.py +++ b/src/codebase_index/retrieval/searchers.py @@ -234,11 +234,17 @@ def symbol_lookup( def refs_lookup(conn: sqlite3.Connection, name: str, *, kind: str) -> RefsResponse: defs = repo.symbols_by_name(conn, name, exact=True) sites = [ - RefSite(path=row["path"], line=row["line"], kind="call") + RefSite( + path=row["path"], + line=row["line"], + kind="call", + confidence=row["confidence"] if "confidence" in row.keys() else "extracted", + ) for row in repo.refs_for_name(conn, name) ] if kind == "all": sites.extend( + # A definition is the symbol itself — exact by construction. RefSite(path=row["path"], line=row["line_start"], kind="definition") for row in defs ) diff --git a/src/codebase_index/storage/db.py b/src/codebase_index/storage/db.py index 60abbcd..7cb1da1 100644 --- a/src/codebase_index/storage/db.py +++ b/src/codebase_index/storage/db.py @@ -8,7 +8,8 @@ from typing import Optional # 2: chunks gained a denormalized `symbol_names` column (FTS symbol-name boost). -SCHEMA_VERSION = 2 +# 3: edges gained a `confidence` column (extracted/inferred/ambiguous audit trail). 
+SCHEMA_VERSION = 3 class Database: diff --git a/src/codebase_index/storage/repo.py b/src/codebase_index/storage/repo.py index 77a54cd..bbb2157 100644 --- a/src/codebase_index/storage/repo.py +++ b/src/codebase_index/storage/repo.py @@ -253,11 +253,15 @@ def replace_edges( conn.executemany( """ INSERT INTO edges - (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line, resolved) + (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line, + resolved, confidence) VALUES - (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line, :resolved) + (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line, + :resolved, :confidence) """, - [{**edge, "file_id": file_id} for edge in edges], + # confidence defaults to 'extracted' for callers (and tests) that predate the + # audit-trail column; the global graph pass refines it (see graph/builder.py). + [{"confidence": "extracted", **edge, "file_id": file_id} for edge in edges], ) return len(edges) @@ -271,6 +275,7 @@ def refs_for_name(conn: sqlite3.Connection, name: str) -> list[sqlite3.Row]: """ SELECT e.line AS line, f.path AS path, e.edge_type AS edge_type, e.resolved AS resolved, e.src_id AS src_id, e.src_kind AS src_kind, + e.confidence AS confidence, src.name AS src_name, src.qualified AS src_qualified FROM edges e JOIN files f ON f.id = e.file_id @@ -388,15 +393,58 @@ def resolve_edge(conn: sqlite3.Connection, edge_id: int, dst_kind: str, dst_id: def resolve_edges_bulk( - conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int]] + conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int, str]] ) -> None: - """Apply (dst_kind, dst_id, edge_id) resolutions in one executemany.""" + """Apply (dst_kind, dst_id, edge_id, confidence) resolutions in one executemany. + + confidence records *how* the target was found: 'extracted' for an exact match + (a repo-unique symbol name), 'inferred' for a heuristic (import path-suffix). 
+ """ conn.executemany( - "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1 WHERE id = ?", - resolutions, + "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1, confidence = ? WHERE id = ?", + [(dst_kind, dst_id, confidence, edge_id) for dst_kind, dst_id, edge_id, confidence in resolutions], ) +def mark_ambiguous_edges(conn: sqlite3.Connection) -> int: + """Flag every still-unresolved edge that names a target as 'ambiguous'. + + Run after the global resolution pass: an edge with a dst_name that no unique + symbol/file claims is one we could not pin down (a non-unique name, or an import + of code outside the repo). Marking it keeps refs/impact honest — an empty or + short answer over ambiguous edges is inconclusive, not proof of "no callers". + """ + cur = conn.execute( + "UPDATE edges SET confidence = 'ambiguous' " + "WHERE resolved = 0 AND dst_name IS NOT NULL AND confidence != 'ambiguous'" + ) + return cur.rowcount if cur.rowcount is not None else 0 + + +def all_resolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]: + """Every resolved edge as (src_kind, src_id, dst_kind, dst_id, edge_type, confidence). + + The in-memory adjacency the graph analysis (communities / god nodes / bridges) + is built from. Unresolved edges are skipped — they have no concrete endpoint. 
+ """ + return conn.execute( + "SELECT src_kind, src_id, dst_kind, dst_id, edge_type, confidence FROM edges " + "WHERE resolved = 1 AND dst_id IS NOT NULL" + ).fetchall() + + +def all_graph_nodes(conn: sqlite3.Connection) -> dict[str, list[sqlite3.Row]]: + """File and symbol rows keyed by kind, for labelling graph-analysis nodes.""" + return { + "file": conn.execute("SELECT id, path FROM files").fetchall(), + "symbol": conn.execute( + "SELECT s.id AS id, s.name AS name, s.kind AS kind, f.path AS path, " + " s.in_degree AS in_degree, s.out_degree AS out_degree " + "FROM symbols s JOIN files f ON f.id = s.file_id" + ).fetchall(), + } + + def name_ref_counts(conn: sqlite3.Connection, names: Sequence[str]) -> dict[str, int]: """Count edges targeting each name (any resolution state), keyed by dst_name. @@ -458,7 +506,7 @@ def symbols_in_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row] def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]: return conn.execute( - "SELECT id, edge_type, src_kind, src_id, file_id, line FROM edges " + "SELECT id, edge_type, src_kind, src_id, file_id, line, confidence FROM edges " "WHERE resolved = 1 AND dst_kind = ? AND dst_id = ?", (kind, node_id), ).fetchall() @@ -466,7 +514,7 @@ def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sq def outgoing_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]: return conn.execute( - "SELECT id, edge_type, dst_kind, dst_id, file_id, line FROM edges " + "SELECT id, edge_type, dst_kind, dst_id, file_id, line, confidence FROM edges " "WHERE resolved = 1 AND src_kind = ? 
AND src_id = ?", (kind, node_id), ).fetchall() diff --git a/src/codebase_index/storage/schema.sql b/src/codebase_index/storage/schema.sql index 10bde93..bed17bb 100644 --- a/src/codebase_index/storage/schema.sql +++ b/src/codebase_index/storage/schema.sql @@ -63,7 +63,13 @@ CREATE TABLE IF NOT EXISTS edges ( dst_name TEXT, file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, line INTEGER, - resolved INTEGER NOT NULL DEFAULT 0 + resolved INTEGER NOT NULL DEFAULT 0, + -- Honesty audit trail (see docs/SCHEMA.md). How sure are we this edge points + -- where it claims? 'extracted' = exact match (same-file symbol or a repo-unique + -- name); 'inferred' = a heuristic resolved it (import path-suffix); 'ambiguous' + -- = a name/import we could not pin to a unique target. Set at build time by the + -- global graph pass; never guessed by an LLM (the index is fully local). + confidence TEXT NOT NULL DEFAULT 'extracted' ); CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_kind, src_id); CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_kind, dst_id); diff --git a/tests/golden/impact_user_model.json b/tests/golden/impact_user_model.json index 9ec0c77..cb8d033 100644 --- a/tests/golden/impact_user_model.json +++ b/tests/golden/impact_user_model.json @@ -23,6 +23,7 @@ "line_start": null, "name": null, "path": "src/api/service.py", + "via_confidence": "inferred", "via_edge": "import" }, { @@ -31,6 +32,7 @@ "line_start": 7, "name": "AdminUser", "path": "src/api/service.py", + "via_confidence": "extracted", "via_edge": "extends" } ], diff --git a/tests/golden/mcp_find_refs.json b/tests/golden/mcp_find_refs.json index abdd727..13208fa 100644 --- a/tests/golden/mcp_find_refs.json +++ b/tests/golden/mcp_find_refs.json @@ -15,16 +15,19 @@ "schema_version": 1, "sites": [ { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/api/service.py" }, { + "confidence": "extracted", "kind": "definition", "line": 4, "path": "src/auth/token.py" }, { + 
"confidence": "extracted", "kind": "call", "line": 11, "path": "src/auth/token.py" diff --git a/tests/golden/mcp_impact_of.json b/tests/golden/mcp_impact_of.json index 5fc14dc..1629921 100644 --- a/tests/golden/mcp_impact_of.json +++ b/tests/golden/mcp_impact_of.json @@ -23,6 +23,7 @@ "line_start": null, "name": null, "path": "src/api/service.py", + "via_confidence": "inferred", "via_edge": "import" }, { @@ -31,6 +32,7 @@ "line_start": 7, "name": "AdminUser", "path": "src/api/service.py", + "via_confidence": "extracted", "via_edge": "extends" } ], diff --git a/tests/golden/refs_refresh_access_token.json b/tests/golden/refs_refresh_access_token.json index f5693df..1ce827e 100644 --- a/tests/golden/refs_refresh_access_token.json +++ b/tests/golden/refs_refresh_access_token.json @@ -14,16 +14,19 @@ "query": "refresh_access_token", "sites": [ { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/api/service.py" }, { + "confidence": "extracted", "kind": "definition", "line": 4, "path": "src/auth/token.py" }, { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/auth/token.py" diff --git a/tests/test_analysis.py b/tests/test_analysis.py new file mode 100644 index 0000000..e4de0d2 --- /dev/null +++ b/tests/test_analysis.py @@ -0,0 +1,177 @@ +"""Tests for graph.analysis — communities / god nodes / surprising bridges. + +The pure-Python graph functions are deterministic, so the assertions pin exact +structure (two cliques joined by one bridge → two communities + one surprising +link) rather than fuzzy thresholds. 
+""" + +from __future__ import annotations + +from codebase_index.config import Config +from codebase_index.graph import analysis +from codebase_index.indexer.pipeline import build_index +from codebase_index.parsers.base import Symbol +from codebase_index.storage import repo +from codebase_index.storage.db import Database + + +# --- pure-Python graph algorithms (no DB) ------------------------------------- + +def _two_cliques_with_bridge(): + """Two triangles (A0-A1-A2) and (B0-B1-B2) joined by a single A0-B0 edge.""" + edges = [] + + def edge(s, d): + return {"src_kind": "symbol", "src_id": s, "dst_kind": "symbol", "dst_id": d} + + # clique A: ids 0,1,2 ; clique B: ids 10,11,12 + for a, b in [(0, 1), (1, 2), (0, 2)]: + edges.append(edge(a, b)) + for a, b in [(10, 11), (11, 12), (10, 12)]: + edges.append(edge(a, b)) + edges.append(edge(0, 10)) # the bridge + return edges + + +def test_detect_communities_splits_two_cliques(): + adj, _ = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + # All of clique A share one label; all of clique B share another; they differ. + a_labels = {comm[("symbol", i)] for i in (0, 1, 2)} + b_labels = {comm[("symbol", i)] for i in (10, 11, 12)} + assert len(a_labels) == 1 + assert len(b_labels) == 1 + assert a_labels != b_labels + + +def test_modularity_is_positive_for_clear_structure(): + adj, _ = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + assert analysis.modularity(adj, comm) > 0.0 + + +def test_god_nodes_rank_by_degree(): + # Make node 0 a hub: connect it to many leaves. 
+ edges = [ + {"src_kind": "symbol", "src_id": 0, "dst_kind": "symbol", "dst_id": leaf} + for leaf in range(1, 6) + ] + adj, _ = analysis.build_adjacency(edges) + comm = analysis.detect_communities(adj) + node_index = { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/x.py"} + for i in range(6) + } + gods = analysis.god_nodes(adj, comm, node_index, limit=3) + assert gods[0]["name"] == "sym0" + assert gods[0]["degree"] == 5 + + +def test_surprising_connection_finds_the_bridge(): + adj, edge_weight = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + node_index = { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/a.py"} + for i in (0, 1, 2) + } + node_index.update( + { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/b.py"} + for i in (10, 11, 12) + } + ) + surprising = analysis.surprising_connections(edge_weight, comm, node_index) + assert len(surprising) == 1 + names = {surprising[0]["from"]["name"], surprising[0]["to"]["name"]} + assert names == {"sym0", "sym10"} + assert surprising[0]["edge_count"] == 1 + + +def test_label_community_uses_dominant_directory(): + node_index = { + ("symbol", 1): {"kind": "symbol", "name": "a", "path": "src/auth/token.py"}, + ("symbol", 2): {"kind": "symbol", "name": "b", "path": "src/auth/login.py"}, + ("symbol", 3): {"kind": "symbol", "name": "c", "path": "src/db/conn.py"}, + } + label = analysis.label_community([("symbol", 1), ("symbol", 2), ("symbol", 3)], node_index) + assert label == "src/auth" + + +def test_suggest_questions_seeds_from_structure(): + gods = [{"kind": "symbol", "name": "Engine", "path": "x", "degree": 9, "community": 0}] + surprising = [ + { + "from": {"kind": "symbol", "name": "a", "path": "x"}, + "to": {"kind": "symbol", "name": "b", "path": "y"}, + "from_community": 0, + "to_community": 1, + "edge_count": 1, + } + ] + qs = analysis.suggest_questions(gods, surprising, {0: "core", 1: "io"}) + assert 
any("Engine" in q for q in qs) + assert any("core" in q and "io" in q for q in qs) + + +# --- integration against a real built index ----------------------------------- + +def _seed_two_modules(db: Database) -> None: + """auth module (token<-login) and db module (query<-exec), bridged login->query.""" + auth = repo.upsert_file( + db.conn, path="src/auth/token.py", lang="python", size_bytes=1, sha256="a", + mtime_ns=1, git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + db_f = repo.upsert_file( + db.conn, path="src/db/query.py", lang="python", size_bytes=1, sha256="b", + mtime_ns=1, git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + a = repo.replace_symbols(db.conn, auth, [ + Symbol(name="make_token", kind="function", line_start=1, line_end=2), + Symbol(name="login", kind="function", line_start=3, line_end=4), + ]) + b = repo.replace_symbols(db.conn, db_f, [ + Symbol(name="run_query", kind="function", line_start=1, line_end=2), + Symbol(name="exec_stmt", kind="function", line_start=3, line_end=4), + ]) + repo.replace_edges(db.conn, auth, [ + {"edge_type": "call", "src_kind": "symbol", "src_id": a[1], + "dst_kind": None, "dst_id": None, "dst_name": "make_token", "line": 3, "resolved": 0}, + {"edge_type": "call", "src_kind": "symbol", "src_id": a[1], + "dst_kind": None, "dst_id": None, "dst_name": "run_query", "line": 4, "resolved": 0}, + ]) + repo.replace_edges(db.conn, db_f, [ + {"edge_type": "call", "src_kind": "symbol", "src_id": b[0], + "dst_kind": None, "dst_id": None, "dst_name": "exec_stmt", "line": 2, "resolved": 0}, + ]) + + +def test_analyze_and_cache_roundtrip(tmp_path): + from codebase_index.graph.builder import build_graph + + db = Database(tmp_path / "index.sqlite").open() + _seed_two_modules(db) + build_graph(db.conn) # resolves edges + refresh_analysis caches the summary + + cached = analysis.load_analysis(db.conn) + assert cached is not None + assert cached["node_count"] > 0 + assert 
cached["god_nodes"], "expected at least one god node" + # Recomputing directly matches the cached summary's headline numbers. + fresh = analysis.analyze(db.conn) + assert fresh["node_count"] == cached["node_count"] + assert fresh["edge_count"] == cached["edge_count"] + db.close() + + +def test_analyze_on_sample_repo(sample_repo, tmp_path): + cfg = Config() + cfg.root = str(sample_repo) + db = Database(tmp_path / "index.sqlite").open() + build_index(cfg, db, root=sample_repo) + + summary = analysis.load_analysis(db.conn) + assert summary is not None + assert summary["node_count"] >= 1 + assert isinstance(summary["communities"], list) + assert isinstance(summary["questions"], list) + db.close() diff --git a/tests/test_graph.py b/tests/test_graph.py index f8e5110..f18c0ba 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -63,6 +63,22 @@ def test_build_graph_resolves_symbol_and_import_edges(tmp_path): db.close() +def test_build_graph_sets_edge_confidence(tmp_path): + db = _db(tmp_path) + _seed(db) + build_graph(db.conn) + conf = { + (r["edge_type"], r["dst_name"]): r["confidence"] + for r in db.conn.execute("SELECT edge_type, dst_name, confidence FROM edges") + } + # exact unique-name symbol match, import resolved by path-suffix heuristic, + # and a callee no symbol defines. + assert conf[("call", "refresh_access_token")] == "extracted" + assert conf[("import", "auth.token")] == "inferred" + assert conf[("call", "does_not_exist")] == "ambiguous" + db.close() + + def _file(db, path, sha="x"): return repo.upsert_file( db.conn, path=path, lang="python", size_bytes=1, sha256=sha, From d6bf1fee1735d4a7b32ee841ee869ba439618c66 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 23 Jun 2026 23:10:39 +0300 Subject: [PATCH 2/3] feat(graph): discount test paths when labelling communities Test files cluster with the code they exercise and often outnumber it, which mislabelled production modules as "tests". 
Community labels now prefer the dominant non-test directory; an all-test community is still named for its tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/codebase_index/graph/analysis.py | 28 +++++++++++++++++++++++----- tests/test_analysis.py | 17 +++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/codebase_index/graph/analysis.py b/src/codebase_index/graph/analysis.py index a23c934..b50da2c 100644 --- a/src/codebase_index/graph/analysis.py +++ b/src/codebase_index/graph/analysis.py @@ -207,18 +207,36 @@ def _dir_of(path: str) -> str: return path.rsplit("/", 1)[0] if "/" in path else "(root)" +def _is_test_path(path: str) -> bool: + """Test files cluster with the code they exercise; don't let them name the module.""" + lower = path.lower() + parts = lower.split("/") + if any(p in ("test", "tests", "__tests__", "spec", "specs") for p in parts): + return True + base = parts[-1] + return base.startswith("test_") or base.startswith("test.") or "_test." in base or ".test." in base + + def label_community(members: list[Node], node_index: dict[Node, dict]) -> str: - """Name a community by the directory most of its nodes live in. + """Name a community by the directory most of its (non-test) nodes live in. A 2-5 word, plain-language module name is what graphify asks an LLM for; here we derive it deterministically from the dominant source directory, which for - code is a strong proxy for "what this module is". + code is a strong proxy for "what this module is". Test paths are discounted so + a cluster of production symbols isn't mislabelled "tests" just because its test + files outnumber it; a community that is *only* tests still gets named for them.
""" - dirs: Counter = Counter() + prod: Counter = Counter() + allp: Counter = Counter() for node in members: meta = node_index.get(node) - if meta and meta.get("path"): - dirs[_dir_of(meta["path"])] += 1 + if not (meta and meta.get("path")): + continue + d = _dir_of(meta["path"]) + allp[d] += 1 + if not _is_test_path(meta["path"]): + prod[d] += 1 + dirs = prod or allp if not dirs: return "module" # Most common dir; tie -> shortest then lexicographically smallest (stable). diff --git a/tests/test_analysis.py b/tests/test_analysis.py index e4de0d2..60eb90a 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -97,6 +97,23 @@ def test_label_community_uses_dominant_directory(): assert label == "src/auth" +def test_label_community_discounts_test_paths(): + # Two production symbols in src/storage and three test files that exercise them: + # tests outnumber prod, but the module should still be named for the prod code. + node_index = { + ("symbol", 1): {"kind": "symbol", "name": "a", "path": "src/storage/db.py"}, + ("symbol", 2): {"kind": "symbol", "name": "b", "path": "src/storage/repo.py"}, + ("symbol", 3): {"kind": "symbol", "name": "t1", "path": "tests/test_db.py"}, + ("symbol", 4): {"kind": "symbol", "name": "t2", "path": "tests/test_repo.py"}, + ("symbol", 5): {"kind": "symbol", "name": "t3", "path": "tests/test_x.py"}, + } + members = [("symbol", i) for i in range(1, 6)] + assert analysis.label_community(members, node_index) == "src/storage" + # A community that is *only* tests still gets named for them. 
+ only_tests = [("symbol", i) for i in (3, 4, 5)] + assert analysis.label_community(only_tests, node_index) == "tests" + + def test_suggest_questions_seeds_from_structure(): gods = [{"kind": "symbol", "name": "Engine", "path": "x", "degree": 9, "community": 0}] surprising = [ From 9fb8762ba13a08bbf8953fb145d39c4fae27d154 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 23 Jun 2026 23:50:07 +0300 Subject: [PATCH 3/3] fix(graph): key analytics by stable content keys, not volatile symbol ids Symbol ids are assigned in file-walk order, which differs across OSes, so the community partition / god-node ranking (and thus the architecture/path/describe golden snapshots) diverged between Windows and the Linux/macOS CI runners. Key the analysis graph by content (kind:path:name:line) instead. build_adjacency takes an optional key_fn; analyze() passes the stable key so the result is identical on every platform. The community/degree helpers are generic over the node-key type (tuple in tests, str in analyze/export). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/codebase_index/graph/analysis.py | 98 +++++++++++++++++++--------- src/codebase_index/storage/repo.py | 1 + 2 files changed, 68 insertions(+), 31 deletions(-) diff --git a/src/codebase_index/graph/analysis.py b/src/codebase_index/graph/analysis.py index b50da2c..e309b0c 100644 --- a/src/codebase_index/graph/analysis.py +++ b/src/codebase_index/graph/analysis.py @@ -27,7 +27,7 @@ import json import sqlite3 from collections import Counter, defaultdict -from typing import Optional +from typing import Any, Optional from ..storage import repo @@ -54,23 +54,29 @@ # Graph construction # --------------------------------------------------------------------------- -def _node_key(kind: str, node_id: int) -> str: - return f"{kind}:{node_id}" - - def build_adjacency( edges: list[sqlite3.Row], -) -> tuple[dict[Node, Counter], dict[tuple[Node, Node], int]]: + key_fn=None, +) -> tuple[dict[Any, Counter], dict[tuple[Any, Any], int]]: """Undirected weighted adjacency + per-edge multiplicity, from resolved edges. Self-loops are dropped (they distort degree and never bridge communities). + + ``key_fn(kind, id) -> hashable | None`` maps an edge endpoint to a node key + (returning None drops the edge). analyze() passes a *content* key + (kind:path:name:line) so the partition is identical across platforms — symbol + ids depend on file-walk order, which differs between OSes. The default keys by + (kind, id), used by the algorithm unit tests. 
""" - adj: dict[Node, Counter] = defaultdict(Counter) - edge_weight: dict[tuple[Node, Node], int] = defaultdict(int) + def kf(kind: str, nid: int): + return key_fn(kind, nid) if key_fn is not None else (kind, nid) + + adj: dict[Any, Counter] = defaultdict(Counter) + edge_weight: dict[tuple[Any, Any], int] = defaultdict(int) for e in edges: - src: Node = (e["src_kind"], int(e["src_id"])) - dst: Node = (e["dst_kind"], int(e["dst_id"])) - if src == dst: + src = kf(e["src_kind"], int(e["src_id"])) + dst = kf(e["dst_kind"], int(e["dst_id"])) + if src is None or dst is None or src == dst: continue adj[src][dst] += 1 adj[dst][src] += 1 @@ -78,11 +84,14 @@ def build_adjacency( return adj, edge_weight -def _canonical_pair(a: Node, b: Node) -> tuple[Node, Node]: +def _canonical_pair(a: Any, b: Any) -> tuple[Any, Any]: return (a, b) if a <= b else (b, a) -def weighted_degree(adj: dict[Node, Counter]) -> dict[Node, int]: +# The graph algorithms below are generic over the node-key type: the unit tests +# call them with (kind, id) tuples; analyze() and the HTML/interop export call +# them with stable string keys. Typing the key as Any keeps both call sites valid. +def weighted_degree(adj: dict[Any, Counter]) -> dict[Any, int]: return {node: sum(neighbors.values()) for node, neighbors in adj.items()} @@ -90,7 +99,7 @@ def weighted_degree(adj: dict[Node, Counter]) -> dict[Node, int]: # Community detection — deterministic label propagation # --------------------------------------------------------------------------- -def detect_communities(adj: dict[Node, Counter]) -> dict[Node, int]: +def detect_communities(adj: dict[Any, Counter]) -> dict[Any, int]: """Partition nodes into communities by greedy modularity. Returns {node: id}.
This is the local-moving phase of the Louvain method, made deterministic: @@ -110,7 +119,7 @@ def detect_communities(adj: dict[Node, Counter]) -> dict[Node, int]: if two_m == 0: return _renumber_by_size({node: idx for idx, node in enumerate(nodes)}) - comm: dict[Node, int] = {node: idx for idx, node in enumerate(nodes)} + comm: dict[Any, int] = {node: idx for idx, node in enumerate(nodes)} # Σ_tot per community: total weighted degree of its members. sigma_tot: dict[int, int] = {idx: deg[node] for idx, node in enumerate(nodes)} @@ -147,10 +156,10 @@ def detect_communities(adj: dict[Node, Counter]) -> dict[Node, int]: return _renumber_by_size(comm) -def _renumber_by_size(label: dict[Node, int]) -> dict[Node, int]: +def _renumber_by_size(label: dict[Any, int]) -> dict[Any, int]: """Renumber raw labels to dense ids ordered by community size (desc), then by smallest member key — so the mapping is stable run to run.""" - members: dict[int, list[Node]] = defaultdict(list) + members: dict[int, list[Any]] = defaultdict(list) for node, lbl in label.items(): members[lbl].append(node) order = sorted(members, key=lambda lbl: (-len(members[lbl]), min(members[lbl]))) @@ -158,7 +167,7 @@ def _renumber_by_size(label: dict[Node, int]) -> dict[Node, int]: return {node: remap[lbl] for node, lbl in label.items()} -def modularity(adj: dict[Node, Counter], communities: dict[Node, int]) -> float: +def modularity(adj: dict[Any, Counter], communities: dict[Any, int]) -> float: """Newman modularity Q of the partition — a quality score in roughly [-0.5, 1]. Higher means the communities capture more edge density than chance. 
Reported @@ -197,12 +206,25 @@ def _node_index(conn: sqlite3.Connection) -> dict[Node, dict]: "name": s["name"], "symbol_kind": s["kind"], "path": s["path"], + "line_start": s["line_start"], "in_degree": int(s["in_degree"]), "out_degree": int(s["out_degree"]), } return index +def _stable_key(meta: dict) -> str: + """A platform-stable node key from content, not from the volatile symbol id. + + Symbol ids are assigned in file-walk order, which differs across OSes; keying + the graph by path/name/line keeps communities and god-node ranking identical + everywhere (so the golden snapshots hold on Linux/macOS/Windows alike). + """ + if meta["kind"] == "file": + return f"file::{meta['path']}" + return f"symbol::{meta['path']}::{meta['name']}::{meta.get('line_start', '')}" + + def _dir_of(path: str) -> str: return path.rsplit("/", 1)[0] if "/" in path else "(root)" @@ -217,7 +239,7 @@ def _is_test_path(path: str) -> bool: return base.startswith("test_") or base.startswith("test.") or "_test." in base or ".test." in base -def label_community(members: list[Node], node_index: dict[Node, dict]) -> str: +def label_community(members: list[Any], node_index: dict[Any, dict]) -> str: """Name a community by the directory most of its (non-test) nodes live in. 
A 2-5 word, plain-language module name is what graphify asks an LLM for; here @@ -249,15 +271,15 @@ def label_community(members: list[Node], node_index: dict[Node, dict]) -> str: # --------------------------------------------------------------------------- def god_nodes( - adj: dict[Node, Counter], - communities: dict[Node, int], - node_index: dict[Node, dict], + adj: dict[Any, Counter], + communities: dict[Any, int], + node_index: dict[Any, dict], *, limit: int = MAX_GOD_NODES, ) -> list[dict]: """Most-connected nodes by weighted degree (the load-bearing ones).""" deg = weighted_degree(adj) - ranked = sorted(deg, key=lambda n: (-deg[n], _node_key(*n))) + ranked = sorted(deg, key=lambda n: (-deg[n], str(n))) out: list[dict] = [] for node in ranked[:limit]: meta = node_index.get(node) @@ -276,9 +298,9 @@ def god_nodes( def surprising_connections( - edge_weight: dict[tuple[Node, Node], int], - communities: dict[Node, int], - node_index: dict[Node, dict], + edge_weight: dict[tuple[Any, Any], int], + communities: dict[Any, int], + node_index: dict[Any, dict], *, limit: int = MAX_SURPRISING, ) -> list[dict]: @@ -288,7 +310,7 @@ def surprising_connections( a pair joined by only a handful of edges is a surprising structural link. We surface the actual endpoint pair for each such bridge. 
""" - pair_edges: dict[tuple[int, int], list[tuple[Node, Node]]] = defaultdict(list) + pair_edges: dict[tuple[int, int], list[tuple[Any, Any]]] = defaultdict(list) for (a, b), _w in edge_weight.items(): ca, cb = communities.get(a, -1), communities.get(b, -1) if ca == cb or ca < 0 or cb < 0: @@ -360,11 +382,25 @@ def suggest_questions( def analyze(conn: sqlite3.Connection) -> dict: """Compute the full architecture-analytics summary (does not persist it).""" edges = repo.all_resolved_edges(conn) - adj, edge_weight = build_adjacency(edges) - node_index = _node_index(conn) + id_index = _node_index(conn) # (kind, id) -> meta + + # Key the graph by stable content keys, not by volatile symbol ids, so the + # result is identical across platforms. node_index then maps that stable key + # back to display metadata. + node_index: dict[str, dict] = {} + + def key_fn(kind: str, nid: int): + meta = id_index.get((kind, nid)) + if meta is None: + return None + k = _stable_key(meta) + node_index.setdefault(k, meta) + return k + + adj, edge_weight = build_adjacency(edges, key_fn) communities = detect_communities(adj) - members: dict[int, list[Node]] = defaultdict(list) + members: dict[int, list[str]] = defaultdict(list) for node, cid in communities.items(): members[cid].append(node) @@ -377,7 +413,7 @@ def analyze(conn: sqlite3.Connection) -> dict: nodes = members[cid] if len(nodes) < MIN_REPORTED_COMMUNITY: continue - top = sorted(nodes, key=lambda n: (-deg.get(n, 0), _node_key(*n)))[:TOP_NODES_PER_COMMUNITY] + top = sorted(nodes, key=lambda n: (-deg.get(n, 0), str(n)))[:TOP_NODES_PER_COMMUNITY] community_summaries.append( { "id": cid, diff --git a/src/codebase_index/storage/repo.py b/src/codebase_index/storage/repo.py index bbb2157..6bef9eb 100644 --- a/src/codebase_index/storage/repo.py +++ b/src/codebase_index/storage/repo.py @@ -439,6 +439,7 @@ def all_graph_nodes(conn: sqlite3.Connection) -> dict[str, list[sqlite3.Row]]: "file": conn.execute("SELECT id, path FROM 
files").fetchall(), "symbol": conn.execute( "SELECT s.id AS id, s.name AS name, s.kind AS kind, f.path AS path, " + " s.line_start AS line_start, " " s.in_degree AS in_degree, s.out_degree AS out_degree " "FROM symbols s JOIN files f ON f.id = s.file_id" ).fetchall(),