diff --git a/CHANGELOG.md b/CHANGELOG.md index 347ee4e..b50d274 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,23 @@ All notable changes to this project are documented here. The format is based on ## [Unreleased] +### Added — graph foundation: edge confidence + architecture analytics (requires a one-time reindex) +- **Edge confidence audit trail.** Every graph edge now carries a `confidence`: + `extracted` (exact — a same-file symbol or repo-unique name), `inferred` (a + heuristic resolved it, e.g. an import path-suffix match), or `ambiguous` (a named + target we could not pin to a unique node). `refs` and `impact` surface it so an + empty or short answer over `ambiguous`/`inferred` edges reads as inconclusive, + not as proof. Confidence is derived from *how* an edge resolved — never guessed by + an LLM; the index stays fully local. **Bumps `SCHEMA_VERSION` 2 → 3.** Older + indexes stay readable; `index`/`update` detect the mismatch and rebuild. +- **Architecture analytics (`graph/analysis.py`), zero new dependencies.** A pure, + deterministic pass over the resolved edge graph computes communities (greedy + modularity / Louvain local-move — does not collapse cliques joined by one bridge), + god nodes (most-connected symbols/files), surprising connections (edges bridging + weakly-linked communities), auto-labelled modules, and suggested questions. The + summary is cached in `meta['graph_analysis']` at build time for instant reads. + (Surfaced via the `architecture` command and HTML export in following changes.) 
+ ### Changed — retrieval ranking & fusion (requires a one-time reindex) - **RRF fusion rescaled and re-keyed.** Fused scores were ~`w/k` (≈0.017), an order of magnitude below the reranker's bounded bonuses, so rerank silently became the diff --git a/docs/SCHEMA.md b/docs/SCHEMA.md index 21f2735..490dc85 100644 --- a/docs/SCHEMA.md +++ b/docs/SCHEMA.md @@ -75,7 +75,13 @@ CREATE TABLE edges ( dst_name TEXT, -- raw target text (for unresolved edges) file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, line INTEGER, - resolved INTEGER NOT NULL DEFAULT 0 + resolved INTEGER NOT NULL DEFAULT 0, + -- Honesty audit trail: how the edge's target was determined. + -- extracted = exact (same-file symbol, or a repo-unique name) + -- inferred = a heuristic resolved it (import path-suffix match) + -- ambiguous = a named target we could not pin to a unique node + -- Set by the global graph pass; never inferred by an LLM (the index is local). + confidence TEXT NOT NULL DEFAULT 'extracted' ); CREATE INDEX idx_edges_src ON edges(src_kind, src_id); CREATE INDEX idx_edges_dst ON edges(dst_kind, dst_id); diff --git a/src/codebase_index/graph/analysis.py b/src/codebase_index/graph/analysis.py new file mode 100644 index 0000000..e309b0c --- /dev/null +++ b/src/codebase_index/graph/analysis.py @@ -0,0 +1,468 @@ +"""Architecture analytics over the resolved edge graph — zero external deps. + +This is the codebase-index take on graphify's community detection / god nodes / +surprising connections, implemented in pure, deterministic Python so the core +install stays dependency-free and the results are stable across runs (which +matters for the golden-snapshot tests and CI). + +What it computes from the in-memory adjacency of resolved edges: + + * communities - greedy modularity (Louvain local-move) groups tightly-connected + nodes into "modules". Deterministic: nodes are visited in a fixed key + order and ties break to the smallest community id, so the same graph + always yields the same partition.
+ * god nodes - the most-connected nodes (weighted degree). These are the + symbols/files most of the codebase leans on. + * surprising - edges that bridge two otherwise weakly-connected communities. + The cross-module links you would not think to look for. + * questions - template-generated starting questions seeded from the god + nodes and the bridges, mirroring graphify's GRAPH_REPORT. + +The summary is cached in meta['graph_analysis'] by refresh_analysis() at build +time; the `architecture` command and HTML export read it back instantly. +""" + +from __future__ import annotations + +import json +import sqlite3 +from collections import Counter, defaultdict +from typing import Any, Optional + +from ..storage import repo + +# How many items to keep in the cached summary. Bounded so the meta JSON stays +# small even on very large repos. +MAX_GOD_NODES = 20 +MAX_SURPRISING = 12 +MAX_QUESTIONS = 8 +TOP_NODES_PER_COMMUNITY = 5 +MAX_COMMUNITIES_IN_SUMMARY = 40 +# A community smaller than this is noise for reporting (isolated/leaf nodes). +MIN_REPORTED_COMMUNITY = 2 +# A pair of communities joined by at most this many edges is a "bridge". +BRIDGE_MAX_EDGES = 2 +# Cap on local-move passes; the partition almost always settles in 2-4. +_LOCAL_MOVE_PASSES = 20 + +ANALYSIS_META_KEY = "graph_analysis" + +Node = tuple[str, int] # (kind, id) + + +# --------------------------------------------------------------------------- +# Graph construction +# --------------------------------------------------------------------------- + +def build_adjacency( + edges: list[sqlite3.Row], + key_fn=None, +) -> tuple[dict[Any, Counter], dict[tuple[Any, Any], int]]: + """Undirected weighted adjacency + per-edge multiplicity, from resolved edges. + + Self-loops are dropped (they distort degree and never bridge communities). + + ``key_fn(kind, id) -> hashable | None`` maps an edge endpoint to a node key + (returning None drops the edge). 
analyze() passes a *content* key + (kind:path:name:line) so the partition is identical across platforms — symbol + ids depend on file-walk order, which differs between OSes. The default keys by + (kind, id), used by the algorithm unit tests. + """ + def kf(kind: str, nid: int): + return key_fn(kind, nid) if key_fn is not None else (kind, nid) + + adj: dict[Any, Counter] = defaultdict(Counter) + edge_weight: dict[tuple[Any, Any], int] = defaultdict(int) + for e in edges: + src = kf(e["src_kind"], int(e["src_id"])) + dst = kf(e["dst_kind"], int(e["dst_id"])) + if src is None or dst is None or src == dst: + continue + adj[src][dst] += 1 + adj[dst][src] += 1 + edge_weight[_canonical_pair(src, dst)] += 1 + return adj, edge_weight + + +def _canonical_pair(a: Any, b: Any) -> tuple[Any, Any]: + return (a, b) if a <= b else (b, a) + + +# The graph algorithms below are generic over the node-key type: analyze() and +# the HTML/interop export call them with string keys; the unit tests use the +# default (kind, id) tuples. Typing the key as Any keeps every call site valid. +def weighted_degree(adj: dict[Any, Counter]) -> dict[Any, int]: + return {node: sum(neighbors.values()) for node, neighbors in adj.items()} + + +# --------------------------------------------------------------------------- +# Community detection — deterministic greedy modularity (Louvain local move) +# --------------------------------------------------------------------------- + +def detect_communities(adj: dict[Any, Counter]) -> dict[Any, int]: + """Partition nodes into communities by greedy modularity. Returns {node: id}. + + This is the local-moving phase of the Louvain method, made deterministic: + every node starts alone, then in a fixed key order each node moves to the + neighbouring community that yields the largest modularity gain (ties break to + the smallest community id). Passes repeat until no node moves (at most _LOCAL_MOVE_PASSES passes).
Unlike label + propagation it does not collapse two cliques joined by a single bridge — the + bridge's gain cannot beat the dense intra-clique structure. Labels are + renumbered to dense, size-ranked ids so community 0 is always the largest. + """ + nodes = sorted(adj.keys()) + if not nodes: + return {} + + deg = weighted_degree(adj) + two_m = sum(deg.values()) # = 2 * total edge weight + if two_m == 0: + return _renumber_by_size({node: idx for idx, node in enumerate(nodes)}) + + comm: dict[Any, int] = {node: idx for idx, node in enumerate(nodes)} + # Σ_tot per community: total weighted degree of its members. + sigma_tot: dict[int, int] = {idx: deg[node] for idx, node in enumerate(nodes)} + + for _ in range(_LOCAL_MOVE_PASSES): + moved = False + for node in nodes: + ki = deg[node] + ci = comm[node] + # Detach node from its current community. + sigma_tot[ci] -= ki + + # Weight from node into each neighbouring community. + links: Counter = Counter() + for neighbor, w in adj[node].items(): + if neighbor != node: + links[comm[neighbor]] += w + + # Pick the community maximising w_in - Σ_tot * k_i / (2m). + # Baseline = staying isolated (its own now-empty community), gain 0. 
+ best_c = ci + best_gain = links.get(ci, 0) - sigma_tot[ci] * ki / two_m + for c, w_in in sorted(links.items()): + gain = w_in - sigma_tot[c] * ki / two_m + if gain > best_gain + 1e-12: + best_gain, best_c = gain, c + + comm[node] = best_c + sigma_tot[best_c] += ki + if best_c != ci: + moved = True + if not moved: + break + + return _renumber_by_size(comm) + + +def _renumber_by_size(label: dict[Any, int]) -> dict[Any, int]: + """Renumber raw labels to dense ids ordered by community size (desc), then by + smallest member key — so the mapping is stable run to run.""" + members: dict[int, list[Any]] = defaultdict(list) + for node, lbl in label.items(): + members[lbl].append(node) + order = sorted(members, key=lambda lbl: (-len(members[lbl]), min(members[lbl]))) + remap = {old: new for new, old in enumerate(order)} + return {node: remap[lbl] for node, lbl in label.items()} + + +def modularity(adj: dict[Any, Counter], communities: dict[Any, int]) -> float: + """Newman modularity Q of the partition — a quality score in roughly [-0.5, 1]. + + Higher means the communities capture more edge density than chance. Reported + so the user can judge how meaningful the module split is. 
+ """ + m2 = sum(sum(neighbors.values()) for neighbors in adj.values()) # = 2 * |E| + if m2 == 0: + return 0.0 + deg = weighted_degree(adj) + q = 0.0 + for node, neighbors in adj.items(): + ci = communities[node] + for neighbor, weight in neighbors.items(): + if communities[neighbor] == ci: + q += weight - deg[node] * deg[neighbor] / m2 + return round(q / m2, 4) + + +# --------------------------------------------------------------------------- +# Node labelling +# --------------------------------------------------------------------------- + +def _node_index(conn: sqlite3.Connection) -> dict[Node, dict]: + """(kind, id) -> display metadata {kind, name, path, degree fields}.""" + rows = repo.all_graph_nodes(conn) + index: dict[Node, dict] = {} + for f in rows["file"]: + index[("file", int(f["id"]))] = { + "kind": "file", + "name": f["path"].rsplit("/", 1)[-1], + "path": f["path"], + } + for s in rows["symbol"]: + index[("symbol", int(s["id"]))] = { + "kind": "symbol", + "name": s["name"], + "symbol_kind": s["kind"], + "path": s["path"], + "line_start": s["line_start"], + "in_degree": int(s["in_degree"]), + "out_degree": int(s["out_degree"]), + } + return index + + +def _stable_key(meta: dict) -> str: + """A platform-stable node key from content, not from the volatile symbol id. + + Symbol ids are assigned in file-walk order, which differs across OSes; keying + the graph by path/name/line keeps communities and god-node ranking identical + everywhere (so the golden snapshots hold on Linux/macOS/Windows alike). 
+ """ + if meta["kind"] == "file": + return f"file::{meta['path']}" + return f"symbol::{meta['path']}::{meta['name']}::{meta.get('line_start', '')}" + + +def _dir_of(path: str) -> str: + return path.rsplit("/", 1)[0] if "/" in path else "(root)" + + +def _is_test_path(path: str) -> bool: + """Test files cluster with the code they exercise; don't let them name the module.""" + lower = path.lower() + parts = lower.split("/") + if any(p in ("test", "tests", "__tests__", "spec", "specs") for p in parts): + return True + base = parts[-1] + return base.startswith("test_") or base.startswith("test.") or "_test." in base or ".test." in base + + +def label_community(members: list[Any], node_index: dict[Any, dict]) -> str: + """Name a community by the directory most of its (non-test) nodes live in. + + A 2-5 word, plain-language module name is what graphify asks an LLM for; here + we derive it deterministically from the dominant source directory, which for + code is a strong proxy for "what this module is". Test paths are discounted so + a cluster of production symbols isn't mislabelled "tests" just because its test + files outnumber it; a community that is *only* tests still gets named for them. + """ + prod: Counter = Counter() + allp: Counter = Counter() + for node in members: + meta = node_index.get(node) + if not (meta and meta.get("path")): + continue + d = _dir_of(meta["path"]) + allp[d] += 1 + if not _is_test_path(meta["path"]): + prod[d] += 1 + dirs = prod or allp + if not dirs: + return "module" + # Most common dir; tie -> shortest then lexicographically smallest (stable). 
+ top = min(dirs.items(), key=lambda kv: (-kv[1], len(kv[0]), kv[0])) + return top[0] + + +# --------------------------------------------------------------------------- +# God nodes / surprising connections / questions +# --------------------------------------------------------------------------- + +def god_nodes( + adj: dict[Any, Counter], + communities: dict[Any, int], + node_index: dict[Any, dict], + *, + limit: int = MAX_GOD_NODES, +) -> list[dict]: + """Most-connected nodes by weighted degree (the load-bearing ones).""" + deg = weighted_degree(adj) + ranked = sorted(deg, key=lambda n: (-deg[n], str(n))) + out: list[dict] = [] + for node in ranked[:limit]: + meta = node_index.get(node) + if meta is None: + continue + out.append( + { + "kind": meta["kind"], + "name": meta["name"], + "path": meta.get("path"), + "degree": deg[node], + "community": communities.get(node, -1), + } + ) + return out + + +def surprising_connections( + edge_weight: dict[tuple[Any, Any], int], + communities: dict[Any, int], + node_index: dict[Any, dict], + *, + limit: int = MAX_SURPRISING, +) -> list[dict]: + """Edges that bridge two communities barely connected to each other. + + For each unordered community pair we count how many edges cross between them; + a pair joined by only a handful of edges is a surprising structural link. We + surface the actual endpoint pair for each such bridge. + """ + pair_edges: dict[tuple[int, int], list[tuple[Any, Any]]] = defaultdict(list) + for (a, b), _w in edge_weight.items(): + ca, cb = communities.get(a, -1), communities.get(b, -1) + if ca == cb or ca < 0 or cb < 0: + continue + key = (ca, cb) if ca < cb else (cb, ca) + pair_edges[key].append((a, b)) + + bridges = [ + (pair, endpoints) + for pair, endpoints in pair_edges.items() + if len(endpoints) <= BRIDGE_MAX_EDGES + ] + # Rarest bridges first (a single edge between modules is the most surprising), + # then by community-pair id for stability. 
+ bridges.sort(key=lambda item: (len(item[1]), item[0])) + + out: list[dict] = [] + for (ca, cb), endpoints in bridges[:limit]: + a, b = sorted(endpoints)[0] + ma, mb = node_index.get(a), node_index.get(b) + if ma is None or mb is None: + continue + out.append( + { + "from": {"kind": ma["kind"], "name": ma["name"], "path": ma.get("path")}, + "to": {"kind": mb["kind"], "name": mb["name"], "path": mb.get("path")}, + "from_community": ca, + "to_community": cb, + "edge_count": len(endpoints), + } + ) + return out + + +def suggest_questions( + gods: list[dict], + surprising: list[dict], + community_labels: dict[int, str], + *, + limit: int = MAX_QUESTIONS, +) -> list[str]: + """Starter questions seeded from the structure, like graphify's report.""" + questions: list[str] = [] + for g in gods[:3]: + if g["kind"] == "symbol": + questions.append(f"How does `{g['name']}` work?") + questions.append(f"What breaks if `{g['name']}` changes?") + else: + questions.append(f"What is the role of `{g['name']}` in the architecture?") + for s in surprising[:3]: + la = community_labels.get(s["from_community"], f"community {s['from_community']}") + lb = community_labels.get(s["to_community"], f"community {s['to_community']}") + if la != lb: + questions.append(f"How is `{la}` connected to `{lb}`?") + # De-dup, preserve order. 
+ seen: set[str] = set() + deduped: list[str] = [] + for q in questions: + if q not in seen: + seen.add(q) + deduped.append(q) + return deduped[:limit] + + +# --------------------------------------------------------------------------- +# Top-level entry points +# --------------------------------------------------------------------------- + +def analyze(conn: sqlite3.Connection) -> dict: + """Compute the full architecture-analytics summary (does not persist it).""" + edges = repo.all_resolved_edges(conn) + id_index = _node_index(conn) # (kind, id) -> meta + + # Key the graph by stable content keys, not by volatile symbol ids, so the + # result is identical across platforms. node_index then maps that stable key + # back to display metadata. + node_index: dict[str, dict] = {} + + def key_fn(kind: str, nid: int): + meta = id_index.get((kind, nid)) + if meta is None: + return None + k = _stable_key(meta) + node_index.setdefault(k, meta) + return k + + adj, edge_weight = build_adjacency(edges, key_fn) + + communities = detect_communities(adj) + members: dict[int, list[str]] = defaultdict(list) + for node, cid in communities.items(): + members[cid].append(node) + + community_labels = {cid: label_community(nodes, node_index) for cid, nodes in members.items()} + deg = weighted_degree(adj) + + community_summaries: list[dict] = [] + reported = sorted(members, key=lambda cid: (-len(members[cid]), cid)) + for cid in reported: + nodes = members[cid] + if len(nodes) < MIN_REPORTED_COMMUNITY: + continue + top = sorted(nodes, key=lambda n: (-deg.get(n, 0), str(n)))[:TOP_NODES_PER_COMMUNITY] + community_summaries.append( + { + "id": cid, + "label": community_labels[cid], + "size": len(nodes), + "top_nodes": [ + { + "kind": node_index[n]["kind"], + "name": node_index[n]["name"], + "path": node_index[n].get("path"), + "degree": deg.get(n, 0), + } + for n in top + if n in node_index + ], + } + ) + if len(community_summaries) >= MAX_COMMUNITIES_IN_SUMMARY: + break + + gods = 
god_nodes(adj, communities, node_index) + surprising = surprising_connections(edge_weight, communities, node_index) + questions = suggest_questions(gods, surprising, community_labels) + + return { + "node_count": len(adj), + "edge_count": sum(edge_weight.values()), + "community_count": sum(1 for nodes in members.values() if len(nodes) >= MIN_REPORTED_COMMUNITY), + "modularity": modularity(adj, communities), + "communities": community_summaries, + "god_nodes": gods, + "surprising": surprising, + "questions": questions, + } + + +def refresh_analysis(conn: sqlite3.Connection) -> dict: + """Compute and cache the analysis summary into meta['graph_analysis'].""" + summary = analyze(conn) + repo.set_meta(conn, ANALYSIS_META_KEY, json.dumps(summary, ensure_ascii=False)) + return summary + + +def load_analysis(conn: sqlite3.Connection) -> Optional[dict]: + """Read the cached analysis summary, or None if the build never produced one.""" + raw = repo.get_meta(conn, ANALYSIS_META_KEY) + if not raw: + return None + try: + return json.loads(raw) + except (ValueError, TypeError): + return None diff --git a/src/codebase_index/graph/builder.py b/src/codebase_index/graph/builder.py index f2e342b..4547883 100644 --- a/src/codebase_index/graph/builder.py +++ b/src/codebase_index/graph/builder.py @@ -26,7 +26,20 @@ def build_graph(conn: sqlite3.Connection) -> dict[str, int]: resolved = resolve_edges(conn) repo.recompute_degrees(conn) + # Everything still unresolved that names a target is, by definition, a target we + # could not pin to a unique node — record it as 'ambiguous' for the honesty trail. + repo.mark_ambiguous_edges(conn) total_unresolved = len(repo.unresolved_edges(conn)) + # Architecture analytics (communities / god nodes / surprising bridges) are a + # derived view of the graph. Compute once per build and cache the JSON in meta so + # the `architecture` command and the HTML export read it instantly. 
Never let an + # analysis failure fail the build — the graph itself is already written. + try: + from . import analysis + + analysis.refresh_analysis(conn) + except Exception: # pragma: no cover - defensive; analytics are best-effort + pass return {"resolved": resolved, "unresolved": total_unresolved} @@ -38,17 +51,20 @@ def resolve_edges(conn: sqlite3.Connection) -> int: unique_symbols = repo.unique_symbol_ids_by_name(conn) suffix_map = _path_suffix_map(repo.all_file_ids_with_paths(conn)) - resolutions: list[tuple[str, int, int]] = [] + # (dst_kind, dst_id, edge_id, confidence). A repo-unique symbol name is an exact + # hit -> 'extracted'; an import resolved only by path-suffix matching is a best- + # effort heuristic -> 'inferred'. + resolutions: list[tuple[str, int, int, str]] = [] for edge in edges: name = edge["dst_name"] if edge["edge_type"] == "import": file_id = _module_to_file_id(suffix_map, name, lang=edge["lang"]) if file_id is not None: - resolutions.append(("file", file_id, edge["id"])) + resolutions.append(("file", file_id, edge["id"], "inferred")) elif edge["edge_type"] in _SYMBOL_EDGE_TYPES: sym_id = unique_symbols.get(name) if sym_id is not None: - resolutions.append(("symbol", sym_id, edge["id"])) + resolutions.append(("symbol", sym_id, edge["id"], "extracted")) repo.resolve_edges_bulk(conn, resolutions) return len(resolutions) diff --git a/src/codebase_index/graph/expand.py b/src/codebase_index/graph/expand.py index deebed1..02ee989 100644 --- a/src/codebase_index/graph/expand.py +++ b/src/codebase_index/graph/expand.py @@ -51,14 +51,14 @@ def _seed_nodes(conn: sqlite3.Connection, target: str) -> list[tuple[str, int]]: def _neighbors(conn, kind, node_id, direction): - """Yield (next_kind, next_id, edge_type) for the requested direction(s).""" + """Yield (next_kind, next_id, edge_type, confidence) for the requested direction(s).""" if direction in ("up", "both"): for e in repo.incoming_edges(conn, kind, node_id): - yield e["src_kind"], 
int(e["src_id"]), e["edge_type"] + yield e["src_kind"], int(e["src_id"]), e["edge_type"], e["confidence"] if direction in ("down", "both"): for e in repo.outgoing_edges(conn, kind, node_id): if e["dst_id"] is not None: - yield e["dst_kind"], int(e["dst_id"]), e["edge_type"] + yield e["dst_kind"], int(e["dst_id"]), e["edge_type"], e["confidence"] def _node_meta(conn, kind, node_id) -> Optional[ImpactNode]: @@ -92,7 +92,7 @@ def walk_impact( kind, node_id, dist = queue.popleft() if dist >= depth: continue - for nk, nid, etype in _neighbors(conn, kind, node_id, direction): + for nk, nid, etype, conf in _neighbors(conn, kind, node_id, direction): if (nk, nid) in visited: continue visited.add((nk, nid)) @@ -101,6 +101,7 @@ def walk_impact( continue meta.distance = dist + 1 meta.via_edge = etype + meta.via_confidence = conf out.append(meta) queue.append((nk, nid, dist + 1)) return out diff --git a/src/codebase_index/models.py b/src/codebase_index/models.py index b35d959..b87ec79 100644 --- a/src/codebase_index/models.py +++ b/src/codebase_index/models.py @@ -113,6 +113,9 @@ class RefSite(BaseModel): path: str line: int kind: str + # Audit trail (see edges.confidence): 'extracted' = exact match, 'inferred' = + # heuristic, 'ambiguous' = unresolved/non-unique. Defaults keep older callers valid. + confidence: str = "extracted" class RefsResponse(BaseModel): @@ -129,6 +132,7 @@ class ImpactNode(BaseModel): line_start: Optional[int] = None distance: int # BFS hops from the target (1 = direct) via_edge: Optional[str] = None # edge_type that linked it (import|call|extends|...) 
+ via_confidence: Optional[str] = None # confidence of the linking edge (audit trail) class ImpactResponse(BaseModel): diff --git a/src/codebase_index/output/markdown.py b/src/codebase_index/output/markdown.py index afd6a76..f7b4151 100644 --- a/src/codebase_index/output/markdown.py +++ b/src/codebase_index/output/markdown.py @@ -130,6 +130,15 @@ def _coverage_line(coverage) -> Optional[str]: return None +# Audit-trail glyphs: an exact edge needs no annotation; inferred/ambiguous ones +# warn the reader that the link is a heuristic or could not be pinned down. +_CONF_MARK = {"extracted": "", "inferred": "~ inferred", "ambiguous": "? ambiguous"} + + +def _conf_mark(confidence: Optional[str]) -> str: + return _CONF_MARK.get(confidence or "extracted", confidence or "") + + def render_refs(resp: RefsResponse) -> str: lines = [_header(resp.query, resp.index.exists, resp.index.stale)] lines.append("") @@ -140,10 +149,12 @@ def render_refs(resp: RefsResponse) -> str: lines.append(note) return "\n".join(lines).rstrip() + "\n" - lines.append("| kind | path | line |") - lines.append("|------|------|------|") + lines.append("| kind | path | line | confidence |") + lines.append("|------|------|------|------------|") for site in resp.sites: - lines.append(f"| {site.kind} | `{site.path}` | {site.line} |") + lines.append( + f"| {site.kind} | `{site.path}` | {site.line} | {_conf_mark(site.confidence) or 'exact'} |" + ) if note: lines.append(note) return "\n".join(lines).rstrip() + "\n" @@ -171,7 +182,9 @@ def render_impact(resp: ImpactResponse) -> str: for n in sorted(resp.nodes, key=lambda x: (x.distance, x.path, x.line_start or 0)): loc = f"{n.path}:{n.line_start}" if n.line_start else n.path node_name = f"`{n.name}`" if n.name else "—" - lines.append(f"| {n.distance} | {n.via_edge or ''} | {n.kind} | {node_name} | `{loc}` |") + mark = _conf_mark(n.via_confidence) + via = f"{n.via_edge or ''} {mark}".strip() + lines.append(f"| {n.distance} | {via} | {n.kind} | {node_name} | 
`{loc}` |") if note: lines.append(note) return "\n".join(lines).rstrip() + "\n" diff --git a/src/codebase_index/retrieval/searchers.py b/src/codebase_index/retrieval/searchers.py index 138bf80..d82aca5 100644 --- a/src/codebase_index/retrieval/searchers.py +++ b/src/codebase_index/retrieval/searchers.py @@ -234,11 +234,17 @@ def symbol_lookup( def refs_lookup(conn: sqlite3.Connection, name: str, *, kind: str) -> RefsResponse: defs = repo.symbols_by_name(conn, name, exact=True) sites = [ - RefSite(path=row["path"], line=row["line"], kind="call") + RefSite( + path=row["path"], + line=row["line"], + kind="call", + confidence=row["confidence"] if "confidence" in row.keys() else "extracted", + ) for row in repo.refs_for_name(conn, name) ] if kind == "all": sites.extend( + # A definition is the symbol itself — exact by construction. RefSite(path=row["path"], line=row["line_start"], kind="definition") for row in defs ) diff --git a/src/codebase_index/storage/db.py b/src/codebase_index/storage/db.py index 60abbcd..7cb1da1 100644 --- a/src/codebase_index/storage/db.py +++ b/src/codebase_index/storage/db.py @@ -8,7 +8,8 @@ from typing import Optional # 2: chunks gained a denormalized `symbol_names` column (FTS symbol-name boost). -SCHEMA_VERSION = 2 +# 3: edges gained a `confidence` column (extracted/inferred/ambiguous audit trail). 
+SCHEMA_VERSION = 3 class Database: diff --git a/src/codebase_index/storage/repo.py b/src/codebase_index/storage/repo.py index 77a54cd..6bef9eb 100644 --- a/src/codebase_index/storage/repo.py +++ b/src/codebase_index/storage/repo.py @@ -253,11 +253,15 @@ def replace_edges( conn.executemany( """ INSERT INTO edges - (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line, resolved) + (edge_type, src_kind, src_id, dst_kind, dst_id, dst_name, file_id, line, + resolved, confidence) VALUES - (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line, :resolved) + (:edge_type, :src_kind, :src_id, :dst_kind, :dst_id, :dst_name, :file_id, :line, + :resolved, :confidence) """, - [{**edge, "file_id": file_id} for edge in edges], + # confidence defaults to 'extracted' for callers (and tests) that predate the + # audit-trail column; the global graph pass refines it (see graph/builder.py). + [{"confidence": "extracted", **edge, "file_id": file_id} for edge in edges], ) return len(edges) @@ -271,6 +275,7 @@ def refs_for_name(conn: sqlite3.Connection, name: str) -> list[sqlite3.Row]: """ SELECT e.line AS line, f.path AS path, e.edge_type AS edge_type, e.resolved AS resolved, e.src_id AS src_id, e.src_kind AS src_kind, + e.confidence AS confidence, src.name AS src_name, src.qualified AS src_qualified FROM edges e JOIN files f ON f.id = e.file_id @@ -388,15 +393,59 @@ def resolve_edge(conn: sqlite3.Connection, edge_id: int, dst_kind: str, dst_id: def resolve_edges_bulk( - conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int]] + conn: sqlite3.Connection, resolutions: Sequence[tuple[str, int, int, str]] ) -> None: - """Apply (dst_kind, dst_id, edge_id) resolutions in one executemany.""" + """Apply (dst_kind, dst_id, edge_id, confidence) resolutions in one executemany. + + confidence records *how* the target was found: 'extracted' for an exact match + (a repo-unique symbol name), 'inferred' for a heuristic (import path-suffix). 
+ """ conn.executemany( - "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1 WHERE id = ?", - resolutions, + "UPDATE edges SET dst_kind = ?, dst_id = ?, resolved = 1, confidence = ? WHERE id = ?", + [(dst_kind, dst_id, confidence, edge_id) for dst_kind, dst_id, edge_id, confidence in resolutions], ) +def mark_ambiguous_edges(conn: sqlite3.Connection) -> int: + """Flag every still-unresolved edge that names a target as 'ambiguous'. + + Run after the global resolution pass: an edge with a dst_name that no unique + symbol/file claims is one we could not pin down (a non-unique name, or an import + of code outside the repo). Marking it keeps refs/impact honest — an empty or + short answer over ambiguous edges is inconclusive, not proof of "no callers". + """ + cur = conn.execute( + "UPDATE edges SET confidence = 'ambiguous' " + "WHERE resolved = 0 AND dst_name IS NOT NULL AND confidence != 'ambiguous'" + ) + return cur.rowcount if cur.rowcount is not None else 0 + + +def all_resolved_edges(conn: sqlite3.Connection) -> list[sqlite3.Row]: + """Every resolved edge as (src_kind, src_id, dst_kind, dst_id, edge_type, confidence). + + The in-memory adjacency the graph analysis (communities / god nodes / bridges) + is built from. Unresolved edges are skipped — they have no concrete endpoint. 
+ """ + return conn.execute( + "SELECT src_kind, src_id, dst_kind, dst_id, edge_type, confidence FROM edges " + "WHERE resolved = 1 AND dst_id IS NOT NULL" + ).fetchall() + + +def all_graph_nodes(conn: sqlite3.Connection) -> dict[str, list[sqlite3.Row]]: + """File and symbol rows keyed by kind, for labelling graph-analysis nodes.""" + return { + "file": conn.execute("SELECT id, path FROM files").fetchall(), + "symbol": conn.execute( + "SELECT s.id AS id, s.name AS name, s.kind AS kind, f.path AS path, " + " s.line_start AS line_start, " + " s.in_degree AS in_degree, s.out_degree AS out_degree " + "FROM symbols s JOIN files f ON f.id = s.file_id" + ).fetchall(), + } + + def name_ref_counts(conn: sqlite3.Connection, names: Sequence[str]) -> dict[str, int]: """Count edges targeting each name (any resolution state), keyed by dst_name. @@ -458,7 +507,7 @@ def symbols_in_file(conn: sqlite3.Connection, file_id: int) -> list[sqlite3.Row] def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]: return conn.execute( - "SELECT id, edge_type, src_kind, src_id, file_id, line FROM edges " + "SELECT id, edge_type, src_kind, src_id, file_id, line, confidence FROM edges " "WHERE resolved = 1 AND dst_kind = ? AND dst_id = ?", (kind, node_id), ).fetchall() @@ -466,7 +515,7 @@ def incoming_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sq def outgoing_edges(conn: sqlite3.Connection, kind: str, node_id: int) -> list[sqlite3.Row]: return conn.execute( - "SELECT id, edge_type, dst_kind, dst_id, file_id, line FROM edges " + "SELECT id, edge_type, dst_kind, dst_id, file_id, line, confidence FROM edges " "WHERE resolved = 1 AND src_kind = ? 
AND src_id = ?", (kind, node_id), ).fetchall() diff --git a/src/codebase_index/storage/schema.sql b/src/codebase_index/storage/schema.sql index 10bde93..bed17bb 100644 --- a/src/codebase_index/storage/schema.sql +++ b/src/codebase_index/storage/schema.sql @@ -63,7 +63,13 @@ CREATE TABLE IF NOT EXISTS edges ( dst_name TEXT, file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, line INTEGER, - resolved INTEGER NOT NULL DEFAULT 0 + resolved INTEGER NOT NULL DEFAULT 0, + -- Honesty audit trail (see docs/SCHEMA.md). How sure are we this edge points + -- where it claims? 'extracted' = exact match (same-file symbol or a repo-unique + -- name); 'inferred' = a heuristic resolved it (import path-suffix); 'ambiguous' + -- = a name/import we could not pin to a unique target. Set at build time by the + -- global graph pass; never guessed by an LLM (the index is fully local). + confidence TEXT NOT NULL DEFAULT 'extracted' ); CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_kind, src_id); CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_kind, dst_id); diff --git a/tests/golden/impact_user_model.json b/tests/golden/impact_user_model.json index 9ec0c77..cb8d033 100644 --- a/tests/golden/impact_user_model.json +++ b/tests/golden/impact_user_model.json @@ -23,6 +23,7 @@ "line_start": null, "name": null, "path": "src/api/service.py", + "via_confidence": "inferred", "via_edge": "import" }, { @@ -31,6 +32,7 @@ "line_start": 7, "name": "AdminUser", "path": "src/api/service.py", + "via_confidence": "extracted", "via_edge": "extends" } ], diff --git a/tests/golden/mcp_find_refs.json b/tests/golden/mcp_find_refs.json index abdd727..13208fa 100644 --- a/tests/golden/mcp_find_refs.json +++ b/tests/golden/mcp_find_refs.json @@ -15,16 +15,19 @@ "schema_version": 1, "sites": [ { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/api/service.py" }, { + "confidence": "extracted", "kind": "definition", "line": 4, "path": "src/auth/token.py" }, { + 
"confidence": "extracted", "kind": "call", "line": 11, "path": "src/auth/token.py" diff --git a/tests/golden/mcp_impact_of.json b/tests/golden/mcp_impact_of.json index 5fc14dc..1629921 100644 --- a/tests/golden/mcp_impact_of.json +++ b/tests/golden/mcp_impact_of.json @@ -23,6 +23,7 @@ "line_start": null, "name": null, "path": "src/api/service.py", + "via_confidence": "inferred", "via_edge": "import" }, { @@ -31,6 +32,7 @@ "line_start": 7, "name": "AdminUser", "path": "src/api/service.py", + "via_confidence": "extracted", "via_edge": "extends" } ], diff --git a/tests/golden/refs_refresh_access_token.json b/tests/golden/refs_refresh_access_token.json index f5693df..1ce827e 100644 --- a/tests/golden/refs_refresh_access_token.json +++ b/tests/golden/refs_refresh_access_token.json @@ -14,16 +14,19 @@ "query": "refresh_access_token", "sites": [ { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/api/service.py" }, { + "confidence": "extracted", "kind": "definition", "line": 4, "path": "src/auth/token.py" }, { + "confidence": "extracted", "kind": "call", "line": 11, "path": "src/auth/token.py" diff --git a/tests/test_analysis.py b/tests/test_analysis.py new file mode 100644 index 0000000..60eb90a --- /dev/null +++ b/tests/test_analysis.py @@ -0,0 +1,194 @@ +"""Tests for graph.analysis — communities / god nodes / surprising bridges. + +The pure-Python graph functions are deterministic, so the assertions pin exact +structure (two cliques joined by one bridge → two communities + one surprising +link) rather than fuzzy thresholds. 
+""" + +from __future__ import annotations + +from codebase_index.config import Config +from codebase_index.graph import analysis +from codebase_index.indexer.pipeline import build_index +from codebase_index.parsers.base import Symbol +from codebase_index.storage import repo +from codebase_index.storage.db import Database + + +# --- pure-Python graph algorithms (no DB) ------------------------------------- + +def _two_cliques_with_bridge(): + """Two triangles (A0-A1-A2) and (B0-B1-B2) joined by a single A0-B0 edge.""" + edges = [] + + def edge(s, d): + return {"src_kind": "symbol", "src_id": s, "dst_kind": "symbol", "dst_id": d} + + # clique A: ids 0,1,2 ; clique B: ids 10,11,12 + for a, b in [(0, 1), (1, 2), (0, 2)]: + edges.append(edge(a, b)) + for a, b in [(10, 11), (11, 12), (10, 12)]: + edges.append(edge(a, b)) + edges.append(edge(0, 10)) # the bridge + return edges + + +def test_detect_communities_splits_two_cliques(): + adj, _ = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + # All of clique A share one label; all of clique B share another; they differ. + a_labels = {comm[("symbol", i)] for i in (0, 1, 2)} + b_labels = {comm[("symbol", i)] for i in (10, 11, 12)} + assert len(a_labels) == 1 + assert len(b_labels) == 1 + assert a_labels != b_labels + + +def test_modularity_is_positive_for_clear_structure(): + adj, _ = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + assert analysis.modularity(adj, comm) > 0.0 + + +def test_god_nodes_rank_by_degree(): + # Make node 0 a hub: connect it to many leaves. 
+ edges = [ + {"src_kind": "symbol", "src_id": 0, "dst_kind": "symbol", "dst_id": leaf} + for leaf in range(1, 6) + ] + adj, _ = analysis.build_adjacency(edges) + comm = analysis.detect_communities(adj) + node_index = { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/x.py"} + for i in range(6) + } + gods = analysis.god_nodes(adj, comm, node_index, limit=3) + assert gods[0]["name"] == "sym0" + assert gods[0]["degree"] == 5 + + +def test_surprising_connection_finds_the_bridge(): + adj, edge_weight = analysis.build_adjacency(_two_cliques_with_bridge()) + comm = analysis.detect_communities(adj) + node_index = { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/a.py"} + for i in (0, 1, 2) + } + node_index.update( + { + ("symbol", i): {"kind": "symbol", "name": f"sym{i}", "path": "src/b.py"} + for i in (10, 11, 12) + } + ) + surprising = analysis.surprising_connections(edge_weight, comm, node_index) + assert len(surprising) == 1 + names = {surprising[0]["from"]["name"], surprising[0]["to"]["name"]} + assert names == {"sym0", "sym10"} + assert surprising[0]["edge_count"] == 1 + + +def test_label_community_uses_dominant_directory(): + node_index = { + ("symbol", 1): {"kind": "symbol", "name": "a", "path": "src/auth/token.py"}, + ("symbol", 2): {"kind": "symbol", "name": "b", "path": "src/auth/login.py"}, + ("symbol", 3): {"kind": "symbol", "name": "c", "path": "src/db/conn.py"}, + } + label = analysis.label_community([("symbol", 1), ("symbol", 2), ("symbol", 3)], node_index) + assert label == "src/auth" + + +def test_label_community_discounts_test_paths(): + # Two production symbols in src/storage and three test files that exercise them: + # tests outnumber prod, but the module should still be named for the prod code. 
+ node_index = { + ("symbol", 1): {"kind": "symbol", "name": "a", "path": "src/storage/db.py"}, + ("symbol", 2): {"kind": "symbol", "name": "b", "path": "src/storage/repo.py"}, + ("symbol", 3): {"kind": "symbol", "name": "t1", "path": "tests/test_db.py"}, + ("symbol", 4): {"kind": "symbol", "name": "t2", "path": "tests/test_repo.py"}, + ("symbol", 5): {"kind": "symbol", "name": "t3", "path": "tests/test_x.py"}, + } + members = [("symbol", i) for i in range(1, 6)] + assert analysis.label_community(members, node_index) == "src/storage" + # A community that is *only* tests still gets named for them. + only_tests = [("symbol", i) for i in (3, 4, 5)] + assert analysis.label_community(only_tests, node_index) == "tests" + + +def test_suggest_questions_seeds_from_structure(): + gods = [{"kind": "symbol", "name": "Engine", "path": "x", "degree": 9, "community": 0}] + surprising = [ + { + "from": {"kind": "symbol", "name": "a", "path": "x"}, + "to": {"kind": "symbol", "name": "b", "path": "y"}, + "from_community": 0, + "to_community": 1, + "edge_count": 1, + } + ] + qs = analysis.suggest_questions(gods, surprising, {0: "core", 1: "io"}) + assert any("Engine" in q for q in qs) + assert any("core" in q and "io" in q for q in qs) + + +# --- integration against a real built index ----------------------------------- + +def _seed_two_modules(db: Database) -> None: + """auth module (token<-login) and db module (exec<-query), bridged login->query.""" + auth = repo.upsert_file( + db.conn, path="src/auth/token.py", lang="python", size_bytes=1, sha256="a", + mtime_ns=1, git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + db_f = repo.upsert_file( + db.conn, path="src/db/query.py", lang="python", size_bytes=1, sha256="b", + mtime_ns=1, git_status=None, parser="treesitter", indexed_at="t", is_generated=False, + ) + a = repo.replace_symbols(db.conn, auth, [ + Symbol(name="make_token", kind="function", line_start=1, line_end=2), + Symbol(name="login",
kind="function", line_start=3, line_end=4), + ]) + b = repo.replace_symbols(db.conn, db_f, [ + Symbol(name="run_query", kind="function", line_start=1, line_end=2), + Symbol(name="exec_stmt", kind="function", line_start=3, line_end=4), + ]) + repo.replace_edges(db.conn, auth, [ + {"edge_type": "call", "src_kind": "symbol", "src_id": a[1], + "dst_kind": None, "dst_id": None, "dst_name": "make_token", "line": 3, "resolved": 0}, + {"edge_type": "call", "src_kind": "symbol", "src_id": a[1], + "dst_kind": None, "dst_id": None, "dst_name": "run_query", "line": 4, "resolved": 0}, + ]) + repo.replace_edges(db.conn, db_f, [ + {"edge_type": "call", "src_kind": "symbol", "src_id": b[0], + "dst_kind": None, "dst_id": None, "dst_name": "exec_stmt", "line": 2, "resolved": 0}, + ]) + + +def test_analyze_and_cache_roundtrip(tmp_path): + from codebase_index.graph.builder import build_graph + + db = Database(tmp_path / "index.sqlite").open() + _seed_two_modules(db) + build_graph(db.conn) # resolves edges + refresh_analysis caches the summary + + cached = analysis.load_analysis(db.conn) + assert cached is not None + assert cached["node_count"] > 0 + assert cached["god_nodes"], "expected at least one god node" + # Recomputing directly matches the cached summary's headline numbers. 
+ fresh = analysis.analyze(db.conn) + assert fresh["node_count"] == cached["node_count"] + assert fresh["edge_count"] == cached["edge_count"] + db.close() + + +def test_analyze_on_sample_repo(sample_repo, tmp_path): + cfg = Config() + cfg.root = str(sample_repo) + db = Database(tmp_path / "index.sqlite").open() + build_index(cfg, db, root=sample_repo) + + summary = analysis.load_analysis(db.conn) + assert summary is not None + assert summary["node_count"] >= 1 + assert isinstance(summary["communities"], list) + assert isinstance(summary["questions"], list) + db.close() diff --git a/tests/test_graph.py b/tests/test_graph.py index f8e5110..f18c0ba 100644 --- a/tests/test_graph.py +++ b/tests/test_graph.py @@ -63,6 +63,22 @@ def test_build_graph_resolves_symbol_and_import_edges(tmp_path): db.close() +def test_build_graph_sets_edge_confidence(tmp_path): + db = _db(tmp_path) + _seed(db) + build_graph(db.conn) + conf = { + (r["edge_type"], r["dst_name"]): r["confidence"] + for r in db.conn.execute("SELECT edge_type, dst_name, confidence FROM edges") + } + # exact unique-name symbol match, import resolved by path-suffix heuristic, + # and a callee no symbol defines. + assert conf[("call", "refresh_access_token")] == "extracted" + assert conf[("import", "auth.token")] == "inferred" + assert conf[("call", "does_not_exist")] == "ambiguous" + db.close() + + def _file(db, path, sha="x"): return repo.upsert_file( db.conn, path=path, lang="python", size_bytes=1, sha256=sha,