From 966e6cd035255d316f7aca62d0b9bf5f4c117cf7 Mon Sep 17 00:00:00 2001 From: denfry Date: Tue, 23 Jun 2026 23:18:14 +0300 Subject: [PATCH] feat(architecture): `architecture` command + `architecture_overview` MCP tool Phase 2 of porting graphify into codebase-index. Surfaces the analytics cached in Phase 1 as a high-level codebase map. - `codebase-index architecture` (+ --json): modules with auto-labels, god nodes, surprising cross-module bridges, suggested questions; reads the cached summary (no recompute). Reports "unavailable" + reindex hint on a pre-analytics index. - MCP tool `architecture_overview` exposes the same payload via the stable envelope; added to server instructions. - service.architecture_payload (shared by CLI + MCP) + render_architecture. - Tests: payload available/unavailable paths; CLI + MCP goldens regenerated. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 10 ++ README.md | 6 +- docs/MCP.md | 6 +- src/codebase_index/cli.py | 22 ++++ src/codebase_index/mcp/server.py | 22 +++- src/codebase_index/output/markdown.py | 56 ++++++++++ src/codebase_index/service.py | 27 +++++ tests/golden/architecture.json | 151 +++++++++++++++++++++++++ tests/golden/mcp_architecture.json | 153 ++++++++++++++++++++++++++ tests/test_analysis.py | 35 ++++++ tests/test_cli_golden.py | 1 + tests/test_mcp_golden.py | 2 + tests/test_mcp_server.py | 2 + 13 files changed, 489 insertions(+), 4 deletions(-) create mode 100644 tests/golden/architecture.json create mode 100644 tests/golden/mcp_architecture.json diff --git a/CHANGELOG.md b/CHANGELOG.md index b50d274..02f1bb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,16 @@ All notable changes to this project are documented here. The format is based on ## [Unreleased] +### Added — `architecture` command + `architecture_overview` MCP tool +- **`codebase-index architecture`** prints a high-level map of the codebase from + the analytics cached at index time: detected modules (with auto-derived labels), + god nodes (most-connected symbols/files), surprising cross-module connections, + and suggested starting questions. `--json` for the structured payload. +- **`architecture_overview` MCP tool** exposes the same map to MCP clients, so an + agent can orient itself before diving into specifics. Reports + `available: false` (rather than crashing) on an index built before the analytics + existed; a reindex fixes it. + ### Added — graph foundation: edge confidence + architecture analytics (requires a one-time reindex) - **Edge confidence audit trail.** Every graph edge now carries a `confidence`: `extracted` (exact — a same-file symbol or repo-unique name), `inferred` (a diff --git a/README.md b/README.md index 363d414..1fc48f6 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,8 @@ See [CHANGELOG.md](CHANGELOG.md) and MCP is now available as a stdio server via `codebase-index mcp --root `. It exposes `healthcheck`, `search_code`, `find_symbol`, `find_refs`, -`impact_of`, `explain_code`, and `index_stats`; see [docs/MCP.md](docs/MCP.md). +`impact_of`, `explain_code`, `architecture_overview`, and `index_stats`; +see [docs/MCP.md](docs/MCP.md). ``` You: "Where is user authentication implemented?" @@ -387,6 +388,9 @@ codebase-index refs "AuthService.login" # Analyze impact of a change codebase-index impact "src/auth/AuthService.ts" +# Map the codebase: modules, god nodes, surprising links, suggested questions +codebase-index architecture + # View index statistics codebase-index stats diff --git a/docs/MCP.md b/docs/MCP.md index d56c675..c68ced9 100644 --- a/docs/MCP.md +++ b/docs/MCP.md @@ -37,6 +37,7 @@ The MCP server exposes the same retrieval contract as the CLI. | `find_refs` | Return callers/references for a symbol | `refs` | | `impact_of` | Return affected files/symbols from graph expansion | `impact` | | `explain_code` | Intent-aware retrieval packet for a natural-language question | `explain` | +| `architecture_overview` | Modules, god nodes, surprising connections, suggested questions | `architecture` | | `index_stats` | Return counts, language coverage, graph stats, freshness | `stats` | ## Output contract @@ -64,7 +65,8 @@ branch on the contract without sniffing the shape: breaking change (field removal or type change); additive fields keep the same version. The current version is **1**. - `tool` (string) — the emitting tool name (`search_code`, `find_symbol`, - `find_refs`, `impact_of`, `explain_code`, `index_stats`, `healthcheck`). + `find_refs`, `impact_of`, `explain_code`, `architecture_overview`, + `index_stats`, `healthcheck`). - The no-index / error path carries the same envelope plus an `"error"` field. Rules: @@ -156,7 +158,7 @@ same trust boundaries: - Done: `src/codebase_index/mcp/server.py` thin adapter over retrieval/storage code. - Done: `codebase-index mcp --root ` CLI entrypoint. - Done: `healthcheck`, `search_code`, `find_symbol`, `find_refs`, `impact_of`, `explain_code`, - and `index_stats` tools. + `architecture_overview`, and `index_stats` tools. - Done: focused tests for tool registration, missing-index behavior, config resolution, and run entrypoint. - Done: explicit `schema_version` + `tool` envelope on every structured tool payload (including the error path), asserted by `tests/test_mcp_server.py` and `tests/test_mcp_golden.py`. diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py index 692158a..ff7e66c 100644 --- a/src/codebase_index/cli.py +++ b/src/codebase_index/cli.py @@ -499,6 +499,28 @@ def explain( typer.echo(json_renderer.render(payload) if want_json else md_renderer.render(payload)) +@app.command("architecture") +def architecture( + ctx: typer.Context, + json_flag: bool = typer.Option(False, "--json", help="Emit machine-readable JSON."), +) -> None: + """High-level map of the codebase: modules, god nodes, surprising links, questions. + + Reads the analytics cached at index time (no recompute). Rebuild the index if it + reports no analysis available. + """ + from .output import json as json_renderer + from .output import markdown as md_renderer + from .service import architecture_payload + + is_json = json_flag or bool(ctx.obj and ctx.obj.get("json")) + db_path, cfg = _ensure_index(ctx) + payload = architecture_payload(db_path, cfg) + typer.echo( + json_renderer.render(payload) if is_json else md_renderer.render_architecture(payload) + ) + + @app.command("graph") def graph_view( ctx: typer.Context, diff --git a/src/codebase_index/mcp/server.py b/src/codebase_index/mcp/server.py index ae16c07..203423c 100644 --- a/src/codebase_index/mcp/server.py +++ b/src/codebase_index/mcp/server.py @@ -41,7 +41,8 @@ instructions=( "Local codebase index. Use search_code for general queries, find_symbol for exact " "symbol lookups, find_refs to find callers/usages, impact_of for blast-radius analysis, " - "and explain_code for architecture/how-it-works questions." + "explain_code for architecture/how-it-works questions, and architecture_overview to map " + "the codebase's modules, god nodes, and surprising connections before diving in." ), ) @@ -263,6 +264,25 @@ def explain_code( return _emit("explain_code", payload) +@_tool() +def architecture_overview() -> str: + """High-level map of the codebase from the cached graph analytics. + + Returns the detected modules (communities), god nodes (most-connected + symbols/files), surprising cross-module connections, and suggested starting + questions. Use this to orient before diving into specifics. Rebuild the index + if it reports ``available: false``. + """ + db_path, cfg = _resolve_db() + if not db_path.exists(): + return _emit("architecture_overview", _no_index_payload()) + + from ..service import architecture_payload + + payload = architecture_payload(db_path, cfg) + return _emit("architecture_overview", payload) + + @_tool() def index_stats() -> str: """Return index freshness, file count, symbol count, and per-language coverage.""" diff --git a/src/codebase_index/output/markdown.py b/src/codebase_index/output/markdown.py index f7b4151..50245e0 100644 --- a/src/codebase_index/output/markdown.py +++ b/src/codebase_index/output/markdown.py @@ -160,6 +160,62 @@ def render_refs(resp: RefsResponse) -> str: return "\n".join(lines).rstrip() + "\n" +def render_architecture(payload: dict) -> str: + """Render the architecture overview: modules, god nodes, surprising links, questions.""" + if not payload.get("available", False): + reason = payload.get("reason", "No architecture analysis available.") + return f"_{reason}_\n" + + idx = payload.get("index", {}) + freshness = "fresh" if not idx.get("stale") else "STALE" + lines = [ + f"**Architecture overview** | **index:** {freshness} | " + f"{payload.get('node_count', 0)} nodes · {payload.get('edge_count', 0)} edges · " + f"{payload.get('community_count', 0)} modules · modularity {payload.get('modularity', 0)}", + "", + ] + + communities = payload.get("communities", []) + if communities: + lines.append("### Modules") + lines.append("| # | module | size | key nodes |") + lines.append("|---|--------|------|-----------|") + for c in communities: + tops = ", ".join(f"`{t['name']}`" for t in c.get("top_nodes", [])[:4]) + lines.append(f"| {c['id']} | {c['label']} | {c['size']} | {tops} |") + lines.append("") + + gods = payload.get("god_nodes", []) + if gods: + lines.append("### God nodes (most-connected)") + lines.append("| node | kind | degree | location |") + lines.append("|------|------|--------|----------|") + for g in gods: + loc = g.get("path") or "" + lines.append(f"| `{g['name']}` | {g['kind']} | {g['degree']} | `{loc}` |") + lines.append("") + + surprising = payload.get("surprising", []) + if surprising: + lines.append("### Surprising connections (cross-module bridges)") + for s in surprising: + fr, to = s["from"], s["to"] + lines.append( + f"- `{fr['name']}` ({fr.get('path') or '?'}) ↔ " + f"`{to['name']}` ({to.get('path') or '?'}) — {s['edge_count']} edge(s)" + ) + lines.append("") + + questions = payload.get("questions", []) + if questions: + lines.append("### Suggested questions") + for q in questions: + lines.append(f"- {q}") + lines.append("") + + return "\n".join(lines).rstrip() + "\n" + + def _header(query: str, exists: bool, stale: bool) -> str: freshness = "fresh" if not stale else "STALE" if not exists: diff --git a/src/codebase_index/service.py b/src/codebase_index/service.py index 5e7981e..b481bf2 100644 --- a/src/codebase_index/service.py +++ b/src/codebase_index/service.py @@ -98,6 +98,33 @@ def search_payload( ) +def architecture_payload(db_path: Path, cfg: "Config") -> dict[str, Any]: + """The cached architecture analytics (communities / god nodes / surprising / + questions) plus index freshness — the payload both CLI and MCP serialize. + + Returns ``available: False`` when no analysis is cached (an index built before + this feature, or an empty graph); the caller tells the user to reindex. + """ + from .graph import analysis + from .indexer.freshness import compute_freshness + from .storage.db import Database + + with Database(db_path) as db: + fresh = compute_freshness(db.conn, Path(cfg.root), cfg) + summary = analysis.load_analysis(db.conn) + if summary is None: + return { + "exists": True, + "available": False, + "reason": ( + "No architecture analysis cached. Rebuild the index " + "(`codebase-index index`) to compute it." + ), + "index": fresh.model_dump(), + } + return {"exists": True, "available": True, "index": fresh.model_dump(), **summary} + + def stats_payload(conn: sqlite3.Connection) -> dict[str, Any]: """Index size, freshness, and per-language coverage with the graph tier.""" from .parsers.languages import has_full_graph diff --git a/tests/golden/architecture.json b/tests/golden/architecture.json new file mode 100644 index 0000000..accd339 --- /dev/null +++ b/tests/golden/architecture.json @@ -0,0 +1,151 @@ +{ + "available": true, + "communities": [ + { + "id": 0, + "label": "src/api", + "size": 3, + "top_nodes": [ + { + "degree": 2, + "kind": "file", + "name": "service.py", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "file", + "name": "token.py", + "path": "src/auth/token.py" + }, + { + "degree": 1, + "kind": "file", + "name": "user.py", + "path": "src/models/user.py" + } + ] + }, + { + "id": 1, + "label": "src/auth", + "size": 3, + "top_nodes": [ + { + "degree": 2, + "kind": "symbol", + "name": "refresh_access_token", + "path": "src/auth/token.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "renew", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "login", + "path": "src/auth/token.py" + } + ] + }, + { + "id": 2, + "label": "src/api", + "size": 2, + "top_nodes": [ + { + "degree": 1, + "kind": "symbol", + "name": "AdminUser", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "User", + "path": "src/models/user.py" + } + ] + } + ], + "community_count": 3, + "edge_count": 5, + "exists": true, + "god_nodes": [ + { + "community": 0, + "degree": 2, + "kind": "file", + "name": "service.py", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 2, + "kind": "symbol", + "name": "refresh_access_token", + "path": "src/auth/token.py" + }, + { + "community": 0, + "degree": 1, + "kind": "file", + "name": "token.py", + "path": "src/auth/token.py" + }, + { + "community": 0, + "degree": 1, + "kind": "file", + "name": "user.py", + "path": "src/models/user.py" + }, + { + "community": 2, + "degree": 1, + "kind": "symbol", + "name": "AdminUser", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 1, + "kind": "symbol", + "name": "renew", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 1, + "kind": "symbol", + "name": "login", + "path": "src/auth/token.py" + }, + { + "community": 2, + "degree": 1, + "kind": "symbol", + "name": "User", + "path": "src/models/user.py" + } + ], + "index": { + "built_at": "", + "exists": true, + "files_changed_since_build": 0, + "head_commit": "", + "stale": false + }, + "modularity": 0.82, + "node_count": 8, + "questions": [ + "What is the role of `service.py` in the architecture?", + "How does `refresh_access_token` work?", + "What breaks if `refresh_access_token` changes?", + "What is the role of `token.py` in the architecture?" + ], + "surprising": [] +} diff --git a/tests/golden/mcp_architecture.json b/tests/golden/mcp_architecture.json new file mode 100644 index 0000000..7647c88 --- /dev/null +++ b/tests/golden/mcp_architecture.json @@ -0,0 +1,153 @@ +{ + "available": true, + "communities": [ + { + "id": 0, + "label": "src/api", + "size": 3, + "top_nodes": [ + { + "degree": 2, + "kind": "file", + "name": "service.py", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "file", + "name": "token.py", + "path": "src/auth/token.py" + }, + { + "degree": 1, + "kind": "file", + "name": "user.py", + "path": "src/models/user.py" + } + ] + }, + { + "id": 1, + "label": "src/auth", + "size": 3, + "top_nodes": [ + { + "degree": 2, + "kind": "symbol", + "name": "refresh_access_token", + "path": "src/auth/token.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "renew", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "login", + "path": "src/auth/token.py" + } + ] + }, + { + "id": 2, + "label": "src/api", + "size": 2, + "top_nodes": [ + { + "degree": 1, + "kind": "symbol", + "name": "AdminUser", + "path": "src/api/service.py" + }, + { + "degree": 1, + "kind": "symbol", + "name": "User", + "path": "src/models/user.py" + } + ] + } + ], + "community_count": 3, + "edge_count": 5, + "exists": true, + "god_nodes": [ + { + "community": 0, + "degree": 2, + "kind": "file", + "name": "service.py", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 2, + "kind": "symbol", + "name": "refresh_access_token", + "path": "src/auth/token.py" + }, + { + "community": 0, + "degree": 1, + "kind": "file", + "name": "token.py", + "path": "src/auth/token.py" + }, + { + "community": 0, + "degree": 1, + "kind": "file", + "name": "user.py", + "path": "src/models/user.py" + }, + { + "community": 2, + "degree": 1, + "kind": "symbol", + "name": "AdminUser", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 1, + "kind": "symbol", + "name": "renew", + "path": "src/api/service.py" + }, + { + "community": 1, + "degree": 1, + "kind": "symbol", + "name": "login", + "path": "src/auth/token.py" + }, + { + "community": 2, + "degree": 1, + "kind": "symbol", + "name": "User", + "path": "src/models/user.py" + } + ], + "index": { + "built_at": "", + "exists": true, + "files_changed_since_build": 0, + "head_commit": "", + "stale": false + }, + "modularity": 0.82, + "node_count": 8, + "questions": [ + "What is the role of `service.py` in the architecture?", + "How does `refresh_access_token` work?", + "What breaks if `refresh_access_token` changes?", + "What is the role of `token.py` in the architecture?" + ], + "schema_version": 1, + "surprising": [], + "tool": "architecture_overview" +} diff --git a/tests/test_analysis.py b/tests/test_analysis.py index 60eb90a..95fe513 100644 --- a/tests/test_analysis.py +++ b/tests/test_analysis.py @@ -180,6 +180,41 @@ def test_analyze_and_cache_roundtrip(tmp_path): db.close() +def test_architecture_payload_available(tmp_path): + from codebase_index.graph.builder import build_graph + from codebase_index.service import architecture_payload + + path = tmp_path / "index.sqlite" + db = Database(path).open() + _seed_two_modules(db) + build_graph(db.conn) + db.close() + + cfg = Config() + cfg.root = str(tmp_path) + payload = architecture_payload(path, cfg) + assert payload["available"] is True + assert payload["god_nodes"] + assert "index" in payload + + +def test_architecture_payload_unavailable_without_analysis(tmp_path): + from codebase_index.output import markdown + from codebase_index.service import architecture_payload + + # A bare index (schema only, never analysed) reports unavailable, not a crash. + path = tmp_path / "index.sqlite" + Database(path).open().close() + + cfg = Config() + cfg.root = str(tmp_path) + payload = architecture_payload(path, cfg) + assert payload["available"] is False + assert "reason" in payload + rendered = markdown.render_architecture(payload) + assert "No architecture analysis" in rendered + + def test_analyze_on_sample_repo(sample_repo, tmp_path): cfg = Config() cfg.root = str(sample_repo) diff --git a/tests/test_cli_golden.py b/tests/test_cli_golden.py index 608875d..602e51e 100644 --- a/tests/test_cli_golden.py +++ b/tests/test_cli_golden.py @@ -64,6 +64,7 @@ def indexed_repo(tmp_path_factory): ("refs_refresh_access_token", ["refs", "refresh_access_token"]), ("impact_user_model", ["impact", "src/models/user.py", "--direction", "up"]), ("explain_auth", ["explain", "how does authentication work"]), + ("architecture", ["architecture"]), ("stats", ["stats"]), ] diff --git a/tests/test_mcp_golden.py b/tests/test_mcp_golden.py index 20b4da8..a134051 100644 --- a/tests/test_mcp_golden.py +++ b/tests/test_mcp_golden.py @@ -68,6 +68,7 @@ def _call(indexed_repo, tool_fn, **kwargs): "mcp_find_refs": (lambda: mcp_server.find_refs, {"symbol": "refresh_access_token"}), "mcp_impact_of": (lambda: mcp_server.impact_of, {"target": "src/models/user.py", "direction": "up"}), "mcp_explain_code": (lambda: mcp_server.explain_code, {"query": "how does authentication work"}), + "mcp_architecture": (lambda: mcp_server.architecture_overview, {}), "mcp_index_stats": (lambda: mcp_server.index_stats, {}), } @@ -79,6 +80,7 @@ def _call(indexed_repo, tool_fn, **kwargs): "mcp_find_refs": "find_refs", "mcp_impact_of": "impact_of", "mcp_explain_code": "explain_code", + "mcp_architecture": "architecture_overview", "mcp_index_stats": "index_stats", } diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 063e4c1..418ffab 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -36,6 +36,7 @@ def test_mcp_server_has_expected_tools(): "find_refs", "impact_of", "explain_code", + "architecture_overview", "index_stats", } @@ -69,6 +70,7 @@ def test_search_code_no_index(): "find_refs": lambda: _call(mcp_server.find_refs, symbol="foo"), "impact_of": lambda: _call(mcp_server.impact_of, target="foo.py"), "explain_code": lambda: _call(mcp_server.explain_code, query="how does foo work"), + "architecture_overview": lambda: _call(mcp_server.architecture_overview), "index_stats": lambda: _call(mcp_server.index_stats), }