diff --git a/.claude/skills/codebase-index/SKILL.md b/.claude/skills/codebase-index/SKILL.md index 2105974..6b1e1b9 100644 --- a/.claude/skills/codebase-index/SKILL.md +++ b/.claude/skills/codebase-index/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/.codex/skills/codebase-index/SKILL.md b/.codex/skills/codebase-index/SKILL.md index 2105974..6b1e1b9 100644 --- a/.codex/skills/codebase-index/SKILL.md +++ b/.codex/skills/codebase-index/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/.opencode/skills/codebase-index/SKILL.md b/.opencode/skills/codebase-index/SKILL.md index 2105974..6b1e1b9 100644 --- a/.opencode/skills/codebase-index/SKILL.md +++ b/.opencode/skills/codebase-index/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/CHANGELOG.md b/CHANGELOG.md index e31b3b8..f040c57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,16 @@ All notable changes to this project are documented here. The format is based on ## [Unreleased] +### Added — graph visualization upgrade + interop exports +- **HTML graph is now legible at a glance**: nodes are coloured by module + (community), sized by connectivity (god nodes are biggest), and edges are styled + by confidence — solid `extracted`, dashed `inferred`, red-dotted `ambiguous` — + with a legend. Community/degree are computed on the displayed subgraph. +- **`codebase-index graph --format graphml|dot|neo4j`** exports the same enriched + graph for external tools: **GraphML** (Gephi / yEd / NetworkX), **DOT** + (Graphviz, edge style = confidence), and **Cypher** (Neo4j / FalkorDB). All + pure-stdlib, zero new dependencies. `--format html` (default) is unchanged. + ### Added — graph navigation: `path` and `describe` - **`codebase-index path `** — shortest undirected dependency/call path between two symbols or files ("how is X connected to Y"). Renders the node chain diff --git a/README.md b/README.md index 658a4c5..9018ea7 100644 --- a/README.md +++ b/README.md @@ -397,6 +397,11 @@ codebase-index path "renew" "refresh_access_token" # Node card: definition, callers, callees, centrality, module codebase-index describe "Database" +# Visualize the graph (modules coloured, size = connectivity, edge style = confidence) +codebase-index graph --open +# …or export for external tools: graphml (Gephi/yEd), dot (Graphviz), neo4j (Cypher) +codebase-index graph --format graphml -o graph.graphml + # View index statistics codebase-index stats diff --git a/skill/SKILL.md b/skill/SKILL.md index 2105974..6b1e1b9 100644 --- a/skill/SKILL.md +++ b/skill/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/skills/codebase-index/SKILL.md b/skills/codebase-index/SKILL.md index 2105974..6b1e1b9 100644 --- a/skills/codebase-index/SKILL.md +++ b/skills/codebase-index/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/src/codebase_index/cli.py b/src/codebase_index/cli.py index 7c79dfd..4906446 100644 --- a/src/codebase_index/cli.py +++ b/src/codebase_index/cli.py @@ -566,35 +566,47 @@ def graph_view( target: Optional[str] = typer.Argument(None, help="Optional file path or symbol to center."), depth: int = typer.Option(2, "--depth"), direction: str = typer.Option("both", "--direction", help="up|down|both"), - output: Optional[Path] = typer.Option(None, "--output", "-o", help="HTML file path."), + fmt: str = typer.Option("html", "--format", help="html|graphml|dot|neo4j"), + output: Optional[Path] = typer.Option(None, "--output", "-o", help="Output file path."), open_browser: bool = typer.Option(False, "--open", help="Open the HTML graph in a browser."), json_flag: bool = typer.Option(False, "--json", help="Emit machine-readable JSON."), ) -> None: - """Export an interactive HTML graph of indexed files, symbols, and edges.""" + """Export the graph of indexed files, symbols, and edges. + + Default is an interactive HTML view (modules coloured, size by connectivity, + edge style by confidence). --format also writes graphml (Gephi/yEd), dot + (Graphviz), or neo4j (Cypher) for external graph tools. + """ import json as _json - from .graph.export import export_graph_html + from .graph import export as gexport from .service import cache_dir_for from .storage.db import Database + exporters = { + "html": (gexport.export_graph_html, "graph.html"), + "graphml": (gexport.export_graph_graphml, "graph.graphml"), + "dot": (gexport.export_graph_dot, "graph.dot"), + "neo4j": (gexport.export_graph_neo4j, "graph.cypher"), + } + if fmt not in exporters: + typer.echo(f"[codebase-index] invalid --format '{fmt}'. Valid: {', '.join(exporters)}.") + raise typer.Exit(code=2) + is_json = json_flag or bool(ctx.obj and ctx.obj.get("json")) db_path, cfg = _ensure_index(ctx) - out = output or cache_dir_for(cfg) / "graph.html" + exporter, default_name = exporters[fmt] + out = output or cache_dir_for(cfg) / default_name with Database(db_path) as db: - stats = export_graph_html( - db.conn, - out, - target=target, - depth=depth, - direction=direction, - ) + stats = exporter(db.conn, out, target=target, depth=depth, direction=direction) - if open_browser: + if open_browser and fmt == "html": _open_in_browser(out) payload = { "path": str(out), + "format": fmt, "target": target, "depth": depth, "direction": direction, @@ -603,7 +615,7 @@ def graph_view( if is_json: typer.echo(_json.dumps(payload)) else: - typer.echo(f"Graph written to {out}") + typer.echo(f"Graph ({fmt}) written to {out}") typer.echo(f"nodes={stats['nodes']} edges={stats['edges']}") diff --git a/src/codebase_index/graph/export.py b/src/codebase_index/graph/export.py index 0942756..8e0c2fd 100644 --- a/src/codebase_index/graph/export.py +++ b/src/codebase_index/graph/export.py @@ -43,7 +43,7 @@ def _edge_rows( params.append(limit) return conn.execute( f""" - SELECT e.edge_type, e.resolved, e.line, e.dst_name, + SELECT e.edge_type, e.resolved, e.line, e.dst_name, e.confidence, e.src_kind, e.dst_kind, src_file.path AS src_file_path, src_sym_file.path AS src_symbol_file_path, @@ -73,8 +73,13 @@ def _node_key(kind: str, path: str, name: str | None = None) -> str: def _graph_data(rows: list[sqlite3.Row]) -> dict[str, Any]: + from collections import Counter, defaultdict + + from .analysis import detect_communities, weighted_degree + nodes: dict[str, dict[str, Any]] = {} edges: list[dict[str, Any]] = [] + adj: dict[str, Counter] = defaultdict(Counter) for row in rows: src_path = row["src_file_path"] or row["src_symbol_file_path"] or "" src_name = row["src_symbol_name"] @@ -101,8 +106,20 @@ def _graph_data(rows: list[sqlite3.Row]) -> dict[str, Any]: "target": dst_key, "type": row["edge_type"], "line": row["line"], + "confidence": row["confidence"] if "confidence" in row.keys() else "extracted", } ) + if src_key != dst_key: + adj[src_key][dst_key] += 1 + adj[dst_key][src_key] += 1 + + # Colour by module and size by centrality, computed on the displayed subgraph. + # The analysis functions are generic over the node key type, so string keys work. + communities = detect_communities(adj) + degree = weighted_degree(adj) + for key, node in nodes.items(): + node["community"] = communities.get(key, -1) + node["degree"] = degree.get(key, 0) return {"nodes": list(nodes.values()), "edges": edges} @@ -149,12 +166,17 @@ def export_graph_html( th,td {{ text-align:left; padding:8px; border-bottom:1px solid #e5e7eb; vertical-align:top; }} th {{ position:sticky; top:0; background:#f1f5f9; z-index:1; }} .edge {{ stroke:#94a3b8; stroke-width:1.3; }} +.edge.inferred {{ stroke-dasharray:5 3; }} /* heuristic-resolved */ +.edge.ambiguous {{ stroke:#ef4444; stroke-dasharray:2 3; }} /* unresolved target */ .node {{ cursor:pointer; }} -.node circle {{ fill:#fff; stroke:#2563eb; stroke-width:2; }} -.node.file circle {{ stroke:#059669; }} +.node circle {{ stroke:#1f2937; stroke-width:1.5; }} +.node.file circle {{ stroke-width:2.5; }} .node text {{ font-size:11px; fill:#111827; }} .dim {{ opacity:.12; }} -.selected circle {{ fill:#dbeafe; }} +.selected circle {{ stroke:#111827; stroke-width:3; }} +.legend {{ font-size:11px; color:#475569; display:flex; gap:14px; align-items:center; flex-wrap:wrap; }} +.legend b {{ color:#1f2937; }} +.legend svg {{ width:34px; height:8px; vertical-align:middle; }} @@ -162,6 +184,13 @@ def export_graph_html(

codebase-index graph

+ + colour = module + size = connectivity + extracted + inferred + ambiguous +
@@ -179,6 +208,15 @@ def export_graph_html( const rows = document.getElementById('edgeRows'); const counts = document.getElementById('counts'); const byId = new Map(data.nodes.map(n => [n.id, n])); +// Stable, readable categorical palette; community id indexes into it. +const PALETTE = ['#2563eb','#059669','#d97706','#7c3aed','#db2777','#0891b2', + '#65a30d','#dc2626','#4f46e5','#ca8a04','#0d9488','#9333ea']; +function colorFor(n) {{ + const c = n.community; + if (c === undefined || c < 0) return '#cbd5e1'; + return PALETTE[c % PALETTE.length]; +}} +function radiusFor(n) {{ return (n.name ? 8 : 11) + Math.min(14, Math.sqrt(n.degree || 0) * 2); }} function label(n) {{ return n.name ? `${{n.name}} (${{n.path}})` : n.path; }} function draw(filter = '') {{ svg.textContent = ''; @@ -194,7 +232,7 @@ def export_graph_html( const line = document.createElementNS('http://www.w3.org/2000/svg', 'line'); line.setAttribute('x1', s.x); line.setAttribute('y1', s.y); line.setAttribute('x2', t.x); line.setAttribute('y2', t.y); - line.setAttribute('class', 'edge'); + line.setAttribute('class', 'edge ' + (e.confidence || 'extracted')); svg.appendChild(line); const tr = document.createElement('tr'); for (const val of [e.type, label(s), label(t), e.line || '']) {{ @@ -207,7 +245,9 @@ def export_graph_html( g.setAttribute('class', `node ${{n.name ? 'symbol' : 'file'}}`); g.setAttribute('transform', `translate(${{n.x}},${{n.y}})`); const c = document.createElementNS('http://www.w3.org/2000/svg', 'circle'); - c.setAttribute('r', n.name ? 12 : 16); + c.setAttribute('r', radiusFor(n)); + c.setAttribute('fill', colorFor(n)); + c.setAttribute('fill-opacity', '0.85'); const txt = document.createElementNS('http://www.w3.org/2000/svg', 'text'); txt.setAttribute('x', 18); txt.setAttribute('y', 4); txt.textContent = n.name || n.path.split('/').pop(); @@ -226,3 +266,116 @@ def export_graph_html( output.parent.mkdir(parents=True, exist_ok=True) output.write_text(html, encoding="utf-8") return {"nodes": len(data["nodes"]), "edges": len(data["edges"])} + + +# --------------------------------------------------------------------------- +# Interop exports — GraphML (Gephi/yEd), DOT (Graphviz), Cypher (Neo4j). +# All reuse _edge_rows + _graph_data, so they carry the same community/degree/ +# confidence enrichment as the HTML view. Pure-stdlib, zero dependencies. +# --------------------------------------------------------------------------- + +def _collect(conn, *, target, depth, direction, limit) -> dict[str, Any]: + return _graph_data(_edge_rows(conn, target=target, depth=depth, direction=direction, limit=limit)) + + +def _write(output: Path, text: str) -> None: + output.parent.mkdir(parents=True, exist_ok=True) + output.write_text(text, encoding="utf-8") + + +def export_graph_graphml( + conn: sqlite3.Connection, output: Path, *, + target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500, +) -> dict[str, int]: + """GraphML for Gephi / yEd / NetworkX. Node ids are dense (n0, n1, …).""" + from xml.sax.saxutils import escape, quoteattr + + data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit) + ids = {n["id"]: f"n{i}" for i, n in enumerate(data["nodes"])} + lines = [ + '', + '', + ] + for k, ty in (("kind", "string"), ("name", "string"), ("path", "string"), + ("community", "long"), ("degree", "long")): + lines.append(f' ') + for k in ("edge_type", "confidence"): + lines.append(f' ') + lines.append(' ') + for n in data["nodes"]: + lines.append(f' ') + lines.append(f' {escape(n.get("kind") or "")}') + lines.append(f' {escape(n.get("name") or "")}') + lines.append(f' {escape(n.get("path") or "")}') + lines.append(f' {int(n.get("community", -1))}') + lines.append(f' {int(n.get("degree", 0))}') + lines.append(" ") + for i, e in enumerate(data["edges"]): + s = ids.get(e["source"]) + t = ids.get(e["target"]) + if s is None or t is None: + continue + lines.append(f' ') + lines.append(f' {escape(e["type"])}') + lines.append(f' {escape(e.get("confidence") or "")}') + lines.append(" ") + lines += [" ", "", ""] + _write(output, "\n".join(lines)) + return {"nodes": len(data["nodes"]), "edges": len(data["edges"])} + + +def export_graph_dot( + conn: sqlite3.Connection, output: Path, *, + target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500, +) -> dict[str, int]: + """Graphviz DOT. Edge style encodes confidence (solid/dashed/dotted).""" + data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit) + ids = {n["id"]: f"n{i}" for i, n in enumerate(data["nodes"])} + style = {"extracted": "solid", "inferred": "dashed", "ambiguous": "dotted"} + + def esc(s: str) -> str: + return s.replace("\\", "\\\\").replace('"', '\\"') + + lines = ["digraph codebase_index {", " rankdir=LR;", ' node [shape=box, fontsize=10];'] + for n in data["nodes"]: + lbl = esc(f'{n["name"]}\n{n["path"]}' if n.get("name") else (n.get("path") or "")) + lines.append(f' {ids[n["id"]]} [label="{lbl}"];') + for e in data["edges"]: + s = ids.get(e["source"]) + t = ids.get(e["target"]) + if s is None or t is None: + continue + st = style.get(e.get("confidence") or "extracted", "solid") + lines.append(f' {s} -> {t} [label="{esc(e["type"])}", style={st}];') + lines += ["}", ""] + _write(output, "\n".join(lines)) + return {"nodes": len(data["nodes"]), "edges": len(data["edges"])} + + +def export_graph_neo4j( + conn: sqlite3.Connection, output: Path, *, + target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500, +) -> dict[str, int]: + """Cypher script (MERGE statements) to load the graph into Neo4j / FalkorDB.""" + data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit) + + def lit(s: str) -> str: + return "'" + (s or "").replace("\\", "\\\\").replace("'", "\\'") + "'" + + lines = ["// codebase-index graph export for Neo4j / FalkorDB"] + for n in data["nodes"]: + node_label = "Symbol" if n.get("name") else "File" + lines.append( + f"MERGE (:{node_label} {{key:{lit(n['id'])}, name:{lit(n.get('name') or '')}, " + f"path:{lit(n.get('path') or '')}, community:{int(n.get('community', -1))}, " + f"degree:{int(n.get('degree', 0))}}});" + ) + for e in data["edges"]: + rel = (e["type"] or "edge").upper() + lines.append( + f"MATCH (a {{key:{lit(e['source'])}}}), (b {{key:{lit(e['target'])}}}) " + f"MERGE (a)-[:{rel} {{confidence:{lit(e.get('confidence') or 'extracted')}}}]->(b);" + ) + lines.append("") + _write(output, "\n".join(lines)) + return {"nodes": len(data["nodes"]), "edges": len(data["edges"])} diff --git a/src/codebase_index/skill_template/SKILL.md b/src/codebase_index/skill_template/SKILL.md index 2105974..6b1e1b9 100644 --- a/src/codebase_index/skill_template/SKILL.md +++ b/src/codebase_index/skill_template/SKILL.md @@ -38,18 +38,32 @@ Pick the subcommand by intent: | User intent | Command | |---|---| | "how does X work" / "explain X" / "walk me through" | `codebase-index explain "$QUERY" --json` | -| overview / architecture | `codebase-index explain "architecture overview" --token-budget 3000 --json` | +| overview / architecture / "map the codebase" | `codebase-index architecture --json` | | general / unsure | `codebase-index search "$QUERY" --json` | | keyword / "where is" | `codebase-index search "$QUERY" --json` | | a specific symbol name | `codebase-index symbol "" --json` | | "who calls / references" | `codebase-index refs "" --json` | | "what breaks if I change" | `codebase-index impact "" --json` | +| "how is X connected to Y" / dependency path | `codebase-index path "" "" --json` | +| "what is X" / describe a symbol's role | `codebase-index describe "" --json` | | visual graph / "open graph" (for the human, not for you to read) | `codebase-index graph "" --open` | +`architecture` returns the codebase map computed at index time — detected modules +(communities), god nodes (most-connected symbols), surprising cross-module links, +and suggested questions. Reach for it on "give me an overview" / "where do I +start" questions instead of a broad `explain`. + +`path "A" "B"` returns the shortest dependency/call chain between two symbols or +files; `describe "X"` returns a node card (definition, callers, callees, +in/out degree, module, god-node rank). Both annotate edges with a `confidence` +(`extracted` exact, `inferred` heuristic, `ambiguous` unresolved) — treat a path +or callee list that leans on `inferred`/`ambiguous` edges as less certain. + The `graph` command renders an HTML dependency graph for a person to look at — it is not a retrieval packet. Use it only when the user explicitly wants a visual graph; for "what depends on X" answer from `impact`/`refs` instead. In a headless -session prefer `--out ` over `--open`. +session prefer `--output ` over `--open`. `--format graphml|dot|neo4j` +exports the graph for external tools (Gephi/yEd, Graphviz, Neo4j) instead of HTML. `explain` has a higher default token budget (2200) and HOW_IT_WORKS intent weights — use it whenever the question is about understanding behavior or flow. @@ -150,8 +164,8 @@ Never start with a full-repo scan when the index exists and is fresh. # "how does the auth flow work?" codebase-index explain "auth flow" --json -# "explain the overall architecture" -codebase-index explain "architecture overview" --token-budget 3000 --json +# "explain the overall architecture" / "where do I start" — modules, god nodes +codebase-index architecture --json # "where is auth token refresh implemented?" codebase-index search "auth token refresh" --json @@ -168,6 +182,12 @@ codebase-index symbol "AuthService" --json # precise symbol search (faster, no FTS noise) codebase-index search "AuthService" --mode symbol --json +# "how is the API layer connected to the database?" +codebase-index path "ApiController" "Database" --json + +# "what is the Database class and how is it used?" +codebase-index describe "Database" --json + # generate and open an HTML graph around a file or symbol codebase-index graph "User" --direction both --depth 2 --open ``` diff --git a/tests/test_graph_export_cli.py b/tests/test_graph_export_cli.py index 862c8db..6c15d1d 100644 --- a/tests/test_graph_export_cli.py +++ b/tests/test_graph_export_cli.py @@ -31,8 +31,50 @@ def test_graph_command_writes_html(sample_repo, tmp_path): assert res.exit_code == 0, res.output data = json.loads(res.output) assert data["path"] == str(out) + assert data["format"] == "html" assert data["nodes"] >= 1 assert out.exists() text = out.read_text(encoding="utf-8") assert "codebase-index graph" in text assert "graph-data" in text + # Phase-4 enrichment: module colours + confidence legend are present. + assert "= module" in text + assert "inferred" in text and "ambiguous" in text + + +def test_graph_command_writes_graphml(sample_repo, tmp_path): + assert runner.invoke(app, ["--root", str(sample_repo), "index"]).exit_code == 0 + out = tmp_path / "g.graphml" + res = runner.invoke( + app, ["--root", str(sample_repo), "--json", "graph", "--format", "graphml", + "--output", str(out)], + ) + assert res.exit_code == 0, res.output + assert json.loads(res.output)["format"] == "graphml" + text = out.read_text(encoding="utf-8") + assert "