diff --git a/packages/cli/src/commands/analyze.test.ts b/packages/cli/src/commands/analyze.test.ts index a4a9f330..2ede5161 100644 --- a/packages/cli/src/commands/analyze.test.ts +++ b/packages/cli/src/commands/analyze.test.ts @@ -14,15 +14,19 @@ import assert from "node:assert/strict"; import { spawn } from "node:child_process"; -import { mkdtemp, writeFile } from "node:fs/promises"; +import { mkdir, mkdtemp, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { test } from "node:test"; import { upsertRegistry } from "../registry.js"; import { checkFastPath, + detectCoverageReport, isWorkingTreeDirty, + resolveCoverageEnabled, resolveMaxSummariesCap, + resolveSbomEnabled, + resolveScanEnabled, resolveSummariesEnabled, } from "./analyze.js"; @@ -148,37 +152,66 @@ test("resolveMaxSummariesCap: seed of 5 yields a cap of 0 under the 10% rule", a }); // --------------------------------------------------------------------------- -// resolveSummariesEnabled — env kill-switch + P04 default-on contract. +// resolveSummariesEnabled — fast-default contract: LLM summaries are opt-in. +// `codehub analyze` runs tree-sitter + SCIP + cochange phases only by default, +// so a fresh invocation never spends on Bedrock or blocks on a network hop. 
// --------------------------------------------------------------------------- -test("resolveSummariesEnabled: default-on when both env and flag are absent (P04)", () => { - assert.equal(resolveSummariesEnabled(undefined, {}), true); +test("resolveSummariesEnabled: default-off when both env and flag are absent", () => { + assert.equal(resolveSummariesEnabled(undefined, {}), false); }); -test("resolveSummariesEnabled: explicit --summaries keeps it on", () => { +test("resolveSummariesEnabled: explicit --summaries opts in", () => { assert.equal(resolveSummariesEnabled(true, {}), true); }); -test("resolveSummariesEnabled: explicit --no-summaries turns it off", () => { +test("resolveSummariesEnabled: explicit --no-summaries stays off", () => { assert.equal(resolveSummariesEnabled(false, {}), false); }); +test("resolveSummariesEnabled: CODEHUB_BEDROCK_SUMMARIES=1 opts in (env-only)", () => { + // Operators can enable summaries for a whole CI job without editing every + // invocation. Only the literal "1" triggers — anything else is treated as + // absent, mirroring the kill-switch semantics below. + assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_SUMMARIES: "1" }), true); + assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_SUMMARIES: "0" }), false); + assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_SUMMARIES: "" }), false); +}); + test("resolveSummariesEnabled: CODEHUB_BEDROCK_DISABLED=1 kills the phase", () => { assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_DISABLED: "1" }), false); }); -test("resolveSummariesEnabled: env kill-switch wins over --summaries=true", () => { +test("resolveSummariesEnabled: kill-switch wins over --summaries=true", () => { // Operator passed --summaries explicitly but the env var forces off. // Required so CI / restricted environments can lock out Bedrock without // auditing every invocation site. 
assert.equal(resolveSummariesEnabled(true, { CODEHUB_BEDROCK_DISABLED: "1" }), false); }); -test("resolveSummariesEnabled: CODEHUB_BEDROCK_DISABLED=0 does not kill the phase", () => { - // Only the literal "1" triggers the kill-switch — anything else is a - // no-op. This keeps operator intent unambiguous. - assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_DISABLED: "0" }), true); - assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_DISABLED: "" }), true); +test("resolveSummariesEnabled: kill-switch wins over CODEHUB_BEDROCK_SUMMARIES=1", () => { + // Both env vars set → disable wins. This lets a CI environment pin the + // opt-in globally while still allowing per-job kill-switch overrides. + assert.equal( + resolveSummariesEnabled(undefined, { + CODEHUB_BEDROCK_SUMMARIES: "1", + CODEHUB_BEDROCK_DISABLED: "1", + }), + false, + ); +}); + +test("resolveSummariesEnabled: --no-summaries wins over CODEHUB_BEDROCK_SUMMARIES=1", () => { + // Explicit CLI false beats env opt-in. Matches how --no-flag usually + // wins against ambient config everywhere else in the CLI. + assert.equal(resolveSummariesEnabled(false, { CODEHUB_BEDROCK_SUMMARIES: "1" }), false); +}); + +test("resolveSummariesEnabled: CODEHUB_BEDROCK_DISABLED=0 does not enable the phase", () => { + // Only the literal "1" on the opt-in var flips this; anything else leaves + // summaries in their (fast, off) default. + assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_DISABLED: "0" }), false); + assert.equal(resolveSummariesEnabled(undefined, { CODEHUB_BEDROCK_DISABLED: "" }), false); }); // --------------------------------------------------------------------------- @@ -245,3 +278,118 @@ test("isWorkingTreeDirty: returns false when the git binary is unavailable", asy else process.env["PATH"] = originalPath; } }); + +// --------------------------------------------------------------------------- +// resolveSbomEnabled — default on, --no-sbom opts out. 
+// --------------------------------------------------------------------------- + +test("resolveSbomEnabled: default-on when flag is absent", () => { + assert.equal(resolveSbomEnabled(undefined), true); +}); + +test("resolveSbomEnabled: explicit true keeps it on", () => { + assert.equal(resolveSbomEnabled(true), true); +}); + +test("resolveSbomEnabled: explicit false turns it off (--no-sbom)", () => { + assert.equal(resolveSbomEnabled(false), false); +}); + +// --------------------------------------------------------------------------- +// resolveScanEnabled — default on, --no-scan opts out. +// --------------------------------------------------------------------------- + +test("resolveScanEnabled: default-on when flag is absent", () => { + assert.equal(resolveScanEnabled(undefined), true); +}); + +test("resolveScanEnabled: explicit true keeps it on", () => { + assert.equal(resolveScanEnabled(true), true); +}); + +test("resolveScanEnabled: explicit false turns it off (--no-scan)", () => { + assert.equal(resolveScanEnabled(false), false); +}); + +// --------------------------------------------------------------------------- +// detectCoverageReport + resolveCoverageEnabled — auto-detect semantics. 
+// --------------------------------------------------------------------------- + +test("detectCoverageReport: returns undefined when no report exists", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-none-")); + assert.equal(await detectCoverageReport(dir), undefined); +}); + +test("detectCoverageReport: finds coverage/lcov.info", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-lcov-")); + await mkdir(join(dir, "coverage"), { recursive: true }); + await writeFile(join(dir, "coverage", "lcov.info"), "TN:\n"); + assert.equal(await detectCoverageReport(dir), "coverage/lcov.info"); +}); + +test("detectCoverageReport: finds top-level lcov.info", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-lcov2-")); + await writeFile(join(dir, "lcov.info"), "TN:\n"); + assert.equal(await detectCoverageReport(dir), "lcov.info"); +}); + +test("detectCoverageReport: finds coverage.xml (cobertura)", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-xml-")); + await writeFile(join(dir, "coverage.xml"), "\n"); + assert.equal(await detectCoverageReport(dir), "coverage.xml"); +}); + +test("detectCoverageReport: finds jacoco xml at the Gradle path", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-jacoco-")); + await mkdir(join(dir, "build", "reports", "jacoco", "test"), { recursive: true }); + await writeFile(join(dir, "build", "reports", "jacoco", "test", "jacocoTestReport.xml"), ""); + assert.equal(await detectCoverageReport(dir), "build/reports/jacoco/test/jacocoTestReport.xml"); +}); + +test("detectCoverageReport: finds coverage.json (coverage.py)", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-json-")); + await writeFile(join(dir, "coverage.json"), "{}\n"); + assert.equal(await detectCoverageReport(dir), "coverage.json"); +}); + +test("detectCoverageReport: prefers coverage/lcov.info over top-level lcov.info", async () => { + // 
Probe order matches the phase's `CANDIDATES` array so the analyze + // wrapper and the phase agree on which report is the "one" when a + // repo has both. + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-both-")); + await mkdir(join(dir, "coverage"), { recursive: true }); + await writeFile(join(dir, "coverage", "lcov.info"), "TN:\n"); + await writeFile(join(dir, "lcov.info"), "TN:\n"); + assert.equal(await detectCoverageReport(dir), "coverage/lcov.info"); +}); + +test("resolveCoverageEnabled: explicit true short-circuits detection", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-force-on-")); + // No report on disk; explicit true still returns true so the phase + // runs and the operator sees the "no report found" warning. + assert.equal(await resolveCoverageEnabled(true, dir), true); +}); + +test("resolveCoverageEnabled: explicit false short-circuits detection", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-force-off-")); + await writeFile(join(dir, "lcov.info"), "TN:\n"); + // Report IS on disk; explicit false still returns false so the phase + // is a silent no-op. + assert.equal(await resolveCoverageEnabled(false, dir), false); +}); + +test("resolveCoverageEnabled: undefined + no report → undefined (silent)", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-auto-none-")); + // No flag, no report → plumb `undefined` through so the phase is a + // silent no-op. Critically, this does NOT return `false` — that would + // still be equivalent behavior from the phase's perspective, but + // `undefined` is the documented "auto" sentinel and round-trips + // through `pipelineOptions` as omitted-key. 
+ assert.equal(await resolveCoverageEnabled(undefined, dir), undefined); +}); + +test("resolveCoverageEnabled: undefined + report found → true (auto-on)", async () => { + const dir = await mkdtemp(join(tmpdir(), "och-analyze-cov-auto-on-")); + await writeFile(join(dir, "lcov.info"), "TN:\n"); + assert.equal(await resolveCoverageEnabled(undefined, dir), true); +}); diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index 4877ed07..1a7976b4 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -79,23 +79,40 @@ export interface AnalyzeOptions { readonly verbose?: boolean; readonly skipAgentsMd?: boolean; /** - * When true, emit `.codehub/sbom.cyclonedx.json` and - * `.codehub/sbom.spdx.json` from Dependency nodes. Off by default so - * `codehub analyze` stays quiet for repos where supply-chain docs are - * out of scope. + * Emit `.codehub/sbom.cyclonedx.json` and `.codehub/sbom.spdx.json` + * from Dependency nodes. **Default: on.** Serialization is cheap, purely + * local, and every CI pipeline that scans artifacts wants one. Pass + * `false` (CLI: `--no-sbom`) to suppress. */ readonly sbom?: boolean; /** - * When true, run the coverage overlay phase which detects lcov / - * cobertura / jacoco / coverage.py reports and populates - * `coveragePercent` + `coveredLines` on File nodes. Off by default. + * Run the coverage overlay phase — detects lcov / cobertura / jacoco / + * coverage.py reports and populates `coveragePercent` + `coveredLines` + * on File nodes. **Default: auto.** When `undefined`, `runAnalyze` + * probes the repo for a report at the well-known paths and enables the + * phase only when one is found (silent no-op otherwise). Pass `true` to + * force-enable and surface the "no report found" warning, or `false` + * (CLI: `--no-coverage`) to suppress entirely. 
*/ readonly coverage?: boolean; /** - * When true (the post-P04 default), the `summarize` phase walks LSP- - * confirmed callable symbols and invokes Bedrock to generate structured - * summaries within the resolved cost cap. Pass `false` (or - * `CODEHUB_BEDROCK_DISABLED=1`) to force the phase off. + * Run Priority-1 security scanners at the end of `analyze` and write + * `.codehub/scan.sarif` + ingest findings into the graph. **Default: + * on.** Most scanners are local binaries (semgrep, bandit, ruff, + * vulture, radon, detect-secrets, betterleaks, ty); the network-backed + * ones (osv-scanner, grype, npm/pip audit) are silently skipped when + * `--offline` is set. Pass `false` (CLI: `--no-scan`) to suppress — the + * graph pipeline runs unchanged. + */ + readonly scan?: boolean; + /** + * Opt into the `summarize` phase — walks LSP-confirmed callable symbols + * and invokes Bedrock to generate structured summaries within the + * resolved cost cap. **Off by default**: a bare `codehub analyze` is + * fast, local, deterministic, and never spends on LLM calls. Enable + * per-invocation with `true` (CLI: `--summaries`) or environment-wide + * with `CODEHUB_BEDROCK_SUMMARIES=1`. `CODEHUB_BEDROCK_DISABLED=1` + * force-disables regardless of flag state. */ readonly summaries?: boolean; /** @@ -196,12 +213,23 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi // reports mode="full" with reason="no-prior-graph". const incrementalFrom = opts.force === true ? undefined : await loadPreviousGraph(repoPath); - // Resolve the effective `summaries` flag. P04 flipped the default ON, so - // `undefined` now means "on". The `CODEHUB_BEDROCK_DISABLED=1` env kill- - // switch forces off regardless of the flag; `offline` is enforced later - // inside the phase itself (the phase's own invariant). + // Resolve the effective `summaries` flag. 
Summaries are opt-in: a bare + // `codehub analyze` runs the fast, local, deterministic pipeline + // (tree-sitter + SCIP + cochanges) and skips the Bedrock summarize phase + // entirely. Opt in via `--summaries` or `CODEHUB_BEDROCK_SUMMARIES=1`. + // The `CODEHUB_BEDROCK_DISABLED=1` env kill-switch forces off regardless + // of the flag; `offline` is enforced later inside the phase itself. const summariesEnabled = resolveSummariesEnabled(opts.summaries, process.env); + // Resolve sbom/coverage/scan defaults. SBOM and scan default ON (cheap, + // local, and they feed the MCP surface agents actually use). Coverage + // auto-detects: probe the known report paths and only enable the phase + // when one exists — so bare `codehub analyze` on a repo with no coverage + // data stays silent instead of warning about a missing report. + const sbomEnabled = resolveSbomEnabled(opts.sbom); + const scanEnabled = resolveScanEnabled(opts.scan); + const coverageResolved = await resolveCoverageEnabled(opts.coverage, repoPath); + // Open a read-only store upfront so the `summarize` phase can probe the // prior summary rows before work is queued AND so we can inspect the // prior run's `storeMeta.stats` to resolve `--max-summaries auto`. We @@ -250,8 +278,8 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi ...(opts.embeddingsBatchSize !== undefined ? { embeddingsBatchSize: opts.embeddingsBatchSize } : {}), - ...(opts.sbom !== undefined ? { sbom: opts.sbom } : {}), - ...(opts.coverage !== undefined ? { coverage: opts.coverage } : {}), + sbom: sbomEnabled, + ...(coverageResolved !== undefined ? { coverage: coverageResolved } : {}), summaries: summariesEnabled, maxSummariesPerRun: resolvedMaxSummaries, ...(opts.summaryModel !== undefined ? 
{ summaryModel: opts.summaryModel } : {}), @@ -430,6 +458,30 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi `graph ${result.graphHash.slice(0, 8)}, ${durationMs} ms${incrementalLine}${cacheLine}`, ); + // Scan phase — run Priority-1 scanners and write .codehub/scan.sarif so + // `verdict`, `list_findings`, and `list_findings_delta` work on day one. + // Run AFTER the graph + registry write so a scanner failure cannot + // regress the index. Network-backed scanners (osv-scanner, grype, npm/ + // pip audit) self-skip under --offline. We do NOT propagate the scan's + // severity-gated exit code — analyze remains the "build the graph" + // command; operators who want the gate invoke `codehub verdict` or + // `codehub scan` directly. + if (scanEnabled) { + try { + const scanMod = await import("./scan.js"); + const scanSummary = await scanMod.runScan(repoPath, { + repo: repoName, + ...(opts.home !== undefined ? { home: opts.home } : {}), + }); + log( + `codehub analyze: scan — ${scanSummary.runs.length} scanner(s), ` + + `${scanSummary.totalFindings} finding(s), sarif=${scanSummary.outputPath}`, + ); + } catch (err) { + log(`codehub analyze: scan skipped: ${(err as Error).message}`); + } + } + return { repoPath, repoName, @@ -518,16 +570,20 @@ export async function loadPreviousGraph( /** * Resolve the effective `summaries` flag, honoring the - * `CODEHUB_BEDROCK_DISABLED=1` env kill-switch and the P04 default-on - * contract (absent flag → enabled). + * `CODEHUB_BEDROCK_DISABLED=1` env kill-switch. + * + * `codehub analyze` is a fast, local, deterministic index by default — + * tree-sitter + SCIP + cochanges + graph phases only. The Bedrock-backed + * summarize phase is opt-in via `--summaries` (or `CODEHUB_BEDROCK_SUMMARIES=1`) + * so a fresh `codehub analyze` never spends on LLM calls, blocks on a + * network hop, or needs AWS creds. 
* - * Truth table (post-P04): - * - env var set + flag undefined → false (kill-switch wins) - * - env var set + flag true → false (kill-switch wins) - * - env var set + flag false → false - * - env var unset + flag undefined → true (default on) - * - env var unset + flag true → true - * - env var unset + flag false → false (explicit --no-summaries) + * Truth table: + * - env kill-switch set (any flag state) → false (kill-switch wins) + * - env opt-in set + flag undefined → true (env opts in) + * - flag true → true (explicit --summaries) + * - flag false → false (explicit --no-summaries) + * - flag undefined + no env → false (default off — fast path) * * Exported for unit tests; the production call site reads `process.env`. */ @@ -536,9 +592,98 @@ export function resolveSummariesEnabled( env: NodeJS.ProcessEnv | Record, ): boolean { if (env["CODEHUB_BEDROCK_DISABLED"] === "1") return false; + if (flag === true) return true; + if (flag === false) return false; + return env["CODEHUB_BEDROCK_SUMMARIES"] === "1"; +} + +/** + * Resolve the effective `sbom` flag. Default ON — serializing Dependency + * nodes to CycloneDX + SPDX is cheap, local, and every supply-chain audit + * wants it. Pass `false` to suppress. + * + * Exported for unit tests. + */ +export function resolveSbomEnabled(flag: boolean | undefined): boolean { + return flag !== false; +} + +/** + * Resolve the effective `scan` flag. Default ON — Priority-1 scanners are + * mostly local binaries that produce the SARIF `verdict`, `list_findings`, + * and `list_findings_delta` all read. Pass `false` (CLI: `--no-scan`) to + * suppress — the scanners that need network (osv-scanner, grype, npm/pip + * audit) are silently skipped anyway when `--offline` is set, so the + * on-default stays honest under offline operation. + * + * Exported for unit tests. 
+ */ +export function resolveScanEnabled(flag: boolean | undefined): boolean { return flag !== false; } +/** + * Coverage-report candidate paths, mirrored from + * `packages/ingestion/src/pipeline/phases/coverage.ts:58-64`. Kept in sync + * by hand: the analyze wrapper needs to know whether a report exists + * *before* it sets `options.coverage=true`, because the phase warns when + * coverage is explicitly enabled but no report is found. When `undefined` + * is plumbed through instead, the phase is a silent no-op. + */ +const COVERAGE_CANDIDATE_PATHS = [ + "coverage/lcov.info", + "lcov.info", + "coverage.xml", + "build/reports/jacoco/test/jacocoTestReport.xml", + "coverage.json", +] as const; + +/** + * Probe the repo for a coverage report at one of the known paths. Returns + * the first match (relative to `repoPath`) or `undefined`. Used by the + * analyze wrapper to decide whether to enable the coverage phase when no + * explicit flag is passed. + * + * Exported so tests can assert which paths are probed without actually + * running `runAnalyze`. + */ +export async function detectCoverageReport(repoPath: string): Promise { + const { access } = await import("node:fs/promises"); + for (const rel of COVERAGE_CANDIDATE_PATHS) { + try { + await access(resolve(repoPath, rel)); + return rel; + } catch { + // Intentional: we're probing; missing-file is the whole point. + } + } + return undefined; +} + +/** + * Resolve the effective `coverage` flag, honoring explicit true/false and + * silently auto-detecting when the flag is `undefined`. This lets a bare + * `codehub analyze` overlay coverage on File nodes when a report is + * present and stay silent otherwise (no spurious "no report found" + * warning on repos that don't have tests). + * + * - `flag === true` → pipeline sees `true` (phase runs, warns if absent). + * - `flag === false` → pipeline sees `false` (phase no-op). + * - `flag === undefined` + report found → pipeline sees `true`. 
+ * - `flag === undefined` + no report → pipeline sees `undefined` (no-op). + * + * Exported for unit tests. + */ +export async function resolveCoverageEnabled( + flag: boolean | undefined, + repoPath: string, +): Promise { + if (flag === true) return true; + if (flag === false) return false; + const detected = await detectCoverageReport(repoPath); + return detected !== undefined ? true : undefined; +} + /** * Resolve `--max-summaries auto` / explicit numeric caps into a concrete * numeric budget the pipeline can consume. diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 55b24b82..253cf73b 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -48,16 +48,29 @@ program .option("--skip-agents-md", "Do not write the AGENTS.md / CLAUDE.md stanza") .option( "--sbom", - "Emit .codehub/sbom.cyclonedx.json + .codehub/sbom.spdx.json from Dependency nodes", + "Emit .codehub/sbom.cyclonedx.json + .codehub/sbom.spdx.json from Dependency nodes. Default ON — use --no-sbom to suppress.", + ) + .option("--no-sbom", "Suppress SBOM emission. Equivalent to passing `sbom: false`.") + .option( + "--coverage", + "Force the coverage overlay phase on and warn when no report is found. Default AUTO — `codehub analyze` auto-detects lcov/cobertura/jacoco/coverage.py reports and silently skips when none exist.", + ) + .option("--no-coverage", "Force the coverage overlay phase off even when a report is present.") + .option( + "--scan", + "Run Priority-1 scanners after analyze, write .codehub/scan.sarif, and ingest findings into the graph. Default ON — use --no-scan to suppress.", + ) + .option( + "--no-scan", + "Skip the post-analyze scan step. The graph pipeline runs unchanged; `codehub verdict` / `list_findings` work against the last SARIF on disk.", + ) - .option("--coverage", "Overlay lcov/cobertura/jacoco/coverage.py report onto File nodes") .option( "--summaries", - "Enable the summarize phase (default ON: structured Bedrock summaries per callable). 
Use --no-summaries to disable.", + "Opt into the summarize phase (structured Bedrock summaries per callable). Default OFF — `codehub analyze` is fast, local, deterministic by default. Also enabled by CODEHUB_BEDROCK_SUMMARIES=1.", ) .option( "--no-summaries", - "Disable the summarize phase entirely (equivalent to CODEHUB_BEDROCK_DISABLED=1).", + "Explicitly disable the summarize phase (equivalent to CODEHUB_BEDROCK_DISABLED=1). Only meaningful when combined with CODEHUB_BEDROCK_SUMMARIES=1.", ) .option( "--max-summaries ", @@ -93,10 +106,16 @@ program process.env["OCH_NATIVE_PARSER"] = "1"; } // Pass the raw flag straight through to `runAnalyze`. The env - // kill-switch (`CODEHUB_BEDROCK_DISABLED=1`) is re-checked inside - // `runAnalyze` via `resolveSummariesEnabled` so tests that call - // `runAnalyze` directly honor the same truth table. - const summaries = opts["summaries"] === false ? false : undefined; + // kill-switch (`CODEHUB_BEDROCK_DISABLED=1`) and the env opt-in + // (`CODEHUB_BEDROCK_SUMMARIES=1`) are re-checked inside `runAnalyze` + // via `resolveSummariesEnabled` so tests that call `runAnalyze` + // directly honor the same truth table. Summaries are OFF by default + // — the fast, local, deterministic analyze path. Pass `--summaries` + // or set `CODEHUB_BEDROCK_SUMMARIES=1` to opt in. + let summaries: boolean | undefined; + if (opts["summaries"] === true) summaries = true; + else if (opts["summaries"] === false) summaries = false; + else summaries = undefined; // --max-summaries accepts either a positive integer or the literal // string "auto". Unknown strings fall back to "auto" so the CLI never @@ -137,9 +156,18 @@ program offline: opts["offline"] === true, verbose: opts["verbose"] === true, skipAgentsMd: opts["skipAgentsMd"] === true, - sbom: opts["sbom"] === true, - coverage: opts["coverage"] === true, - ...(summaries === false ? { summaries } : {}), + // `sbom`, `coverage`, `scan` are three-state (true / false / auto). 
+ // commander encodes `--no-sbom` as `opts.sbom === false`, `--sbom` as + // `true`, and omitted as `undefined`. Forward all three verbatim — + // `runAnalyze` applies the resolvers (resolveSbomEnabled, resolveScanEnabled, + // resolveCoverageEnabled) to pick the effective value. + ...(opts["sbom"] === false ? { sbom: false as const } : {}), + ...(opts["sbom"] === true ? { sbom: true as const } : {}), + ...(opts["coverage"] === false ? { coverage: false as const } : {}), + ...(opts["coverage"] === true ? { coverage: true as const } : {}), + ...(opts["scan"] === false ? { scan: false as const } : {}), + ...(opts["scan"] === true ? { scan: true as const } : {}), + ...(summaries !== undefined ? { summaries } : {}), maxSummariesPerRun, ...(typeof opts["summaryModel"] === "string" ? { summaryModel: opts["summaryModel"] } : {}), skills: opts["skills"] === true, diff --git a/packages/docs/src/content/docs/guides/indexing-a-repo.md b/packages/docs/src/content/docs/guides/indexing-a-repo.md index 25582df0..46b9344e 100644 --- a/packages/docs/src/content/docs/guides/indexing-a-repo.md +++ b/packages/docs/src/content/docs/guides/indexing-a-repo.md @@ -96,13 +96,36 @@ On the default LadybugDB layout: On the single-file DuckDB fallback, `graph.duckdb` replaces both `graph.lbug` and `temporal.duckdb`. -## Other useful flags - -- `--sbom` — emit a CycloneDX SBOM alongside the index. -- `--coverage` — bridge coverage data into the graph. +## What runs by default + +A bare `codehub analyze` produces a production-grade `.codehub/` folder +in one command: + +- Graph pipeline (tree-sitter parse + SCIP resolution + communities + + processes + cochanges + ownership + dependencies + detectors). +- SBOM emission (CycloneDX + SPDX) — **default on**; suppress with + `--no-sbom`. +- Priority-1 scanners → `.codehub/scan.sarif` + findings ingested into + the graph — **default on**; suppress with `--no-scan`. 
+ Network-backed scanners (osv-scanner, grype, npm/pip audit) self-skip + under `--offline`, so the default-on scan remains compatible with offline runs. +- Coverage overlay — **default auto**: runs only when a report is + present at `coverage/lcov.info`, `lcov.info`, `coverage.xml`, + `build/reports/jacoco/test/jacocoTestReport.xml`, or `coverage.json`. + Silent no-op otherwise. Force with `--coverage`; force off with + `--no-coverage`. + +Everything else — embeddings, summaries, skills — is opt-in. + +## Opt-in flags + +- `--embeddings` — compute embedding vectors so queries can match by meaning. + Requires `codehub setup --embeddings` first. - `--summaries` / `--no-summaries` — LLM-generated symbol summaries - (default on; capped by `--max-summaries`, default auto = 10% of - callables, hard cap 500). + (default off — `codehub analyze` is fast, local, deterministic by + default; opt in with `--summaries` or `CODEHUB_BEDROCK_SUMMARIES=1`). + When enabled, the budget is capped by `--max-summaries`, default + `auto` = 10% of callables, hard cap 500. - `--skills` — generate Claude Code skills from the graph. - `--native-parser` — opt into the native tree-sitter N-API addon on Node 22 (the default runtime is `web-tree-sitter` / WASM). diff --git a/packages/docs/src/content/docs/reference/cli.md b/packages/docs/src/content/docs/reference/cli.md index 39d9866f..8d5e8e73 100644 --- a/packages/docs/src/content/docs/reference/cli.md +++ b/packages/docs/src/content/docs/reference/cli.md @@ -30,9 +30,10 @@ codehub analyze [path] | `--offline` | off | Zero sockets. | | `--verbose` | off | Per-phase pipeline progress. | | `--skip-agents-md` | off | Skip the AGENTS.md / CLAUDE.md stanza. | -| `--sbom` | off | Emit `sbom.cyclonedx.json` + `sbom.spdx.json` from `Dependency` nodes. | -| `--coverage` | off | Overlay lcov / cobertura / jacoco / coverage.py reports onto `File` nodes. | -| `--summaries` / `--no-summaries` | on | LLM symbol summaries (Bedrock). 
| +| `--sbom` / `--no-sbom` | **on** | Emit `sbom.cyclonedx.json` + `sbom.spdx.json` from `Dependency` nodes. Use `--no-sbom` to suppress. | +| `--scan` / `--no-scan` | **on** | Run Priority-1 scanners, write `.codehub/scan.sarif`, and ingest findings into the graph. Network-backed scanners (osv-scanner, grype, npm/pip audit) self-skip under `--offline`. Use `--no-scan` to suppress. | +| `--coverage` / `--no-coverage` | **auto** | Overlay lcov / cobertura / jacoco / coverage.py reports onto `File` nodes. `auto` probes `coverage/lcov.info`, `lcov.info`, `coverage.xml`, `build/reports/jacoco/test/jacocoTestReport.xml`, `coverage.json` in that order and enables the phase when one exists (silent no-op otherwise). `--coverage` forces on and warns if nothing is found; `--no-coverage` forces off. | +| `--summaries` / `--no-summaries` | off | LLM symbol summaries (Bedrock). Opt in with `--summaries` or `CODEHUB_BEDROCK_SUMMARIES=1`; kill with `--no-summaries` or `CODEHUB_BEDROCK_DISABLED=1`. | | `--max-summaries ` | `auto` (10% of SCIP-confirmed callables, cap 500) | Summary budget. | | `--summary-model ` | — | Override the Bedrock summary model id. | | `--skills` | off | Emit one `SKILL.md` per Community (≥5 symbols) under `.codehub/skills/`. | diff --git a/packages/docs/src/content/docs/reference/configuration.md b/packages/docs/src/content/docs/reference/configuration.md index 23a232d4..3507645d 100644 --- a/packages/docs/src/content/docs/reference/configuration.md +++ b/packages/docs/src/content/docs/reference/configuration.md @@ -56,7 +56,8 @@ When none of the above are set, the local ONNX backend |---|---| | `CODEHUB_DISABLE_SCIP` | Set to `1` to make the `scip-index` ingestion phase a no-op. Heuristic edges still flow. | | `CODEHUB_ALLOW_BUILD_SCRIPTS` | Set to `1` to allow SCIP indexers that require a build (Rust, Java) to run. Off by default for clean-room safety. | -| `CODEHUB_BEDROCK_DISABLED` | Set to `1` to disable the LLM summarize phase. 
Equivalent to `--no-summaries`. | +| `CODEHUB_BEDROCK_SUMMARIES` | Set to `1` to opt in to the LLM summarize phase. Equivalent to passing `--summaries` on every invocation; an explicit `--no-summaries` still wins. Off by default — `codehub analyze` runs fast, local, deterministic phases only. | +| `CODEHUB_BEDROCK_DISABLED` | Set to `1` to force-disable the LLM summarize phase. Like `--no-summaries`, but stronger: it also overrides `CODEHUB_BEDROCK_SUMMARIES=1` and an explicit `--summaries`. | | `NO_COLOR` | Standard convention; disables colored console output. | ## On-disk layout: `.codehub/`