Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ docs/proposals/
local/
test_output.txt
scripts/verify-counts.ts
scripts/.parity-out/
scripts/.parity-tmp-entry.ts
marketing/

# Playwright test results
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@
"analyze:token-inventory": "npx tsx scripts/token-inventory.ts --save",
"analyze:context-composition": "npx tsx scripts/explore-context-composition.ts",
"analyze:e2e-real-data": "npx tsx scripts/e2e-real-data.ts",
"parity": "npx tsx scripts/parse-parity.ts",
"analyze:catalog-fetch": "node scripts/test-catalog-fetch.mjs",
"smoke-test": "node scripts/smoke-test.mjs",
"gen-synth-logs": "npx tsx scripts/generate-synthetic-logs.ts",
Expand Down
85 changes: 85 additions & 0 deletions scripts/parity-parse-entry.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE in the project root for license information.
*--------------------------------------------------------------------------------------------*/

/*
* Parse-parity entry. Bundled and run once per source tree by scripts/parse-parity.ts.
*
* It parses the real local logs with `parseAllLogs` and writes a deterministic, canonical
* representation of the ParseResult (stable key order, sorted collections) as line-oriented text:
* one tab-separated `TAG<TAB>id<TAB>JSON` record per line (the file keeps an .ndjson name, but a
* line is not a bare JSON document). It also writes a SHA-256 of that text. Two trees that produce
* the same SHA produced byte-for-byte identical canonical output — the check this repo uses to
* prove a refactor (e.g. the O(n^2) JSONL streaming fix) changed performance only, not the parsed
* result.
*
* Usage (driven by parse-parity.ts, not run directly):
* node <bundle>.cjs <outNdjsonPath>
*/

import * as crypto from 'crypto';
import * as fs from 'fs';
import { findLogsDirs, parseAllLogs } from '../src/core/parser';

/**
 * Recursively canonicalize a value so its JSON serialization does not depend on insertion order.
 *
 * - Arrays keep their element order; every element is canonicalized.
 * - `Date` becomes its `toJSON()` value, which is what `JSON.stringify` would have emitted for it.
 * - `Map` becomes an array of `[key, value]` pairs ordered by the JSON text of the canonical key.
 * - `Set` becomes an array of members ordered by the JSON text of the canonical member.
 * - Any other object is rebuilt from its own enumerable keys in sorted order.
 * - Primitives (and `null`) are returned unchanged.
 *
 * Date, Map and Set need explicit handling because they expose no own enumerable keys: the
 * generic rebuild would turn each of them into `{}` and two different values would hash the same.
 *
 * @param v Any value reachable from the parsed result.
 * @returns A structure that `JSON.stringify` serializes deterministically.
 */
function sortValue(v: unknown): unknown {
  if (Array.isArray(v)) return v.map(sortValue);

  // Must run before the generic object branch, which would discard the timestamp.
  if (v instanceof Date) return v.toJSON();

  if (v instanceof Map) {
    const pairs = [...v.entries()].map(([key, value]) => {
      const canonicalKey = sortValue(key);
      return {
        // String(...) guards against JSON.stringify returning undefined for unserializable keys.
        order: String(JSON.stringify(canonicalKey)),
        pair: [canonicalKey, sortValue(value)],
      };
    });
    pairs.sort((a, b) => (a.order < b.order ? -1 : a.order > b.order ? 1 : 0));
    return pairs.map((p) => p.pair);
  }

  if (v instanceof Set) {
    const members = [...v.values()].map((member) => {
      const canonical = sortValue(member);
      return { order: String(JSON.stringify(canonical)), value: canonical };
    });
    members.sort((a, b) => (a.order < b.order ? -1 : a.order > b.order ? 1 : 0));
    return members.map((m) => m.value);
  }

  if (v && typeof v === 'object') {
    const source = v as Record<string, unknown>;
    const out: Record<string, unknown> = {};
    for (const k of Object.keys(source).sort()) {
      out[k] = sortValue(source[k]);
    }
    return out;
  }
  return v;
}

/** Serialize a value as JSON after passing it through `sortValue`, so key order is deterministic. */
function stable(v: unknown): string {
  const canonical = sortValue(v);
  return JSON.stringify(canonical);
}

/**
 * Parse every discovered logs directory and write the canonical dump plus its SHA-256.
 *
 * The output path is taken from the first CLI argument. Each record is one line of the form
 * `TAG<TAB>id<TAB>json`, emitted in four groups (WS, SESSION, EDIT, SRC), each group ordered by id.
 * The hash is written next to the dump as `<outPath>.sha256`, and a one-line summary is printed.
 *
 * @throws Error when no output path was supplied.
 */
function main(): void {
  const [, , outPath] = process.argv;
  if (!outPath) throw new Error('usage: <bundle>.cjs <outNdjsonPath>');

  const result = parseAllLogs(findLogsDirs());
  const lines: string[] = [];

  // Workspaces, ordered by workspace id.
  const workspaceIds = [...result.workspaces.keys()].sort();
  for (const id of workspaceIds) {
    lines.push(`WS\t${id}\t${stable(result.workspaces.get(id))}`);
  }

  // Sessions, ordered by sessionId so directory-listing order cannot leak into the output.
  const orderedSessions = [...result.sessions].sort((left, right) => {
    if (left.sessionId < right.sessionId) return -1;
    if (left.sessionId > right.sessionId) return 1;
    return 0;
  });
  for (const session of orderedSessions) {
    lines.push(`SESSION\t${session.sessionId}\t${stable(session)}`);
  }

  // Edit-location index: one record per request id, holding a uri -> count object.
  const requestIds = [...result.editLocIndex.keys()].sort();
  for (const reqId of requestIds) {
    const perUri = result.editLocIndex.get(reqId)!;
    const counts: Record<string, number> = {};
    for (const uri of [...perUri.keys()].sort()) {
      counts[uri] = perUri.get(uri)!;
    }
    lines.push(`EDIT\t${reqId}\t${stable(counts)}`);
  }

  // Session-source index, ordered by session id.
  const sourceIds = [...result.sessionSourceIndex.keys()].sort();
  for (const sid of sourceIds) {
    lines.push(`SRC\t${sid}\t${stable(result.sessionSourceIndex.get(sid))}`);
  }

  const body = lines.join('\n');
  const hasher = crypto.createHash('sha256');
  hasher.update(body);
  const sha = hasher.digest('hex');

  fs.writeFileSync(outPath, body);
  fs.writeFileSync(`${outPath}.sha256`, sha);

  const summary =
    ` sessions=${result.sessions.length} workspaces=${result.workspaces.size} ` +
    `editLoc=${result.editLocIndex.size} sources=${result.sessionSourceIndex.size} sha256=${sha}`;
  // eslint-disable-next-line no-console
  console.log(summary);
}

main();
147 changes: 147 additions & 0 deletions scripts/parse-parity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See LICENSE in the project root for license information.
*--------------------------------------------------------------------------------------------*/

/*
* Parsed-output parity check between two source trees (typically this branch vs. main).
*
* Why this exists: a performance refactor of the parser (e.g. the O(n^2) JSONL streaming fix in
* parser-vscode-files.ts) must change speed only, not the parsed result. There is no golden
* snapshot in the repo, and the harness itself only lives on the feature branch — so instead of
* comparing against a committed baseline, it builds the parser from BOTH trees and diffs their
* canonical output against the same real local logs.
*
* How: for each tree it copies scripts/parity-parse-entry.ts in, esbuild-bundles it against that
* tree's own src/, runs it to emit canonical NDJSON + a SHA-256, then compares the two hashes.
* Equal hash => byte-for-byte identical parsed output.
*
* Run:
* npm run parity # compares branch vs. the default main checkout
* npm run parity -- --main C:\path\to\main-checkout
*
* Notes:
* - The "main" tree must be a checkout of this repo with node_modules installed.
* - The main side runs the unoptimized parser, so a large real log set can take a couple of
* minutes there — that slowness is exactly what the branch fixes.
* - Output artifacts land in scripts/.parity-out/ (gitignored).
*/

import { spawnSync } from 'child_process';
import { build } from 'esbuild';
import * as fs from 'fs';
import * as path from 'path';

/** Absolute path of the directory one level above this script's own directory (the branch tree root). */
const branchRoot = path.resolve(__dirname, '..');

/**
 * Find the working tree that has `main` checked out, using `git worktree list --porcelain` run
 * from the branch tree root.
 *
 * The output is scanned line by line: every `worktree <path>` line records a candidate path, and
 * the first `branch refs/heads/main` line seen after a candidate selects it.
 *
 * @returns The resolved path of that worktree, or `undefined` when git exits non-zero, produces no
 *          output, or no listed worktree is on `main`.
 */
function findMainWorktree(): string | undefined {
  const worktreePrefix = 'worktree ';
  const { status, stdout } = spawnSync('git', ['worktree', 'list', '--porcelain'], {
    cwd: branchRoot,
    encoding: 'utf8',
  });
  if (status !== 0 || !stdout) return undefined;

  let candidate: string | undefined;
  for (const rawLine of stdout.split('\n')) {
    if (rawLine.startsWith(worktreePrefix)) {
      candidate = rawLine.slice(worktreePrefix.length).trim();
      continue;
    }
    // trim() also drops a trailing '\r' when git emits CRLF line endings.
    if (rawLine.trim() === 'branch refs/heads/main' && candidate) {
      return path.resolve(candidate);
    }
  }
  return undefined;
}

/**
 * Decide which checkout is used as the `main` side of the comparison.
 *
 * Sources are tried in this order; the first one that yields a value wins:
 *   1. the argument following `--main` on the command line,
 *   2. the `PARITY_MAIN` environment variable,
 *   3. the main worktree reported by git, accepted only if it contains src/core/parser.ts.
 *
 * @returns The resolved path of the chosen checkout.
 * @throws Error with guidance when none of the sources produced a usable path.
 */
function parseMainRoot(): string {
  const cliArgs = process.argv.slice(2);
  const flagAt = cliArgs.indexOf('--main');
  const cliValue = flagAt >= 0 ? cliArgs[flagAt + 1] : undefined;
  if (cliValue) return path.resolve(cliValue);

  const envValue = process.env.PARITY_MAIN;
  if (envValue) return path.resolve(envValue);

  // Last resort: ask git where the main worktree is instead of assuming a directory layout.
  const discovered = findMainWorktree();
  if (discovered && fs.existsSync(path.join(discovered, 'src', 'core', 'parser.ts'))) {
    return discovered;
  }

  throw new Error(
    'parse-parity: could not locate a main checkout to compare against. Pass --main <path>, set ' +
      'PARITY_MAIN, or add a worktree for the main branch (git worktree add <path> main).',
  );
}

// Checkout used as the `main` side; resolved once at module load, which throws if none is found.
const mainRoot = parseMainRoot();
// Directory under the branch tree that receives the bundles and the canonical dumps.
const outDir = path.join(branchRoot, 'scripts', '.parity-out');
// Text of the shared entry script, read from the branch tree so both trees run the same entry.
const entrySrc = fs.readFileSync(path.join(branchRoot, 'scripts', 'parity-parse-entry.ts'), 'utf8');

/**
 * Bundle and run the shared parity entry against one source tree.
 *
 * Steps: check that the tree has src/core/parser.ts, write the shared entry script to
 * `<root>/scripts/.parity-tmp-entry.ts`, bundle it with esbuild into the output directory, run the
 * bundle with the current Node binary, then read back the hash the bundle wrote. The temporary
 * entry file is removed whether or not any step fails.
 *
 * @param label Short tag for the tree; used for output file names and in messages.
 * @param root  Root directory of the tree to build the parser from.
 * @returns The SHA-256 hex string and the path of the canonical dump for this tree.
 * @throws Error when the tree has no parser source, the child process cannot be started, it is
 *         terminated by a signal, or it exits with a non-zero code. Bundling and file-system
 *         errors propagate unchanged.
 */
async function runFor(label: string, root: string): Promise<{ sha: string; ndjson: string }> {
  if (!fs.existsSync(path.join(root, 'src', 'core', 'parser.ts'))) {
    throw new Error(`${label}: no src/core/parser.ts under ${root}`);
  }
  const tmpEntry = path.join(root, 'scripts', '.parity-tmp-entry.ts');
  const bundle = path.join(outDir, `${label}.cjs`);
  const ndjson = path.join(outDir, `${label}.ndjson`);
  try {
    // Inject the identical entry + canonicalizer into the target tree so ONLY the parser differs.
    // Written inside the try so a failed or partial write is still cleaned up by the finally.
    fs.writeFileSync(tmpEntry, entrySrc);
    await build({
      entryPoints: [tmpEntry],
      bundle: true,
      platform: 'node',
      format: 'cjs',
      outfile: bundle,
      absWorkingDir: root, // resolve ../src and node_modules from the target tree
      logLevel: 'error',
    });
    // eslint-disable-next-line no-console
    console.log(`\n[${label}] parsing real logs (${root})...`);
    const res = spawnSync(process.execPath, [bundle, ndjson], { stdio: 'inherit' });
    // spawnSync reports a launch failure via `error` and a signal kill via `signal`; in both
    // cases `status` is null, so checking only the status would print "exited with code null".
    if (res.error) {
      throw new Error(`${label}: could not start parse process: ${res.error.message}`);
    }
    if (res.signal) throw new Error(`${label}: parse terminated by signal ${res.signal}`);
    if (res.status !== 0) throw new Error(`${label}: parse exited with code ${res.status}`);
    const sha = fs.readFileSync(`${ndjson}.sha256`, 'utf8').trim();
    return { sha, ndjson };
  } finally {
    // force: true makes this a no-op when the file was never created.
    fs.rmSync(tmpEntry, { force: true });
  }
}

/**
 * Print the first line at which the two dumps differ, followed by both line counts.
 *
 * Both files are read whole and split on '\n'. Each side of the differing line is shown truncated
 * to 400 characters; a side that has run out of lines is shown as `<missing>`. When no line
 * differs, only the line counts are printed.
 *
 * @param branchNdjson Path of the dump produced from the branch tree.
 * @param mainNdjson   Path of the dump produced from the main tree.
 */
function reportFirstDiff(branchNdjson: string, mainNdjson: string): void {
  const preview = (text: string | undefined): string =>
    text === undefined ? '<missing>' : text.slice(0, 400);

  const branchLines = fs.readFileSync(branchNdjson, 'utf8').split('\n');
  const mainLines = fs.readFileSync(mainNdjson, 'utf8').split('\n');
  const longest = Math.max(branchLines.length, mainLines.length);

  // Advance past the common prefix; indexing past the shorter file yields undefined.
  let at = 0;
  while (at < longest && branchLines[at] === mainLines[at]) at++;

  if (at < longest) {
    // eslint-disable-next-line no-console
    console.log(`\nFirst difference at line ${at + 1}:`);
    // eslint-disable-next-line no-console
    console.log(` branch: ${preview(branchLines[at])}`);
    // eslint-disable-next-line no-console
    console.log(` main: ${preview(mainLines[at])}`);
  }
  // eslint-disable-next-line no-console
  console.log(`\n(branch lines=${branchLines.length}, main lines=${mainLines.length})`);
}

/**
 * Drive the whole check: build and run the entry for the branch tree, then for the main tree,
 * print both hashes and compare them.
 *
 * The process exits with code 0 when the hashes are equal. Otherwise the first differing line is
 * reported and the process exits with code 1.
 */
async function run(): Promise<void> {
  fs.mkdirSync(outDir, { recursive: true });
  /* eslint-disable no-console */
  const banner = ['Parse-parity check', ` branch tree: ${branchRoot}`, ` main tree: ${mainRoot}`];
  for (const text of banner) console.log(text);

  // Sequential on purpose of preserving the original order: branch first, then main.
  const branch = await runFor('branch', branchRoot);
  const main = await runFor('main', mainRoot);

  console.log(`\nbranch sha256 = ${branch.sha}`);
  console.log(`main sha256 = ${main.sha}`);

  const identical = branch.sha === main.sha;
  if (!identical) {
    console.log('\nPARITY FAILED — parsed output differs between the two trees.');
    reportFirstDiff(branch.ndjson, main.ndjson);
    process.exit(1);
  }
  console.log('\nPARITY OK — parsed output is byte-for-byte identical across both trees.');
  process.exit(0);
  /* eslint-enable no-console */
}

void run();
51 changes: 50 additions & 1 deletion src/core/parser-shared.ts
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,56 @@ export function maybeForceGc(): void {
if (now - lastForcedGcAt < FORCE_GC_MIN_INTERVAL_MS) return;
if (process.memoryUsage().rss < FORCE_GC_RSS_THRESHOLD) return;
lastForcedGcAt = now;
try { gc(); } catch { /* gc unavailable */ }
// Count only a GC that actually ran, so `forcedGc` matches its documented meaning even if the
// engine's gc() ever throws (the !gc guard above already handles the unavailable case).
try {
gc();
parseTiming.forcedGc++;
} catch { /* gc unavailable */ }
}

/* ---- Cold-parse sub-phase attribution (issue #106 follow-up) ----
* The end-to-end `sync-timing` line proved phase-2 parse is ~99% of sync wall-clock. These
* counters break that phase down by sub-step so a single re-run shows where the time actually
* goes (chat-file parse vs edit-state parse vs CLI events vs forced-GC pauses) instead of guessing.
* Pure measurement: never alters parsed output. Reset per parse run. */
/** Counters describing where cold-parse time went during one parse run. */
export interface ParseTiming {
  /** Total ms attributed to parseSessionFile (read + strip + JSON.parse + request build). */
  chatMs: number;
  /** Total ms attributed to parsing chatEditingSessions state files. */
  editMs: number;
  /** Total ms attributed to parsing CLI events.jsonl files. */
  cliMs: number;
  /** How many chat session files were parsed. */
  chatFiles: number;
  /** How many edit-state files were parsed. */
  editFiles: number;
  /** How many times a proactive full GC was actually forced. */
  forcedGc: number;
}

/** Build a counter record with every field at zero. */
function zeroParseTiming(): ParseTiming {
  return { chatMs: 0, editMs: 0, cliMs: 0, chatFiles: 0, editFiles: 0, forcedGc: 0 };
}

// Module-level accumulator; mutated in place so other code in this module can bump it directly.
const parseTiming: ParseTiming = zeroParseTiming();

/**
 * Add elapsed time to the counter for one sub-step.
 *
 * 'chat' and 'edit' also bump their file counters; any other kind only adds to `cliMs`.
 *
 * @param kind Which sub-step the time belongs to.
 * @param ms   Elapsed milliseconds to add.
 */
export function addParseTiming(kind: 'chat' | 'edit' | 'cli', ms: number): void {
  switch (kind) {
    case 'chat':
      parseTiming.chatMs += ms;
      parseTiming.chatFiles += 1;
      break;
    case 'edit':
      parseTiming.editMs += ms;
      parseTiming.editFiles += 1;
      break;
    default:
      parseTiming.cliMs += ms;
  }
}

/** Set every counter back to zero; intended to be called at the start of a parse run. */
export function resetParseTiming(): void {
  Object.assign(parseTiming, zeroParseTiming());
}

/** Return a shallow copy of the counters, so callers cannot mutate the live accumulator. */
export function getParseTiming(): ParseTiming {
  const snapshot: ParseTiming = { ...parseTiming };
  return snapshot;
}

/**
 * Matches a triple-backtick fenced block: an optional `\w+` tag right after the opening fence
 * (capture group 1), a newline, then the body matched lazily up to the next fence (group 2).
 * The `g` flag is set, so `exec`/`test` on this shared instance advance its `lastIndex`.
 */
export const CODE_BLOCK_RE = /```(\w+)?\n([\s\S]*?)```/g;
Expand Down
21 changes: 21 additions & 0 deletions src/core/parser-vscode-files.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,17 @@ describe('forEachJsonlLine (sync streaming reader)', () => {
expect(lines[1]).toBe('b');
});
});

it('emits a single line that spans many read chunks intact', () => {
  // Regression guard for the O(n^2) re-split (issue #106 follow-up): a Copilot CLI events.jsonl
  // line can carry tens of MB of inline base64, so the reader has to look for newlines only in
  // the chunk it just read. The long line below is longer than three read chunks.
  const longLine = 'x'.repeat(READ_CHUNK * 3 + 512);
  withTempFile('huge.jsonl', `first\n${longLine}\nlast\n`, (filePath) => {
    const seen = collectLines((onLine) => forEachJsonlLine(filePath, onLine));
    expect(seen).toEqual(['first', longLine, 'last']);
  });
});
});

describe('forEachJsonlLineAsync (async streaming reader)', () => {
Expand All @@ -404,6 +415,16 @@ describe('forEachJsonlLineAsync (async streaming reader)', () => {
expect(lines).toEqual(['one', 'two', 'last']);
});

it('emits a single line that spans many read chunks intact', async () => {
  // Async counterpart of the sync long-line guard: the async cold-parse path streams the large
  // CLI events.jsonl files, where one inline-base64 line can be longer than several chunks.
  const longLine = 'y'.repeat(READ_CHUNK * 3 + 512);
  const filePath = makeTempFile('huge-async.jsonl', `first\n${longLine}\nlast\n`);
  const seen: string[] = [];
  await forEachJsonlLineAsync(filePath, (line) => seen.push(line));
  expect(seen).toEqual(['first', longLine, 'last']);
});

it('reports final byte progress equal to the file size', async () => {
const fp = makeTempFile('a.jsonl', 'one\ntwo\n');
const total = fs.statSync(fp).size;
Expand Down
Loading
Loading