Skip to content

Commit 0f319c7

Browse files
committed
feat(xsd): parser scaffolding (Phase 3b)
parseSchemaSet({ schemaDir, entrypoints }) loads a working set of XSDs, follows xsd:import schemaLocation references recursively, and indexes every top-level declaration (element/complexType/simpleType/group/attributeGroup/attribute) by canonical Clark-style qname. fast-xml-parser is configured with preserveOrder so that sibling order across different tag names is retained, and with value coercion disabled so that XSD attribute strings are not mutated.

Returns a typed schema set:
- documents: per-file metadata + raw schemaNode
- namespaceByPrefix: per-document prefix -> URI maps
- importGraph: per-document outgoing imports with resolved targets
- declarationsByQName: canonical qname -> declarations[]

QName resolution is conservative: declaration qnames use the document's target namespace; attribute qnames (ref/type/base) resolve through the document's prefix map and surface as { resolved: false } when the prefix or namespace is unknown rather than guessing.

No DB writes in this phase. The smoke command `bun run xsd:smoke` parses wml.xsd from the cache and reports counts (820 complexTypes, 389 simpleTypes, 67 groups, 47 elements, etc).

Also tightens DB test isolation: an afterAll TRUNCATE leaves the dev DB clean instead of carrying the last test's xsd_profiles row.
1 parent d6be885 commit 0f319c7

12 files changed

Lines changed: 802 additions & 17 deletions

File tree

bun.lock

Lines changed: 13 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"db:migrate": "bun scripts/db-migrate.ts",
2323
"db:sync-sources": "bun scripts/sync-sources.ts",
2424
"xsd:fetch": "bun scripts/fetch-xsd.ts",
25+
"xsd:smoke": "bun scripts/ingest-xsd/smoke.ts",
2526
"test": "bun test tests/",
2627
"ingest": "bun scripts/ingest/pipeline.ts",
2728
"ingest:chunk": "bun scripts/ingest/chunk.ts",
@@ -35,6 +36,7 @@
3536
"@semantic-release/exec": "^7.1.0",
3637
"@semantic-release/git": "^10.0.1",
3738
"@semantic-release/github": "^12.0.2",
39+
"fast-xml-parser": "^5.7.2",
3840
"lefthook": "^2.0.16",
3941
"semantic-release": "^25.0.2",
4042
"typescript": "~5.9.3"

scripts/ingest-xsd/ast.ts

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/**
2+
* Helpers for navigating the preserveOrder AST emitted by fast-xml-parser.
3+
*
4+
* AST shape: every element is a single-key object { tagName: children[], ":@"?: { "@_attrName": value } }.
5+
* Text nodes are { "#text": string }. Children always live in an array, so sibling
6+
* order is preserved across different tag names.
7+
*/
8+
9+
import type { PreserveOrderDocument, PreserveOrderNode } from "./types.ts";
10+
11+
/**
 * Strip an XML namespace prefix from a tag name: "xsd:element" → "element".
 *
 * Everything up to and including the first colon is removed; a name without
 * a colon is returned unchanged.
 *
 * @param tag - Tag name as it appears in the document, with or without a prefix.
 * @returns The part of `tag` after the first colon, or `tag` itself.
 */
export function stripPrefix(tag: string): string {
  const colon = tag.indexOf(":");
  return colon < 0 ? tag : tag.slice(colon + 1);
}
16+
17+
/**
 * Return the tag name of a preserveOrder node.
 *
 * The tag is the first own key that is not the attribute bag ":@". Note that a
 * text node ({ "#text": … }) therefore reports "#text"; callers that only want
 * elements must filter on the returned name.
 *
 * @param node - A node from the preserveOrder AST.
 * @returns The first key other than ":@", or null when the node has no such key.
 */
export function nodeTag(node: PreserveOrderNode): string | null {
  for (const key of Object.keys(node)) {
    if (key !== ":@") return key;
  }
  return null;
}
24+
25+
/**
 * Return the children array of a preserveOrder element.
 *
 * @param node - A node from the preserveOrder AST.
 * @returns The array stored under the node's tag key, or an empty array when
 *   the node has no tag key or the value under it is not an array (for
 *   example the string held by a text node).
 */
export function nodeChildren(node: PreserveOrderNode): PreserveOrderNode[] {
  const tag = nodeTag(node);
  if (!tag) return [];
  const value = node[tag];
  return Array.isArray(value) ? (value as PreserveOrderNode[]) : [];
}
32+
33+
/**
 * Return the attributes of a preserveOrder element as a plain name → value map.
 *
 * fast-xml-parser nests attributes under ":@" and prefixes each name with "@_";
 * the prefix is removed here. Keys without the prefix are kept as they are.
 *
 * @param node - A node from the preserveOrder AST.
 * @returns Attribute names mapped to string values. Non-string values are
 *   converted with `String()`; null and undefined values are dropped. Returns
 *   an empty object when the node carries no attribute bag.
 */
export function nodeAttrs(node: PreserveOrderNode): Record<string, string> {
  const raw = node[":@"];
  if (!raw || typeof raw !== "object") return {};

  const out: Record<string, string> = {};
  for (const [key, value] of Object.entries(raw as Record<string, unknown>)) {
    const name = key.startsWith("@_") ? key.slice(2) : key;
    if (typeof value === "string") out[name] = value;
    else if (value != null) out[name] = String(value);
  }
  return out;
}
45+
46+
/**
 * Find the first node in `nodes` whose prefix-stripped tag name is one of
 * `localNames`. Only the given array is scanned; descendants are not visited.
 *
 * Used to locate the xsd:schema root regardless of whether the file uses
 * `xsd:`, `xs:`, or no prefix.
 *
 * @param nodes - A parsed document or any array of sibling nodes.
 * @param localNames - Local (unprefixed) names to accept.
 * @returns The first matching node in array order, or null when none matches.
 */
export function findFirstByLocalName(
  nodes: PreserveOrderDocument | PreserveOrderNode[],
  localNames: string[],
): PreserveOrderNode | null {
  for (const node of nodes) {
    const tag = nodeTag(node);
    if (tag && localNames.includes(stripPrefix(tag))) return node;
  }
  return null;
}
61+
62+
/**
 * Iterate the immediate children of `parent` whose prefix-stripped tag name
 * equals `localName`, in document order.
 *
 * @param parent - The element whose direct children are scanned.
 * @param localName - Local (unprefixed) tag name to match.
 * @returns A generator yielding each matching child node.
 */
export function* eachChildByLocalName(
  parent: PreserveOrderNode,
  localName: string,
): Generator<PreserveOrderNode> {
  for (const child of nodeChildren(parent)) {
    const tag = nodeTag(child);
    if (tag && stripPrefix(tag) === localName) yield child;
  }
}

scripts/ingest-xsd/parse-schema.ts

Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
/**
2+
* Parse a working set of XSDs into an in-memory schema set.
3+
*
4+
* Walks xsd:import schemaLocation references recursively starting from
5+
* `entrypoints`, and indexes every top-level declaration by canonical qname.
6+
*
7+
* No DB writes here. Subsequent phases (3c+) walk documents/declarations to
8+
* produce xsd_symbols, edges, etc.
9+
*/
10+
11+
import { readFile } from "node:fs/promises";
12+
import { isAbsolute, normalize, relative, resolve, sep } from "node:path";
13+
import { XMLParser } from "fast-xml-parser";
14+
import { eachChildByLocalName, findFirstByLocalName, nodeAttrs, stripPrefix } from "./ast.ts";
15+
import { declarationQNameKey } from "./qname.ts";
16+
import type {
17+
Declaration,
18+
DeclarationKind,
19+
ImportEdge,
20+
ParsedSchemaDocument,
21+
ParsedSchemaSet,
22+
PreserveOrderDocument,
23+
PreserveOrderNode,
24+
} from "./types.ts";
25+
import { vocabularyForNamespace } from "./vocabulary.ts";
26+
27+
/**
 * Shared parser instance for all schema documents.
 *
 * Configured so that the AST keeps sibling order and attribute/text values
 * stay as the strings written in the XSD.
 */
const xmlParser = new XMLParser({
  // Emit the ordered-array AST so siblings with different tag names keep their order.
  preserveOrder: true,
  // Attributes carry the schema information (name, type, ref, …), so keep them.
  ignoreAttributes: false,
  // Must match the prefix that nodeAttrs() strips.
  attributeNamePrefix: "@_",
  // No coercion: values such as "1" or "true" stay strings.
  parseAttributeValue: false,
  parseTagValue: false,
  trimValues: true,
});
35+
36+
/**
 * Local tag names of schema children that are indexed as top-level
 * declarations, mapped to their declaration kind. Children with any other
 * local name are not indexed.
 */
const TOP_LEVEL_KINDS: Record<string, DeclarationKind> = {
  element: "element",
  complexType: "complexType",
  simpleType: "simpleType",
  group: "group",
  attributeGroup: "attributeGroup",
  attribute: "attribute",
};
44+
45+
/** Options for {@link parseSchemaSet}. */
export interface ParseSchemaSetOptions {
  /** Directory containing the XSD files; a relative path is resolved against the current working directory. */
  schemaDir: string;
  /** Schema files to start from, resolved against `schemaDir`. */
  entrypoints: string[];
}
49+
50+
/**
 * Load a working set of XSD files and index their top-level declarations.
 *
 * Starting from `opts.entrypoints`, each document is read from disk, parsed and
 * registered under its path relative to the schema directory. Every import with
 * a resolved target is queued, so the import closure is visited breadth-first
 * and each document is processed once, even when imports form a cycle.
 *
 * @param opts - Schema directory and entrypoint paths.
 * @returns Documents, per-document prefix maps, the import graph, and the
 *   declaration index keyed by canonical qname.
 * @throws Error when a document has no schema root element or no
 *   `targetNamespace`, or when an entrypoint or import resolves outside the
 *   schema directory. File-read and XML-parse failures propagate unchanged.
 */
export async function parseSchemaSet(opts: ParseSchemaSetOptions): Promise<ParsedSchemaSet> {
  const schemaDir = isAbsolute(opts.schemaDir) ? opts.schemaDir : resolve(opts.schemaDir);

  const documents = new Map<string, ParsedSchemaDocument>();
  const namespaceByPrefix = new Map<string, Map<string, string>>();
  const importGraph = new Map<string, ImportEdge[]>();
  const declarationsByQName = new Map<string, Declaration[]>();

  const queue: string[] = opts.entrypoints.map((p) => relPath(schemaDir, resolve(schemaDir, p)));

  // The array iterator re-checks the length on every step, so entries pushed
  // while iterating are visited too. This keeps FIFO order without shift().
  for (const relPathInDir of queue) {
    if (documents.has(relPathInDir)) continue;

    const absolutePath = resolve(schemaDir, relPathInDir);
    const text = await readFile(absolutePath, "utf-8");
    const ast = xmlParser.parse(text) as PreserveOrderDocument;

    const schemaNode = findFirstByLocalName(ast, ["schema"]);
    if (!schemaNode) {
      throw new Error(`No xsd:schema root in ${absolutePath}`);
    }

    const attrs = nodeAttrs(schemaNode);
    const targetNamespace = attrs.targetNamespace;
    if (!targetNamespace) {
      throw new Error(`Schema in ${absolutePath} is missing targetNamespace`);
    }

    const prefixes = extractNamespacePrefixes(attrs);
    const imports = extractImports(schemaNode, schemaDir, absolutePath);

    const doc: ParsedSchemaDocument = {
      path: relPathInDir,
      absolutePath,
      targetNamespace,
      vocabularyId: vocabularyForNamespace(targetNamespace),
      schemaNode,
    };

    documents.set(relPathInDir, doc);
    namespaceByPrefix.set(relPathInDir, prefixes);
    importGraph.set(relPathInDir, imports);

    indexTopLevelDeclarations(doc, declarationsByQName);

    // Follow only imports that resolved to a file and have not been loaded yet.
    for (const edge of imports) {
      if (edge.target && !documents.has(edge.target)) {
        queue.push(edge.target);
      }
    }
  }

  return { documents, namespaceByPrefix, importGraph, declarationsByQName };
}
105+
106+
/**
 * Collect the namespace declarations among a schema root's attributes.
 *
 * @param attrs - Attribute map of the schema element, names without the "@_" prefix.
 * @returns Prefix → namespace URI. The default namespace (`xmlns="…"`) is
 *   stored under the empty-string prefix; attributes that are not namespace
 *   declarations are ignored.
 */
function extractNamespacePrefixes(attrs: Record<string, string>): Map<string, string> {
  const map = new Map<string, string>();
  for (const [name, value] of Object.entries(attrs)) {
    if (name === "xmlns") map.set("", value);
    else if (name.startsWith("xmlns:")) map.set(name.slice("xmlns:".length), value);
  }
  return map;
}
114+
115+
/** Matches a schemaLocation that begins with a URI scheme, e.g. "http://…". */
const REMOTE_SCHEMA_LOCATION = /^[a-z][a-z0-9+.-]*:\/\//i;

/**
 * Read the import children of a schema root and resolve their targets.
 *
 * @param schemaNode - The schema root element.
 * @param schemaDir - Absolute schema directory; targets are expressed relative to it.
 * @param currentAbsPath - Absolute path of the document being processed;
 *   relative schemaLocation values are resolved against its directory.
 * @returns One edge per import, in document order. `target` is null when the
 *   import has no schemaLocation or when the location is a remote URI that
 *   cannot be mapped to a file in the schema directory.
 * @throws Error when a local schemaLocation resolves outside `schemaDir`.
 */
function extractImports(
  schemaNode: PreserveOrderNode,
  schemaDir: string,
  currentAbsPath: string,
): ImportEdge[] {
  const imports: ImportEdge[] = [];
  for (const importNode of eachChildByLocalName(schemaNode, "import")) {
    const a = nodeAttrs(importNode);
    const schemaLocation = a.schemaLocation ?? null;
    let target: string | null = null;
    // A URI such as "http://www.w3.org/2001/xml.xsd" is not a path: resolving it
    // would produce a bogus file name under schemaDir and fail later on read.
    if (schemaLocation && !REMOTE_SCHEMA_LOCATION.test(schemaLocation)) {
      // resolve(file, "..", x) resolves x against the directory containing the file.
      const importedAbs = resolve(currentAbsPath, "..", schemaLocation);
      target = relPath(schemaDir, importedAbs);
    }
    imports.push({
      namespace: a.namespace ?? "",
      schemaLocation,
      target,
    });
  }
  return imports;
}
137+
138+
/**
 * Add every named top-level declaration of `doc` to the declaration index.
 *
 * Only direct children of the schema root whose local tag name is listed in
 * TOP_LEVEL_KINDS and which carry a non-empty `name` attribute are indexed.
 * Declarations sharing the same key are appended to the same array.
 *
 * @param doc - The parsed document whose schema root is scanned.
 * @param declarationsByQName - Index to update in place.
 */
function indexTopLevelDeclarations(
  doc: ParsedSchemaDocument,
  declarationsByQName: Map<string, Declaration[]>,
): void {
  for (const child of nodeChildrenLocal(doc.schemaNode)) {
    const tag = nodeTagLocal(child);
    if (!tag) continue;
    const local = stripPrefix(tag);
    // Own-property check: a bare lookup on a plain object would also match
    // inherited members such as "constructor" or "toString".
    if (!Object.prototype.hasOwnProperty.call(TOP_LEVEL_KINDS, local)) continue;
    const kind = TOP_LEVEL_KINDS[local];
    if (!kind) continue;

    const a = nodeAttrs(child);
    const localName = a.name;
    if (!localName) continue;

    const decl: Declaration = {
      kind,
      namespace: doc.targetNamespace,
      vocabularyId: doc.vocabularyId,
      localName,
      documentPath: doc.path,
      node: child,
    };
    const key = declarationQNameKey(doc.targetNamespace, kind, localName);
    const arr = declarationsByQName.get(key);
    if (arr) arr.push(decl);
    else declarationsByQName.set(key, [decl]);
  }
}
167+
168+
// Local helpers (avoid pulling extra exports from ast.ts).

/**
 * Return the first key of `node` other than the attribute bag ":@", or null
 * when the node has no such key.
 */
function nodeTagLocal(node: PreserveOrderNode): string | null {
  for (const key of Object.keys(node)) {
    if (key !== ":@") return key;
  }
  return null;
}
173+
/**
 * Return the array stored under the node's tag key, or an empty array when
 * the node has no tag key or the value under it is not an array.
 */
function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] {
  const tag = nodeTagLocal(node);
  if (!tag) return [];
  const value = node[tag];
  return Array.isArray(value) ? (value as PreserveOrderNode[]) : [];
}
179+
180+
/**
 * Express `abs` relative to `base`, refusing paths that leave `base`.
 *
 * @param base - Absolute directory that must contain the result.
 * @param abs - Absolute path to convert; it is normalized first.
 * @returns `abs` relative to `base`, using the platform separator.
 * @throws Error when the path lies outside `base`.
 */
function relPath(base: string, abs: string): string {
  const r = relative(base, normalize(abs));
  // Guard against escapes outside schemaDir. Besides a leading "..", the
  // result can be an absolute path when no relative route exists (on Windows,
  // a target on another drive), which would also escape when re-resolved.
  if (r === ".." || r.startsWith(`..${sep}`) || isAbsolute(r)) {
    throw new Error(`Resolved path escapes schemaDir: ${abs} (base ${base})`);
  }
  return r;
}

0 commit comments

Comments
 (0)