Skip to content

Commit a1d7cba

Browse files
committed
feat(xsd): symbol + inheritance ingest (Phase 3c)
ingestSchemaSet wraps parseSchemaSet and writes:
- xsd_profiles (bootstrap target profile)
- xsd_namespaces (one per unique URI)
- xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert)
- xsd_symbol_profiles (membership for the target profile, with source_id)
- xsd_inheritance_edges (extension/restriction from complexContent/simpleContent and simpleType/restriction)

The whole ingest runs in one transaction. Re-runs are no-ops via UNIQUE + ON CONFLICT DO NOTHING; stale-row cleanup is deferred per PLAN.md's edition-flip open item.

QName base resolution uses the document's prefix map. Built-in xsd:* bases are auto-created on demand as kind=simpleType in vocabulary xsd-builtin so the FK on xsd_inheritance_edges.base_symbol_id holds.

Phase 3c does not touch compositors, child edges, attributes, group refs, or enums (those are 3d/3e).

Tests: fixture-driven happy path, idempotency check, plus an optional real-cache smoke test against the WML closure (12 docs, ~1359 symbols, ~389 inheritance edges, all bases resolved).

Fixture main.xsd gains CT_Extended (extends CT_Empty) and CT_Restricted (restricts CT_Para) so the inheritance walker is exercised on both forms; existing parser test counts adjusted to match.
1 parent 0f319c7 commit a1d7cba

8 files changed

Lines changed: 610 additions & 14 deletions

File tree

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
"db:sync-sources": "bun scripts/sync-sources.ts",
2424
"xsd:fetch": "bun scripts/fetch-xsd.ts",
2525
"xsd:smoke": "bun scripts/ingest-xsd/smoke.ts",
26+
"xsd:ingest": "bun scripts/ingest-xsd/ingest.ts",
2627
"test": "bun test tests/",
2728
"ingest": "bun scripts/ingest/pipeline.ts",
2829
"ingest:chunk": "bun scripts/ingest/chunk.ts",

scripts/ingest-xsd/ingest.ts

Lines changed: 376 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,376 @@
1+
/**
2+
* Phase 3c: ingest top-level symbols and inheritance edges.
3+
*
4+
* Walks parseSchemaSet output and writes:
5+
* - xsd_profiles (bootstrap target profile, idempotent)
6+
* - xsd_namespaces (one row per unique URI seen across documents)
7+
* - xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert by natural key)
8+
* - xsd_symbol_profiles (membership for the target profile, with source_id)
9+
* - xsd_inheritance_edges (extension/restriction from complexContent/simpleContent
10+
* and simpleType/restriction)
11+
*
12+
* NOT touched here (Phases 3d/3e):
13+
* - xsd_compositors, xsd_child_edges (content models)
14+
* - xsd_attr_edges (attributes)
15+
* - xsd_group_edges (group/attributeGroup refs)
16+
* - xsd_enums (simpleType enumerations)
17+
*
18+
* Idempotency: the entire ingest runs in a single transaction. Re-running
19+
* against the same source produces no new rows (UNIQUE + ON CONFLICT DO NOTHING).
20+
* Stale-row cleanup (when symbols vanish in a future edition) is deferred,
21+
* see PLAN.md "Edition flip and behavior_notes" open item.
22+
*
23+
* Usage as a library:
24+
* await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, db })
25+
*
26+
* Usage as a CLI:
27+
* bun scripts/ingest-xsd/ingest.ts
28+
* bun scripts/ingest-xsd/ingest.ts --schema-dir <dir> --entrypoint wml.xsd \
29+
* --profile transitional --source ecma-376-transitional
30+
*/
31+
32+
import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts";
33+
import { nodeAttrs } from "./ast.ts";
34+
import { parseSchemaSet } from "./parse-schema.ts";
35+
import { resolveQNameAttr } from "./qname.ts";
36+
import type {
37+
Declaration,
38+
ParsedSchemaSet,
39+
PreserveOrderNode,
40+
} from "./types.ts";
41+
import { vocabularyForNamespace } from "./vocabulary.ts";
42+
43+
/**
 * Type of the SQL handle threaded through the DB helpers in this file, where
 * it is used as a tagged-template function. Deliberately left untyped.
 */
// biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic.
type Sql = any;
45+
46+
/** Inputs accepted by `ingestSchemaSet`. */
export interface IngestSchemaSetOptions {
  /** Schema directory, forwarded unchanged to `parseSchemaSet`. */
  schemaDir: string;
  /** Entry-point schema files, forwarded unchanged to `parseSchemaSet`. */
  entrypoints: string[];
  /** Name of the profile symbols are attached to (e.g. "transitional"); created if absent. */
  profileName: string;
  /** Name of the `reference_sources` row whose id is stored as `source_id` on xsd_symbol_profiles. */
  sourceName: string;
  /** Already-open DbClient; the ingest starts its own transaction on it. */
  db: DbClient;
}
56+
57+
/** Counters reported by `ingestSchemaSet` once the transaction has completed. */
export interface IngestStats {
  /** Number of documents in the parsed schema set. */
  documents: number;
  /** Symbol upserts that created a new xsd_symbols row. */
  symbolsInserted: number;
  /** Symbol upserts that hit an already-existing xsd_symbols row. */
  symbolsExisting: number;
  /** Distinct document target namespaces for which a namespace row was ensured. */
  namespacesEnsured: number;
  /** Newly created xsd_symbol_profiles rows. */
  profileMembershipsInserted: number;
  /** Newly created xsd_inheritance_edges rows. */
  inheritanceEdgesInserted: number;
  /** Declarations with a `base` whose base symbol could not be resolved. */
  inheritanceUnresolved: number;
}
66+
67+
/**
 * Parse an XSD schema set and persist its top-level symbols and inheritance
 * edges for a single profile.
 *
 * Parsing happens first; all database writes then run inside one
 * `opts.db.sql.begin` transaction:
 *   1. ensure the profile row and look up the reference source id;
 *   2. ensure one namespace row per distinct document target namespace;
 *   3. upsert every declared symbol and link it to the profile;
 *   4. for every declaration that names a `base`, resolve it and record an
 *      extension/restriction edge.
 *
 * @param opts Schema location, profile/source names and the database client.
 * @returns Counters for what was parsed, inserted, already present, or left
 *   unresolved.
 * @throws If the `reference_sources` row for `opts.sourceName` is missing, or
 *   if a declaration's namespace was not the target namespace of any parsed
 *   document.
 */
export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<IngestStats> {
  const parseResult = await parseSchemaSet({
    schemaDir: opts.schemaDir,
    entrypoints: opts.entrypoints,
  });

  const stats: IngestStats = {
    documents: parseResult.documents.size,
    symbolsInserted: 0,
    symbolsExisting: 0,
    namespacesEnsured: 0,
    profileMembershipsInserted: 0,
    inheritanceEdgesInserted: 0,
    inheritanceUnresolved: 0,
  };

  // Kinds probed, in this order, when looking up an inheritance base among the
  // symbols ingested in pass 1. Loop-invariant, so built once.
  const baseCandidateKinds: ReadonlyArray<Declaration["kind"]> = [
    "complexType",
    "simpleType",
    "element",
    "group",
    "attributeGroup",
    "attribute",
  ];

  await opts.db.sql.begin(async (sql: Sql) => {
    const profileId = await ensureProfile(sql, opts.profileName);
    const sourceId = await lookupSourceId(sql, opts.sourceName);

    // Pass 1: namespaces, symbols, profile memberships.
    const namespaceIds = new Map<string, number>();
    const symbolIds = new Map<string, number>(); // canonical (vocab|local|kind) -> id

    for (const doc of parseResult.documents.values()) {
      if (namespaceIds.has(doc.targetNamespace)) continue;
      const id = await ensureNamespace(sql, doc.targetNamespace);
      namespaceIds.set(doc.targetNamespace, id);
      stats.namespacesEnsured++;
    }

    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const key = symbolKey(decl.vocabularyId, decl.localName, decl.kind);
        // A canonical symbol is upserted and linked once per run.
        if (symbolIds.has(key)) continue;

        const { id, inserted } = await upsertSymbol(
          sql,
          decl.vocabularyId,
          decl.localName,
          decl.kind,
        );
        symbolIds.set(key, id);
        if (inserted) stats.symbolsInserted++;
        else stats.symbolsExisting++;

        const nsId = namespaceIds.get(decl.namespace);
        // Compare against null/undefined explicitly so the guard reflects a
        // missing map entry rather than the numeric value of the id.
        if (nsId == null) {
          throw new Error(
            `Internal: missing namespace id for ${decl.namespace} (decl ${decl.localName})`,
          );
        }
        const linked = await linkSymbolToProfile(sql, id, profileId, nsId, sourceId);
        if (linked) stats.profileMembershipsInserted++;
      }
    }

    // Pass 2: inheritance edges. Resolve base qname through the document's
    // prefix map; ensure built-in xsd:* placeholders exist on demand.
    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const inherit = findInheritance(decl);
        if (!inherit) continue;

        const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
        if (!prefixMap) {
          // Without a prefix map the base cannot be resolved; report it
          // instead of dropping the declaration silently.
          stats.inheritanceUnresolved++;
          continue;
        }
        const resolved = resolveQNameAttr(inherit.baseQName, prefixMap, decl.namespace);
        if (!resolved.resolved) {
          stats.inheritanceUnresolved++;
          continue;
        }
        const baseQ = resolved.qname;
        if (!baseQ.vocabularyId) {
          stats.inheritanceUnresolved++;
          continue;
        }

        // Look up existing symbol; for xsd-builtin, ensure on demand.
        let baseId: number | null = null;
        for (const k of baseCandidateKinds) {
          const id = symbolIds.get(symbolKey(baseQ.vocabularyId, baseQ.localName, k));
          if (id != null) {
            baseId = id;
            break;
          }
        }
        if (baseId == null && baseQ.vocabularyId === "xsd-builtin") {
          const { id, inserted } = await upsertSymbol(
            sql,
            "xsd-builtin",
            baseQ.localName,
            "simpleType",
          );
          symbolIds.set(symbolKey("xsd-builtin", baseQ.localName, "simpleType"), id);
          baseId = id;
          if (inserted) stats.symbolsInserted++;
          else stats.symbolsExisting++;
        }
        if (baseId == null) {
          stats.inheritanceUnresolved++;
          continue;
        }

        const childId = symbolIds.get(symbolKey(decl.vocabularyId, decl.localName, decl.kind));
        if (childId == null) continue;

        const inserted = await insertInheritance(sql, childId, baseId, profileId, inherit.relation);
        if (inserted) stats.inheritanceEdgesInserted++;
      }
    }
  });

  return stats;
}
189+
190+
// --- DB helpers ----------------------------------------------------------
191+
192+
/**
 * Return the id of the xsd_profiles row named `name`, inserting it if needed.
 *
 * The conflict branch rewrites `name` with the same value so that RETURNING
 * produces a row whether or not the profile already existed.
 */
async function ensureProfile(sql: Sql, name: string): Promise<number> {
  const [profile] = await sql`
    INSERT INTO xsd_profiles (name) VALUES (${name})
    ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
    RETURNING id
  `;
  const { id } = profile;
  return id;
}
200+
201+
/**
 * Fetch the id of the `reference_sources` row with the given name.
 *
 * @throws Error when no such row exists; the message points at `db:sync-sources`.
 */
async function lookupSourceId(sql: Sql, name: string): Promise<number> {
  const [source] = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`;
  if (source) {
    return source.id;
  }
  throw new Error(`reference_sources row not found for name='${name}'. Run db:sync-sources first.`);
}
206+
207+
/**
 * Return the id of the xsd_namespaces row for `uri`, inserting it if needed.
 *
 * The conflict branch rewrites `uri` with the same value so that RETURNING
 * produces a row whether or not the namespace already existed.
 */
async function ensureNamespace(sql: Sql, uri: string): Promise<number> {
  const [namespace] = await sql`
    INSERT INTO xsd_namespaces (uri) VALUES (${uri})
    ON CONFLICT (uri) DO UPDATE SET uri = EXCLUDED.uri
    RETURNING id
  `;
  const { id } = namespace;
  return id;
}
215+
216+
/**
 * Insert a symbol by its natural key (vocabulary_id, local_name, kind), or
 * fetch the existing row.
 *
 * @returns The row id plus `inserted`, taken from the `(xmax = 0)` column of
 *   the RETURNING clause: true when the statement created the row, false when
 *   it went through the conflict branch.
 */
async function upsertSymbol(
  sql: Sql,
  vocabularyId: string,
  localName: string,
  kind: string,
): Promise<{ id: number; inserted: boolean }> {
  const [symbol] = await sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind)
    VALUES (${vocabularyId}, ${localName}, ${kind})
    ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE SET kind = EXCLUDED.kind
    RETURNING id, (xmax = 0) AS inserted
  `;
  const { id, inserted } = symbol;
  return { id, inserted };
}
230+
231+
/**
 * Record that a symbol belongs to a profile, together with its namespace and
 * source ids.
 *
 * @returns true when a new xsd_symbol_profiles row was written, false when the
 *   (symbol_id, profile_id) pair was already present.
 */
async function linkSymbolToProfile(
  sql: Sql,
  symbolId: number,
  profileId: number,
  namespaceId: number,
  sourceId: number,
): Promise<boolean> {
  const created = await sql`
    INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id, source_id)
    VALUES (${symbolId}, ${profileId}, ${namespaceId}, ${sourceId})
    ON CONFLICT (symbol_id, profile_id) DO NOTHING
    RETURNING id
  `;
  // DO NOTHING returns no row for an existing pair, so a non-empty result means "new".
  return created.length > 0;
}
246+
247+
/**
 * Record an extension/restriction edge from a symbol to its base within a profile.
 *
 * @returns true when a new xsd_inheritance_edges row was written, false when
 *   the (symbol_id, profile_id) pair already had an edge.
 */
async function insertInheritance(
  sql: Sql,
  symbolId: number,
  baseSymbolId: number,
  profileId: number,
  relation: "extension" | "restriction",
): Promise<boolean> {
  const created = await sql`
    INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation)
    VALUES (${symbolId}, ${baseSymbolId}, ${profileId}, ${relation})
    ON CONFLICT (symbol_id, profile_id) DO NOTHING
    RETURNING id
  `;
  // DO NOTHING returns no row on conflict, so a non-empty result means "new".
  return created.length > 0;
}
262+
263+
// --- Inheritance discovery from AST -------------------------------------
264+
265+
/** What `findInheritance` extracts from a declaration's AST. */
interface InheritanceFinding {
  /** Raw value of the `base` attribute, not yet resolved against a prefix map. */
  baseQName: string;
  /** Which derivation element carried the `base` attribute. */
  relation: "extension" | "restriction";
}
269+
270+
/**
 * Locate the derivation (`extension` or `restriction`) of a type declaration
 * and return its raw `base` attribute.
 *
 * - complexType: looks inside `complexContent` / `simpleContent` children for
 *   the first `extension` or `restriction` that has a non-empty `base`.
 * - simpleType: looks for the first direct `restriction` child that has a
 *   non-empty `base`.
 * - any other kind: no inheritance.
 *
 * Tag names are compared after dropping their namespace prefix.
 *
 * @returns The finding, or null when the declaration derives from nothing.
 */
function findInheritance(decl: Declaration): InheritanceFinding | null {
  switch (decl.kind) {
    case "complexType": {
      for (const wrapper of nodeChildrenLocal(decl.node)) {
        const wrapperTag = stripPrefixLocal(nodeTagLocal(wrapper));
        if (wrapperTag === "complexContent" || wrapperTag === "simpleContent") {
          for (const derivation of nodeChildrenLocal(wrapper)) {
            const relation = stripPrefixLocal(nodeTagLocal(derivation));
            if (relation === "extension" || relation === "restriction") {
              const baseQName = nodeAttrs(derivation).base;
              if (baseQName) return { baseQName, relation };
            }
          }
        }
      }
      return null;
    }
    case "simpleType": {
      for (const candidate of nodeChildrenLocal(decl.node)) {
        if (stripPrefixLocal(nodeTagLocal(candidate)) === "restriction") {
          const baseQName = nodeAttrs(candidate).base;
          if (baseQName) return { baseQName, relation: "restriction" };
        }
      }
      return null;
    }
    default:
      return null;
  }
}
294+
295+
/**
 * Return the tag name of a preserve-order node: its first key other than
 * ":@" (presumably the attribute bag; confirm against the parser config),
 * or null when there is no such key.
 */
function nodeTagLocal(node: PreserveOrderNode): string | null {
  const tagKey = Object.keys(node).find((key) => key !== ":@");
  return tagKey === undefined ? null : tagKey;
}
299+
/**
 * Return the child nodes stored under a node's tag key, or an empty array
 * when the node has no tag or the value under it is not an array.
 */
function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] {
  const tagName = nodeTagLocal(node);
  const payload = tagName ? node[tagName] : undefined;
  if (!Array.isArray(payload)) return [];
  return payload as PreserveOrderNode[];
}
305+
/**
 * Drop everything up to and including the first ":" of a tag name.
 * Null or empty input yields null; a tag without ":" is returned unchanged.
 */
function stripPrefixLocal(tag: string | null): string | null {
  if (!tag) return null;
  const separatorAt = tag.indexOf(":");
  if (separatorAt === -1) {
    return tag;
  }
  return tag.substring(separatorAt + 1);
}
310+
311+
/** Build the in-memory lookup key for a symbol: `vocab|local|kind`. */
function symbolKey(vocab: string, local: string, kind: string): string {
  return [vocab, local, kind].join("|");
}
314+
315+
// --- CLI -----------------------------------------------------------------
316+
317+
/** Options collected from the command line for a CLI run. */
interface CliArgs {
  /** Directory containing the schema files (`--schema-dir`). */
  schemaDir: string;
  /** Entry-point schema files (`--entrypoint`, repeatable). */
  entrypoints: string[];
  /** Profile name (`--profile`). */
  profileName: string;
  /** Reference source name (`--source`). */
  sourceName: string;
}

/**
 * Read CLI options from `process.argv`.
 *
 * Recognised flags: `--schema-dir <dir>`, `--entrypoint <file>` (repeatable),
 * `--profile <name>`, `--source <name>`. Anything else is ignored. A flag that
 * appears last with no value leaves its default in place; for `--entrypoint`
 * that means nothing is added rather than an empty-string entrypoint.
 *
 * @returns The parsed options, with "wml.xsd" as the sole entrypoint when none
 *   was supplied.
 */
function parseCliArgs(): CliArgs {
  const argv = process.argv.slice(2);
  let schemaDir = "./data/xsd-cache/ecma-376-transitional";
  const entrypoints: string[] = [];
  let profileName = "transitional";
  let sourceName = "ecma-376-transitional";
  for (let i = 0; i < argv.length; i++) {
    const flag = argv[i];
    if (flag === "--schema-dir") {
      schemaDir = argv[++i] ?? schemaDir;
    } else if (flag === "--entrypoint") {
      const value = argv[++i];
      // A missing or empty value can never name a schema file; skip it so the
      // default entrypoint below still applies.
      if (value) entrypoints.push(value);
    } else if (flag === "--profile") {
      profileName = argv[++i] ?? profileName;
    } else if (flag === "--source") {
      sourceName = argv[++i] ?? sourceName;
    }
  }
  if (entrypoints.length === 0) entrypoints.push("wml.xsd");
  return { schemaDir, entrypoints, profileName, sourceName };
}
340+
341+
/**
 * CLI driver: parse arguments, connect using DATABASE_URL, run the ingest and
 * print a summary. Exits with status 1 when DATABASE_URL is unset. The
 * database client is closed whether or not the ingest succeeds.
 */
async function main() {
  const args = parseCliArgs();

  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    console.error("Missing DATABASE_URL");
    process.exit(1);
  }

  const db = createDbClient(databaseUrl);
  const startedAt = Date.now();
  try {
    const stats = await ingestSchemaSet({ ...args, db });
    const elapsedMs = Date.now() - startedAt;

    // One summary line per entry, printed in this order.
    const report = [
      `schemaDir: ${args.schemaDir}`,
      `entrypoints: ${args.entrypoints.join(", ")}`,
      `profile: ${args.profileName}`,
      `source: ${args.sourceName}`,
      `documents: ${stats.documents}`,
      `symbols inserted: ${stats.symbolsInserted}`,
      `symbols existing: ${stats.symbolsExisting}`,
      `namespaces ensured: ${stats.namespacesEnsured}`,
      `profile memberships: ${stats.profileMembershipsInserted}`,
      `inheritance edges: ${stats.inheritanceEdgesInserted}`,
      `inheritance unres.: ${stats.inheritanceUnresolved}`,
      `elapsed: ${elapsedMs}ms`,
    ];
    for (const line of report) {
      console.log(line);
    }
  } finally {
    await db.close();
  }
}
370+
371+
// Run the CLI only when this module's path is the one Bun reports as its main
// script; any failure is logged and turned into exit status 1.
if (Bun.main === import.meta.path) {
  main().catch((failure) => {
    console.error("ingest failed:", failure);
    process.exit(1);
  });
}

0 commit comments

Comments
 (0)