Skip to content

Commit 6cb04ac

Browse files
committed
feat(xsd): content model ingest (Phase 3d)
Pass 3 of ingestSchemaSet walks every complexType and group declaration and writes xsd_compositors, xsd_child_edges, and xsd_group_edges.

Compositor handling:
- sequence/choice/all under a complexType (or under complexContent/extension|restriction) become top-level compositors with parent_symbol_id set.
- Nested compositors (sequence inside choice etc.) recurse with parent_compositor_id set; the XOR check guarantees exactly one parent dimension is populated.
- simpleContent contributes attributes only and is skipped here.

Element handling inside compositors:
- ref="..." resolves through the document's prefix map to a top-level symbol; child_edge points at it.
- name="..." (local) creates / reuses a symbol under the owner vocabulary (vocab, name, kind=element). Cross-CT name reuse collapses; that is a known imprecision until we need to disambiguate.

Group refs become xsd_group_edges with resolved=false; future passes can expand them. attributeGroup refs are still Phase 3e (attributes).

WML closure ingest stats:
- 2737 symbols (1345 declarations + 14 builtins + 1378 local elements)
- 585 compositors
- 2098 child edges (0 unresolved)
- 161 group refs (0 unresolved)
- 389 inheritance edges (0 unresolved)
- elapsed ~2s

Fixture main.xsd gains CT_Body to exercise nested compositors, ref-vs-name elements, and group refs in one test path.
1 parent a1d7cba commit 6cb04ac

4 files changed

Lines changed: 433 additions & 12 deletions

File tree

scripts/ingest-xsd/ingest.ts

Lines changed: 297 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,7 @@ import { createDbClient, type DbClient } from "../../packages/shared/src/db/inde
3333
import { nodeAttrs } from "./ast.ts";
3434
import { parseSchemaSet } from "./parse-schema.ts";
3535
import { resolveQNameAttr } from "./qname.ts";
36-
import type {
37-
Declaration,
38-
ParsedSchemaSet,
39-
PreserveOrderNode,
40-
} from "./types.ts";
36+
import type { Declaration, ParsedSchemaSet, PreserveOrderNode } from "./types.ts";
4137
import { vocabularyForNamespace } from "./vocabulary.ts";
4238

4339
// biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic.
@@ -62,6 +58,12 @@ export interface IngestStats {
6258
profileMembershipsInserted: number;
6359
inheritanceEdgesInserted: number;
6460
inheritanceUnresolved: number;
61+
compositorsInserted: number;
62+
childEdgesInserted: number;
63+
childEdgesUnresolved: number;
64+
groupRefsInserted: number;
65+
groupRefsUnresolved: number;
66+
localElementsCreated: number;
6567
}
6668

6769
export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<IngestStats> {
@@ -78,6 +80,12 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
7880
profileMembershipsInserted: 0,
7981
inheritanceEdgesInserted: 0,
8082
inheritanceUnresolved: 0,
83+
compositorsInserted: 0,
84+
childEdgesInserted: 0,
85+
childEdgesUnresolved: 0,
86+
groupRefsInserted: 0,
87+
groupRefsUnresolved: 0,
88+
localElementsCreated: 0,
8189
};
8290

8391
await opts.db.sql.begin(async (sql: Sql) => {
@@ -182,11 +190,231 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
182190
if (inserted) stats.inheritanceEdgesInserted++;
183191
}
184192
}
193+
194+
// Pass 3: content models. Walk every complexType and group declaration,
195+
// emit xsd_compositors / xsd_child_edges / xsd_group_edges. Local element
196+
// declarations are deduped under (owner-vocab, name, element); cross-CT
197+
// reuse of a local name collapses to one symbol.
198+
for (const decls of parseResult.declarationsByQName.values()) {
199+
for (const decl of decls) {
200+
if (decl.kind !== "complexType" && decl.kind !== "group") continue;
201+
202+
const ownerSymbolId = symbolIds.get(
203+
symbolKey(decl.vocabularyId, decl.localName, decl.kind),
204+
);
205+
if (ownerSymbolId == null) continue;
206+
const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
207+
if (!prefixMap) continue;
208+
209+
const ctx: WalkCtx = {
210+
sql,
211+
profileId,
212+
ownerSymbolId,
213+
ownerDecl: decl,
214+
prefixMap,
215+
symbolIds,
216+
stats,
217+
};
218+
219+
const particleParents = findContentModelParents(decl);
220+
let topOrder = 0;
221+
for (const parent of particleParents) {
222+
for (const child of nodeChildrenLocal(parent)) {
223+
const tag = stripPrefixLocal(nodeTagLocal(child));
224+
if (tag === "sequence" || tag === "choice" || tag === "all") {
225+
await walkCompositor(child, tag, null, topOrder, ctx);
226+
topOrder++;
227+
} else if (tag === "group") {
228+
await handleGroupRef(child, topOrder, ctx);
229+
topOrder++;
230+
}
231+
}
232+
}
233+
}
234+
}
185235
});
186236

187237
return stats;
188238
}
189239

240+
/**
 * Per-declaration state threaded through the Pass 3 content-model walk
 * (walkCompositor / handleElement / handleGroupRef).
 */
interface WalkCtx {
  /** SQL handle of the surrounding transaction; every insert in the walk goes through it. */
  sql: Sql;
  /** Profile id written onto each compositor, child-edge, and group-edge row. */
  profileId: number;
  /** Symbol id of the complexType or group declaration currently being walked. */
  ownerSymbolId: number;
  /** The declaration being walked; supplies the namespace and vocabulary used for lookups. */
  ownerDecl: Declaration;
  /** Prefix map of the document that contains the declaration, used to resolve QName refs. */
  prefixMap: Map<string, string>;
  /** Symbol-key to symbol-id cache; local element symbols are added to it as they are created. */
  symbolIds: Map<string, number>;
  /** Running ingest counters, mutated in place by the walk. */
  stats: IngestStats;
}
249+
250+
/**
 * Locate the node(s) whose direct children are the particles
 * (sequence/choice/all/group) of a declaration's content model.
 *
 * - group definition: the group node itself.
 * - complexType without a complexContent wrapper: the complexType node itself.
 * - complexType with complexContent: every xsd:extension / xsd:restriction
 *   found directly beneath a complexContent child (possibly none).
 * - any other declaration kind: nothing.
 *
 * simpleContent carries no element particles, so it is never descended into.
 *
 * @param decl Declaration whose content model is about to be walked.
 * @returns The particle-bearing parent nodes, in document order.
 */
function findContentModelParents(decl: Declaration): PreserveOrderNode[] {
  switch (decl.kind) {
    case "group":
      return [decl.node];
    case "complexType":
      break;
    default:
      return [];
  }

  const derivations: PreserveOrderNode[] = [];
  let hasComplexContent = false;

  for (const wrapper of nodeChildrenLocal(decl.node)) {
    if (stripPrefixLocal(nodeTagLocal(wrapper)) !== "complexContent") continue;
    hasComplexContent = true;

    for (const candidate of nodeChildrenLocal(wrapper)) {
      const candidateTag = stripPrefixLocal(nodeTagLocal(candidate));
      if (candidateTag === "extension" || candidateTag === "restriction") {
        derivations.push(candidate);
      }
    }
  }

  // Without a complexContent wrapper the particles sit directly under the complexType.
  return hasComplexContent ? derivations : [decl.node];
}
280+
281+
/**
 * Insert one compositor row and recurse through its particles.
 *
 * A compositor with no parent compositor is attached to the owner symbol;
 * a nested one is attached to its parent compositor instead, so exactly one
 * of the two parent columns is populated on each row.
 *
 * @param node The sequence/choice/all AST node.
 * @param kind Which compositor the node is.
 * @param parentCompositorId Id of the enclosing compositor, or null at top level.
 * @param orderIndex Position of this compositor among its siblings.
 * @param ctx Walk state for the declaration being processed.
 */
async function walkCompositor(
  node: PreserveOrderNode,
  kind: "sequence" | "choice" | "all",
  parentCompositorId: number | null,
  orderIndex: number,
  ctx: WalkCtx,
): Promise<void> {
  const attrs = nodeAttrs(node);
  const isTopLevel = parentCompositorId === null;

  const compositorId = await insertCompositor(
    ctx.sql,
    isTopLevel ? ctx.ownerSymbolId : null,
    parentCompositorId,
    ctx.profileId,
    kind,
    parseMinOccurs(attrs.minOccurs),
    parseMaxOccurs(attrs.maxOccurs),
    orderIndex,
  );
  ctx.stats.compositorsInserted += 1;

  // Only recognised particles consume an order slot; anything else
  // (e.g. xsd:any, skipped for now) leaves the counter untouched.
  let nextOrder = 0;
  for (const particle of nodeChildrenLocal(node)) {
    const particleTag = stripPrefixLocal(nodeTagLocal(particle));
    switch (particleTag) {
      case "element":
        await handleElement(particle, compositorId, nextOrder, ctx);
        nextOrder += 1;
        break;
      case "sequence":
      case "choice":
      case "all":
        await walkCompositor(particle, particleTag, compositorId, nextOrder, ctx);
        nextOrder += 1;
        break;
      case "group":
        await handleGroupRef(particle, nextOrder, ctx, compositorId);
        nextOrder += 1;
        break;
      default:
        break;
    }
  }
}
317+
318+
/**
 * Record one xsd:element particle as a child edge of the owner symbol.
 *
 * - `ref="..."`: the QName is resolved through the document prefix map and
 *   must match an already-known element symbol; otherwise the edge is counted
 *   as unresolved and nothing is inserted.
 * - `name="..."`: a local element symbol is looked up (or upserted) under the
 *   owner declaration's vocabulary and cached for later reuse.
 * - neither attribute: the particle is ignored.
 *
 * @param node The xsd:element AST node.
 * @param compositorId Compositor the element sits in.
 * @param orderIndex Position of the element inside that compositor.
 * @param ctx Walk state for the declaration being processed.
 */
async function handleElement(
  node: PreserveOrderNode,
  compositorId: number,
  orderIndex: number,
  ctx: WalkCtx,
): Promise<void> {
  const attrs = nodeAttrs(node);
  let targetSymbolId: number | null = null;

  if (attrs.ref) {
    const target = resolveQNameAttr(attrs.ref, ctx.prefixMap, ctx.ownerDecl.namespace);
    const known =
      target.resolved && target.qname.vocabularyId
        ? ctx.symbolIds.get(
            symbolKey(target.qname.vocabularyId, target.qname.localName, "element"),
          )
        : undefined;
    if (known == null) {
      // Either the QName did not resolve or no such element symbol exists.
      ctx.stats.childEdgesUnresolved++;
      return;
    }
    targetSymbolId = known;
  } else if (attrs.name) {
    const cacheKey = symbolKey(ctx.ownerDecl.vocabularyId, attrs.name, "element");
    const cached = ctx.symbolIds.get(cacheKey);
    if (cached != null) {
      targetSymbolId = cached;
    } else {
      const upserted = await upsertSymbol(
        ctx.sql,
        ctx.ownerDecl.vocabularyId,
        attrs.name,
        "element",
      );
      ctx.symbolIds.set(cacheKey, upserted.id);
      if (upserted.inserted) {
        ctx.stats.symbolsInserted++;
        ctx.stats.localElementsCreated++;
      } else {
        ctx.stats.symbolsExisting++;
      }
      targetSymbolId = upserted.id;
    }
  }

  // No usable ref/name: nothing to link.
  if (targetSymbolId == null) return;

  await insertChildEdge(
    ctx.sql,
    ctx.ownerSymbolId,
    compositorId,
    targetSymbolId,
    ctx.profileId,
    parseMinOccurs(attrs.minOccurs),
    parseMaxOccurs(attrs.maxOccurs),
    orderIndex,
  );
  ctx.stats.childEdgesInserted++;
}
372+
373+
/**
 * Record an `<xsd:group ref="..."/>` particle as a group edge on the owner symbol.
 *
 * A node without `ref` is ignored. A ref that does not resolve to a known
 * group symbol is counted in `groupRefsUnresolved` and no row is written.
 *
 * @param node The xsd:group AST node.
 * @param orderIndex Position of the reference among its siblings.
 * @param ctx Walk state for the declaration being processed.
 * @param _compositorId Enclosing compositor, if any. Currently unused.
 */
async function handleGroupRef(
  node: PreserveOrderNode,
  orderIndex: number,
  ctx: WalkCtx,
  _compositorId: number | null = null,
): Promise<void> {
  // group_edges aren't compositor-scoped in our schema; refs hang off the parent symbol.
  void _compositorId;

  const { ref } = nodeAttrs(node);
  if (!ref) return;

  const target = resolveQNameAttr(ref, ctx.prefixMap, ctx.ownerDecl.namespace);
  const targetGroupId =
    target.resolved && target.qname.vocabularyId
      ? ctx.symbolIds.get(
          symbolKey(target.qname.vocabularyId, target.qname.localName, "group"),
        )
      : undefined;

  if (targetGroupId == null) {
    // Either the QName did not resolve or no such group symbol exists.
    ctx.stats.groupRefsUnresolved++;
    return;
  }

  await insertGroupEdge(
    ctx.sql,
    ctx.ownerSymbolId,
    targetGroupId,
    ctx.profileId,
    "group",
    orderIndex,
  );
  ctx.stats.groupRefsInserted++;
}
404+
405+
/**
 * Parse the text of an XSD `minOccurs` attribute.
 *
 * `minOccurs` is an xsd:nonNegativeInteger, so only an optional `+` followed
 * by decimal digits is accepted (surrounding whitespace is ignored). A bare
 * `parseInt` would also let through values such as "-1" (-1), "3x" (3) or
 * "2.5" (2), which are not valid occurrence counts.
 *
 * @param raw Attribute text, or `undefined` when the attribute is absent.
 * @returns The occurrence count, or 1 (the XSD default) when the attribute is
 *   absent or is not a valid non-negative integer.
 */
function parseMinOccurs(raw: string | undefined): number {
  if (raw === undefined) return 1;
  const text = raw.trim();
  if (!/^\+?\d+$/.test(text)) return 1;
  const n = Number.parseInt(text, 10);
  // Guards against absurdly long digit strings overflowing to Infinity.
  return Number.isFinite(n) ? n : 1;
}
410+
411+
/**
 * Parse the text of an XSD `maxOccurs` attribute.
 *
 * The value is either the keyword `unbounded` or an xsd:nonNegativeInteger.
 * Surrounding whitespace is ignored so that a padded `unbounded` is not
 * mistaken for an invalid number and silently turned into 1. Numeric values
 * must be an optional `+` followed by decimal digits; a bare `parseInt`
 * would also accept "-2" (-2) or "5 items" (5).
 *
 * @param raw Attribute text, or `undefined` when the attribute is absent.
 * @returns `null` for `unbounded`; otherwise the occurrence count, or 1 (the
 *   XSD default) when the attribute is absent or not a valid value.
 */
function parseMaxOccurs(raw: string | undefined): number | null {
  if (raw === undefined) return 1;
  const text = raw.trim();
  if (text === "unbounded") return null;
  if (!/^\+?\d+$/.test(text)) return 1;
  const n = Number.parseInt(text, 10);
  // Guards against absurdly long digit strings overflowing to Infinity.
  return Number.isFinite(n) ? n : 1;
}
417+
190418
// --- DB helpers ----------------------------------------------------------
191419

192420
async function ensureProfile(sql: Sql, name: string): Promise<number> {
@@ -200,7 +428,10 @@ async function ensureProfile(sql: Sql, name: string): Promise<number> {
200428

201429
/**
 * Fetch the id of the `reference_sources` row with the given name.
 *
 * @param sql SQL handle to run the lookup on.
 * @param name Value matched against `reference_sources.name`.
 * @returns The `id` column of the matching row.
 * @throws Error when no row with that name exists.
 */
async function lookupSourceId(sql: Sql, name: string): Promise<number> {
  const matches = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`;
  const source = matches[0];
  if (source) return source.id;
  throw new Error(
    `reference_sources row not found for name='${name}'. Run db:sync-sources first.`,
  );
}
206437

@@ -260,6 +491,60 @@ async function insertInheritance(
260491
return rows.length > 0;
261492
}
262493

494+
/**
 * Insert one row into `xsd_compositors` and return its generated id.
 *
 * The only caller visible here (walkCompositor) passes exactly one non-null
 * parent: `parentSymbolId` for a top-level compositor, `parentCompositorId`
 * for a nested one.
 *
 * @param sql SQL handle to run the insert on.
 * @param parentSymbolId Owning symbol for a top-level compositor, else null.
 * @param parentCompositorId Enclosing compositor for a nested one, else null.
 * @param profileId Profile the row belongs to.
 * @param kind Compositor kind.
 * @param minOccurs Minimum occurrence count.
 * @param maxOccurs Maximum occurrence count; null is stored for unbounded.
 * @param orderIndex Position among sibling particles.
 * @returns The `id` of the inserted row.
 */
async function insertCompositor(
  sql: Sql,
  parentSymbolId: number | null,
  parentCompositorId: number | null,
  profileId: number,
  kind: "sequence" | "choice" | "all",
  minOccurs: number,
  maxOccurs: number | null,
  orderIndex: number,
): Promise<number> {
  const inserted = await sql`
    INSERT INTO xsd_compositors
      (parent_symbol_id, parent_compositor_id, profile_id, kind, min_occurs, max_occurs, order_index)
    VALUES
      (${parentSymbolId}, ${parentCompositorId}, ${profileId}, ${kind}, ${minOccurs}, ${maxOccurs}, ${orderIndex})
    RETURNING id
  `;
  return inserted[0].id;
}
513+
514+
/**
 * Insert one row into `xsd_child_edges`, linking a parent symbol to a child
 * element symbol inside a given compositor.
 *
 * @param sql SQL handle to run the insert on.
 * @param parentSymbolId Symbol that owns the content model.
 * @param compositorId Compositor the child element sits in.
 * @param childSymbolId Element symbol being referenced.
 * @param profileId Profile the row belongs to.
 * @param minOccurs Minimum occurrence count.
 * @param maxOccurs Maximum occurrence count; null is stored for unbounded.
 * @param orderIndex Position of the element inside the compositor.
 */
async function insertChildEdge(
  sql: Sql,
  parentSymbolId: number,
  compositorId: number,
  childSymbolId: number,
  profileId: number,
  minOccurs: number,
  maxOccurs: number | null,
  orderIndex: number,
): Promise<void> {
  const insertion = sql`
    INSERT INTO xsd_child_edges
      (parent_symbol_id, compositor_id, child_symbol_id, profile_id, min_occurs, max_occurs, order_index)
    VALUES
      (${parentSymbolId}, ${compositorId}, ${childSymbolId}, ${profileId}, ${minOccurs}, ${maxOccurs}, ${orderIndex})
  `;
  await insertion;
}
531+
532+
/**
 * Insert one row into `xsd_group_edges`, recording that a symbol references
 * a group (or attributeGroup) symbol.
 *
 * @param sql SQL handle to run the insert on.
 * @param parentSymbolId Symbol whose content model contains the reference.
 * @param groupSymbolId Symbol of the referenced group.
 * @param profileId Profile the row belongs to.
 * @param refKind Whether the reference is to a `group` or an `attributeGroup`.
 * @param orderIndex Position of the reference among its siblings.
 */
async function insertGroupEdge(
  sql: Sql,
  parentSymbolId: number,
  groupSymbolId: number,
  profileId: number,
  refKind: "group" | "attributeGroup",
  orderIndex: number,
): Promise<void> {
  const insertion = sql`
    INSERT INTO xsd_group_edges
      (parent_symbol_id, group_symbol_id, profile_id, ref_kind, order_index)
    VALUES
      (${parentSymbolId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${orderIndex})
  `;
  await insertion;
}
547+
263548
// --- Inheritance discovery from AST -------------------------------------
264549

265550
interface InheritanceFinding {
@@ -362,6 +647,12 @@ async function main() {
362647
console.log(`profile memberships: ${stats.profileMembershipsInserted}`);
363648
console.log(`inheritance edges: ${stats.inheritanceEdgesInserted}`);
364649
console.log(`inheritance unres.: ${stats.inheritanceUnresolved}`);
650+
console.log(`compositors: ${stats.compositorsInserted}`);
651+
console.log(`child edges: ${stats.childEdgesInserted}`);
652+
console.log(`child edges unres.: ${stats.childEdgesUnresolved}`);
653+
console.log(`group refs: ${stats.groupRefsInserted}`);
654+
console.log(`group refs unres.: ${stats.groupRefsUnresolved}`);
655+
console.log(`local elements: ${stats.localElementsCreated}`);
365656
console.log(`elapsed: ${ms}ms`);
366657
} finally {
367658
await db.close();

tests/ingest-xsd/fixtures/main.xsd

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,15 @@
2424
<xsd:restriction base="CT_Para"/>
2525
</xsd:complexContent>
2626
</xsd:complexType>
27+
<xsd:complexType name="CT_Body">
28+
<xsd:sequence>
29+
<xsd:element ref="document"/>
30+
<xsd:choice minOccurs="0" maxOccurs="unbounded">
31+
<xsd:group ref="EG_PContent"/>
32+
<xsd:element name="break" type="xsd:string"/>
33+
</xsd:choice>
34+
</xsd:sequence>
35+
</xsd:complexType>
2736
<xsd:simpleType name="ST_Jc">
2837
<xsd:restriction base="xsd:string">
2938
<xsd:enumeration value="left"/>

0 commit comments

Comments
 (0)