Skip to content

Commit c742f43

Browse files
committed
fix(xsd): preserve element/attr type and group-ref compositor metadata
Three correctness gaps surfaced before Phase 4:

P1 - Local elements lost type and profile membership. WML uses <xsd:element name="p" type="CT_P"/> inside groups; before this change the local element symbol carried no @type and was never linked to xsd_symbol_profiles, so ooxml_lookup_element/ooxml_children could neither find it in the transitional profile nor follow it to CT_P.

P2 - Group refs in nested compositors lost context. <xsd:group ref> inside a nested sequence/choice was inserted with parent_symbol_id and order_index only. The compositor it lives inside and the ref's own minOccurs/maxOccurs were dropped, so later expansion could not preserve ordering or cardinality relative to siblings.

P2 - Referenced attributes lost type/default/fixed. <xsd:attribute ref="r:id"/> set attr_symbol_id only; the type, default and fixed values declared on the top-level <xsd:attribute name="id" type="ST_RelationshipId"/> were not recovered into the edge.

Migration 0003_phase3_metadata adds:
- xsd_symbols.type_ref TEXT (Clark-style {namespace}localName for elements and attributes that declare @type; NULL for the rest).
- xsd_group_edges.compositor_id INT (FK with ON DELETE CASCADE), plus min_occurs / max_occurs.

ingest.ts:
- upsertSymbol now accepts typeRef; ON CONFLICT keeps the existing value via COALESCE whenever the incoming typeRef is NULL, so a re-run never blanks it out.
- Pass 1 captures @type for top-level element/attribute decls.
- Pass 3 captures @type and links local elements to xsd_symbol_profiles.
- Pass 3 group refs thread compositor_id and parse min/max occurs.
- Pass 4 attribute refs take type_ref from the top-level declaration (XSD forbids @type on a ref) and fall back to the declaration's default / fixed when the ref site does not set its own; attr_use stays from the ref site.

Real WML ingest after fix:
- profile memberships: 1345 -> 2723 (1345 top-level + 1378 local elements now visible to ooxml_lookup_element).
- 148 / 161 group refs carry compositor_id (rest are top-level).
- Sample r:id attribute refs now expose type_ref={...relationships}ST_RelationshipId.
Fixtures gain a top-level <xsd:attribute name="space" type="xsd:string" default="preserve"/> in shared.xsd and a CT_RefTest in main.xsd that refs it; the new test checks all three fixes.
1 parent 33072f5 commit c742f43

7 files changed

Lines changed: 212 additions & 24 deletions

File tree

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Phase 3 review fix: preserve element/attribute @type and group-ref compositor context.
2+
-- Idempotent.
3+
4+
ALTER TABLE xsd_symbols
5+
ADD COLUMN IF NOT EXISTS type_ref TEXT;
6+
7+
ALTER TABLE xsd_group_edges
8+
ADD COLUMN IF NOT EXISTS compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE,
9+
ADD COLUMN IF NOT EXISTS min_occurs INT DEFAULT 1,
10+
ADD COLUMN IF NOT EXISTS max_occurs INT;

db/schema.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,15 @@ CREATE TABLE xsd_namespaces (
6464
created_at TIMESTAMPTZ DEFAULT NOW()
6565
);
6666

67+
-- type_ref holds the Clark-style {namespace}localName for elements and attributes
68+
-- that declare a @type. NULL for complexType/simpleType/group/attributeGroup.
69+
-- Phase 4 lookups follow type_ref to resolve element -> type when reading children.
6770
CREATE TABLE xsd_symbols (
6871
id SERIAL PRIMARY KEY,
6972
vocabulary_id TEXT NOT NULL,
7073
local_name TEXT NOT NULL,
7174
kind TEXT NOT NULL,
75+
type_ref TEXT,
7276
payload JSONB DEFAULT '{}'::jsonb,
7377
created_at TIMESTAMPTZ DEFAULT NOW(),
7478
UNIQUE (vocabulary_id, local_name, kind)
@@ -121,13 +125,20 @@ CREATE TABLE xsd_attr_edges (
121125
order_index INT DEFAULT 0
122126
);
123127

128+
-- compositor_id is the enclosing compositor when a <xsd:group ref> appears inside
129+
-- a sequence/choice/all (NULL for refs at the type's top level or for
130+
-- attributeGroup refs which don't live in a compositor).
131+
-- min/max_occurs capture the ref site's own cardinality.
124132
CREATE TABLE xsd_group_edges (
125133
id SERIAL PRIMARY KEY,
126134
parent_symbol_id INT NOT NULL REFERENCES xsd_symbols(id) ON DELETE CASCADE,
135+
compositor_id INT REFERENCES xsd_compositors(id) ON DELETE CASCADE,
127136
group_symbol_id INT NOT NULL REFERENCES xsd_symbols(id),
128137
profile_id INT NOT NULL REFERENCES xsd_profiles(id) ON DELETE CASCADE,
129138
ref_kind TEXT NOT NULL CHECK (ref_kind IN ('group', 'attributeGroup')),
130139
resolved BOOLEAN DEFAULT FALSE,
140+
min_occurs INT DEFAULT 1,
141+
max_occurs INT,
131142
order_index INT DEFAULT 0
132143
);
133144

scripts/ingest-xsd/ingest.ts

Lines changed: 111 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,16 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
118118
for (const decl of decls) {
119119
const key = symbolKey(decl.vocabularyId, decl.localName, decl.kind);
120120
if (symbolIds.has(key)) continue;
121+
122+
// Capture @type for elements and attributes; resolved Clark-style.
123+
const typeRef = resolveDeclTypeRef(decl, parseResult);
124+
121125
const { id, inserted } = await upsertSymbol(
122126
sql,
123127
decl.vocabularyId,
124128
decl.localName,
125129
decl.kind,
130+
typeRef,
126131
);
127132
symbolIds.set(key, id);
128133
if (inserted) stats.symbolsInserted++;
@@ -229,10 +234,13 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
229234
const ctx: WalkCtx = {
230235
sql,
231236
profileId,
237+
sourceId,
232238
ownerSymbolId,
233239
ownerDecl: decl,
234240
prefixMap,
235241
symbolIds,
242+
namespaceIds,
243+
parseResult,
236244
stats,
237245
};
238246

@@ -245,7 +253,7 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
245253
await walkCompositor(child, tag, null, topOrder, ctx);
246254
topOrder++;
247255
} else if (tag === "group") {
248-
await handleGroupRef(child, topOrder, ctx);
256+
await handleGroupRef(child, null, topOrder, ctx);
249257
topOrder++;
250258
}
251259
}
@@ -283,6 +291,7 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
283291
prefixMap,
284292
decl.namespace,
285293
symbolIds,
294+
parseResult,
286295
order,
287296
stats,
288297
);
@@ -296,18 +305,27 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
296305
continue;
297306
}
298307
const groupSymbolId = symbolIds.get(
299-
symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "attributeGroup"),
308+
symbolKey(
309+
resolved.qname.vocabularyId,
310+
resolved.qname.localName,
311+
"attributeGroup",
312+
),
300313
);
301314
if (groupSymbolId == null) {
302315
stats.attrGroupRefsUnresolved++;
303316
continue;
304317
}
318+
// attributeGroup refs don't live inside content compositors;
319+
// compositor_id stays null and min/max default to 1.
305320
await insertGroupEdge(
306321
sql,
307322
ownerSymbolId,
323+
null,
308324
groupSymbolId,
309325
profileId,
310326
"attributeGroup",
327+
1,
328+
1,
311329
order,
312330
);
313331
stats.attrGroupRefsInserted++;
@@ -333,13 +351,30 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
333351
interface WalkCtx {
334352
sql: Sql;
335353
profileId: number;
354+
sourceId: number;
336355
ownerSymbolId: number;
337356
ownerDecl: Declaration;
338357
prefixMap: Map<string, string>;
339358
symbolIds: Map<string, number>;
359+
namespaceIds: Map<string, number>;
360+
parseResult: ParsedSchemaSet;
340361
stats: IngestStats;
341362
}
342363

364+
/**
365+
* Resolve the @type QName of an element/attribute declaration to Clark-style {namespace}localName. Returns null for any other declaration kind or when @type is absent.
366+
* When the declaration's document has no prefix map, or the QName does not resolve, the raw @type string is returned unchanged.
367+
*/
368+
function resolveDeclTypeRef(decl: Declaration, parseResult: ParsedSchemaSet): string | null {
369+
if (decl.kind !== "element" && decl.kind !== "attribute") return null;
370+
const a = nodeAttrs(decl.node);
371+
if (!a.type) return null;
372+
const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
373+
if (!prefixMap) return a.type;
374+
const r = resolveQNameAttr(a.type, prefixMap, decl.namespace);
375+
return r.resolved ? `{${r.qname.namespace}}${r.qname.localName}` : a.type;
376+
}
377+
343378
/**
344379
* For a complexType: yield the node(s) whose direct children are particles
345380
* (sequence/choice/all/group). That's the complexType itself, OR (for derived
@@ -401,7 +436,7 @@ async function walkCompositor(
401436
await walkCompositor(child, tag, compositorId, childOrder, ctx);
402437
childOrder++;
403438
} else if (tag === "group") {
404-
await handleGroupRef(child, childOrder, ctx, compositorId);
439+
await handleGroupRef(child, compositorId, childOrder, ctx);
405440
childOrder++;
406441
}
407442
// xsd:any: skipped for now.
@@ -432,18 +467,49 @@ async function handleElement(
432467
}
433468
childSymbolId = id;
434469
} else if (a.name) {
470+
// Resolve @type so ooxml_lookup_element / ooxml_children can follow it.
471+
let typeRef: string | null = null;
472+
if (a.type) {
473+
const r = resolveQNameAttr(a.type, ctx.prefixMap, ctx.ownerDecl.namespace);
474+
typeRef = r.resolved ? `{${r.qname.namespace}}${r.qname.localName}` : a.type;
475+
}
435476
const key = symbolKey(ctx.ownerDecl.vocabularyId, a.name, "element");
436477
let id = ctx.symbolIds.get(key);
437478
if (id == null) {
438-
const res = await upsertSymbol(ctx.sql, ctx.ownerDecl.vocabularyId, a.name, "element");
479+
const res = await upsertSymbol(
480+
ctx.sql,
481+
ctx.ownerDecl.vocabularyId,
482+
a.name,
483+
"element",
484+
typeRef,
485+
);
439486
ctx.symbolIds.set(key, res.id);
440487
if (res.inserted) {
441488
ctx.stats.symbolsInserted++;
442489
ctx.stats.localElementsCreated++;
443490
} else {
444491
ctx.stats.symbolsExisting++;
445492
}
493+
// Local elements need profile membership too, otherwise
494+
// ooxml_lookup_element won't find them in the transitional profile.
495+
const nsId = ctx.namespaceIds.get(ctx.ownerDecl.namespace);
496+
if (nsId != null) {
497+
const linked = await linkSymbolToProfile(
498+
ctx.sql,
499+
res.id,
500+
ctx.profileId,
501+
nsId,
502+
ctx.sourceId,
503+
);
504+
if (linked) ctx.stats.profileMembershipsInserted++;
505+
}
446506
id = res.id;
507+
} else if (typeRef) {
508+
// Existing symbol; ensure type_ref is set if we have one.
509+
await ctx.sql`
510+
UPDATE xsd_symbols SET type_ref = ${typeRef}
511+
WHERE id = ${id} AND type_ref IS NULL
512+
`;
447513
}
448514
childSymbolId = id;
449515
}
@@ -465,11 +531,10 @@ async function handleElement(
465531

466532
async function handleGroupRef(
467533
node: PreserveOrderNode,
534+
compositorId: number | null,
468535
orderIndex: number,
469536
ctx: WalkCtx,
470-
_compositorId: number | null = null,
471537
): Promise<void> {
472-
void _compositorId; // group_edges aren't compositor-scoped in our schema; refs hang off the parent symbol.
473538
const a = nodeAttrs(node);
474539
if (!a.ref) return;
475540
const resolved = resolveQNameAttr(a.ref, ctx.prefixMap, ctx.ownerDecl.namespace);
@@ -487,9 +552,12 @@ async function handleGroupRef(
487552
await insertGroupEdge(
488553
ctx.sql,
489554
ctx.ownerSymbolId,
555+
compositorId,
490556
groupSymbolId,
491557
ctx.profileId,
492558
"group",
559+
parseMinOccurs(a.minOccurs),
560+
parseMaxOccurs(a.maxOccurs),
493561
orderIndex,
494562
);
495563
ctx.stats.groupRefsInserted++;
@@ -542,11 +610,13 @@ async function upsertSymbol(
542610
vocabularyId: string,
543611
localName: string,
544612
kind: string,
613+
typeRef: string | null = null,
545614
): Promise<{ id: number; inserted: boolean }> {
546615
const [row] = await sql`
547-
INSERT INTO xsd_symbols (vocabulary_id, local_name, kind)
548-
VALUES (${vocabularyId}, ${localName}, ${kind})
549-
ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE SET kind = EXCLUDED.kind
616+
INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref)
617+
VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef})
618+
ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE
619+
SET type_ref = COALESCE(EXCLUDED.type_ref, xsd_symbols.type_ref)
550620
RETURNING id, (xmax = 0) AS inserted
551621
`;
552622
return { id: row.id, inserted: row.inserted };
@@ -625,16 +695,19 @@ async function insertChildEdge(
625695
async function insertGroupEdge(
626696
sql: Sql,
627697
parentSymbolId: number,
698+
compositorId: number | null,
628699
groupSymbolId: number,
629700
profileId: number,
630701
refKind: "group" | "attributeGroup",
702+
minOccurs: number,
703+
maxOccurs: number | null,
631704
orderIndex: number,
632705
): Promise<void> {
633706
await sql`
634707
INSERT INTO xsd_group_edges
635-
(parent_symbol_id, group_symbol_id, profile_id, ref_kind, order_index)
708+
(parent_symbol_id, compositor_id, group_symbol_id, profile_id, ref_kind, min_occurs, max_occurs, order_index)
636709
VALUES
637-
(${parentSymbolId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${orderIndex})
710+
(${parentSymbolId}, ${compositorId}, ${groupSymbolId}, ${profileId}, ${refKind}, ${minOccurs}, ${maxOccurs}, ${orderIndex})
638711
`;
639712
}
640713

@@ -721,13 +794,16 @@ async function handleAttribute(
721794
prefixMap: Map<string, string>,
722795
defaultNamespace: string,
723796
symbolIds: Map<string, number>,
797+
parseResult: ParsedSchemaSet,
724798
orderIndex: number,
725799
stats: IngestStats,
726800
): Promise<void> {
727801
const a = nodeAttrs(node);
728802
let localName: string | null = null;
729803
let attrSymbolId: number | null = null;
730804
let typeRef: string | null = null;
805+
let defaultValue: string | null = a.default ?? null;
806+
let fixedValue: string | null = a.fixed ?? null;
731807

732808
if (a.ref) {
733809
const resolved = resolveQNameAttr(a.ref, prefixMap, defaultNamespace);
@@ -740,6 +816,27 @@ async function handleAttribute(
740816
symbolKey(resolved.qname.vocabularyId, resolved.qname.localName, "attribute"),
741817
);
742818
if (id != null) attrSymbolId = id;
819+
820+
// Carry type/default/fixed from the top-level <xsd:attribute name="..."> declaration.
821+
// XSD forbids @type on a ref site, so it always comes from the declaration; default/fixed set on the ref win, otherwise fall back to the declaration's values.
822+
const declKey = `{${resolved.qname.namespace}}attribute:${resolved.qname.localName}`;
823+
const topDecl = parseResult.declarationsByQName.get(declKey)?.[0];
824+
if (topDecl) {
825+
const declAttrs = nodeAttrs(topDecl.node);
826+
if (declAttrs.type) {
827+
const declPrefixMap = parseResult.namespaceByPrefix.get(topDecl.documentPath);
828+
if (declPrefixMap) {
829+
const t = resolveQNameAttr(declAttrs.type, declPrefixMap, topDecl.namespace);
830+
typeRef = t.resolved
831+
? `{${t.qname.namespace}}${t.qname.localName}`
832+
: declAttrs.type;
833+
} else {
834+
typeRef = declAttrs.type;
835+
}
836+
}
837+
if (defaultValue == null) defaultValue = declAttrs.default ?? null;
838+
if (fixedValue == null) fixedValue = declAttrs.fixed ?? null;
839+
}
743840
} else if (a.name) {
744841
localName = a.name;
745842
if (a.type) {
@@ -756,9 +853,7 @@ async function handleAttribute(
756853

757854
const rawUse = a.use;
758855
const attrUse: "required" | "optional" | "prohibited" =
759-
rawUse === "required" || rawUse === "optional" || rawUse === "prohibited"
760-
? rawUse
761-
: "optional";
856+
rawUse === "required" || rawUse === "optional" || rawUse === "prohibited" ? rawUse : "optional";
762857

763858
await insertAttrEdge(
764859
sql,
@@ -767,8 +862,8 @@ async function handleAttribute(
767862
localName,
768863
profileId,
769864
attrUse,
770-
a.default ?? null,
771-
a.fixed ?? null,
865+
defaultValue,
866+
fixedValue,
772867
typeRef,
773868
orderIndex,
774869
);

tests/ingest-xsd/fixtures/main.xsd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,5 +52,8 @@
5252
<xsd:attributeGroup ref="AG_TableProps"/>
5353
<xsd:attribute name="caption" type="xsd:string" use="required"/>
5454
</xsd:complexType>
55+
<xsd:complexType name="CT_RefTest">
56+
<xsd:attribute ref="s:space" use="required"/>
57+
</xsd:complexType>
5558
<xsd:element name="document" type="CT_Empty"/>
5659
</xsd:schema>

tests/ingest-xsd/fixtures/shared.xsd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,5 @@
88
<xsd:simpleType name="ST_String">
99
<xsd:restriction base="xsd:string"/>
1010
</xsd:simpleType>
11+
<xsd:attribute name="space" type="xsd:string" default="preserve"/>
1112
</xsd:schema>

0 commit comments

Comments
 (0)