Skip to content

Commit cb3e16d

Browse files
committed
fix(xsd): scope local elements per-owner; link xsd-builtin symbols to profile; drop em dash
Three review-flagged correctness gaps before flipping ENABLE_OOXML_TOOLS. P1 - Local element symbols collapsed across complexTypes. Inline <xsd:element name="X" type="..."/> declared inside two different complexTypes was deduped under (vocabulary, name, kind), keeping only the first-seen type_ref. Real WML hits this on tblGrid alone: declared as CT_TblGridBase inside CT_TblGridChange and as CT_TblGrid inside CT_Tbl. ooxml_children("w:tblGrid") would have followed CT_TblGridBase and missed the CT_TblGrid children. Migration 0004 adds xsd_symbols.parent_symbol_id (nullable) and replaces the 3-tuple unique with a 4-tuple UNIQUE NULLS NOT DISTINCT (vocab, local_name, kind, parent_symbol_id). Top-level decls keep parent NULL and still collide on name; local decls are scoped to their owner. ingest.ts passes the owning symbol id when upserting local element symbols. P2 - xsd-builtin symbols had no profile membership. The on-demand inheritance pass created xsd:string / xsd:boolean / etc. via upsertSymbol but never called linkSymbolToProfile, so lookupSymbol (which JOINs xsd_symbol_profiles) returned null. Following an element's type_ref into a W3C built-in silently failed. Now also ensure the xs/xsd namespace exists and link the built-in symbol into the target profile. P3 - Em dash in code comment. scripts/ingest-xsd/qname.ts line 12 used "—". Replaced with "-". Tests: - New "local element symbols are scoped per-owner (no cross-CT collapse)" against a CT_OuterA / CT_OuterB fixture mirroring the WML tblGrid pattern: each `shared` element resolves to its own symbol with the correct per-owner type_ref. - New "xsd-builtin symbols have profile membership" verifies lookupSymbolByTypeRef succeeds for {...XMLSchema}string. - Existing fixture and WML smoke counts adjusted. 41 / 0 across db / ingest / mcp-server.
1 parent f1b3223 commit cb3e16d

8 files changed

Lines changed: 155 additions & 15 deletions

File tree

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
-- Phase 4 review fix: scope local element symbols by their owner.
2+
--
3+
-- Before this migration, an inline <xsd:element name="X" type="..."/> declared
4+
-- inside two different complexTypes/groups collapsed to a single symbol keyed
5+
-- on (vocabulary_id, local_name, kind). The first-seen type_ref won and the
6+
-- later one was silently dropped, so e.g. WML's `tblGrid` (CT_TblGridBase
7+
-- inside CT_TblGridChange vs CT_TblGrid inside CT_Tbl) gave a wrong answer
8+
-- for ooxml_children("w:tblGrid").
9+
--
10+
-- Fix: add parent_symbol_id to xsd_symbols and include it in the canonical key
11+
-- with NULLS NOT DISTINCT so two top-level declarations (parent NULL) still
12+
-- collide while local declarations are scoped per-owner.
13+
--
14+
-- Idempotent.
15+
16+
ALTER TABLE xsd_symbols
17+
ADD COLUMN IF NOT EXISTS parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE;
18+
19+
DO $$
20+
DECLARE cname TEXT;
21+
BEGIN
22+
-- Drop the auto-named 3-tuple unique constraint, regardless of what postgres
23+
-- ended up calling it.
24+
SELECT conname INTO cname
25+
FROM pg_constraint
26+
WHERE conrelid = 'xsd_symbols'::regclass
27+
AND contype = 'u'
28+
AND conkey = (
29+
SELECT array_agg(attnum ORDER BY attnum)
30+
FROM pg_attribute
31+
WHERE attrelid = 'xsd_symbols'::regclass
32+
AND attname IN ('vocabulary_id', 'local_name', 'kind')
33+
);
34+
IF cname IS NOT NULL THEN
35+
EXECUTE 'ALTER TABLE xsd_symbols DROP CONSTRAINT ' || quote_ident(cname);
36+
END IF;
37+
38+
IF NOT EXISTS (
39+
SELECT 1 FROM pg_constraint
40+
WHERE conname = 'xsd_symbols_canonical_key'
41+
AND conrelid = 'xsd_symbols'::regclass
42+
) THEN
43+
ALTER TABLE xsd_symbols
44+
ADD CONSTRAINT xsd_symbols_canonical_key
45+
UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id);
46+
END IF;
47+
END $$;
48+
49+
CREATE INDEX IF NOT EXISTS idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id);

db/schema.sql

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,22 @@ CREATE TABLE xsd_namespaces (
6767
-- type_ref holds the Clark-style {namespace}localName for elements and attributes
6868
-- that declare a @type. NULL for complexType/simpleType/group/attributeGroup.
6969
-- Phase 4 lookups follow type_ref to resolve element -> type when reading children.
70+
--
71+
-- parent_symbol_id is NULL for top-level declarations and set to the owning
72+
-- type/group symbol for inline (local) element declarations. The canonical
73+
-- key is 4-tuple with NULLS NOT DISTINCT so top-level decls still collide on
74+
-- name while local decls remain scoped per-owner.
7075
CREATE TABLE xsd_symbols (
7176
id SERIAL PRIMARY KEY,
7277
vocabulary_id TEXT NOT NULL,
7378
local_name TEXT NOT NULL,
7479
kind TEXT NOT NULL,
7580
type_ref TEXT,
81+
parent_symbol_id INT REFERENCES xsd_symbols(id) ON DELETE CASCADE,
7682
payload JSONB DEFAULT '{}'::jsonb,
7783
created_at TIMESTAMPTZ DEFAULT NOW(),
78-
UNIQUE (vocabulary_id, local_name, kind)
84+
CONSTRAINT xsd_symbols_canonical_key
85+
UNIQUE NULLS NOT DISTINCT (vocabulary_id, local_name, kind, parent_symbol_id)
7986
);
8087

8188
CREATE TABLE xsd_symbol_profiles (
@@ -181,6 +188,7 @@ CREATE TABLE behavior_notes (
181188
);
182189

183190
CREATE INDEX idx_xsd_symbols_lookup ON xsd_symbols(vocabulary_id, local_name, kind);
191+
CREATE INDEX idx_xsd_symbols_parent ON xsd_symbols(parent_symbol_id);
184192
CREATE INDEX idx_xsd_child_edges_parent ON xsd_child_edges(parent_symbol_id);
185193
CREATE INDEX idx_xsd_child_edges_compositor ON xsd_child_edges(compositor_id);
186194
CREATE INDEX idx_xsd_attr_edges_symbol ON xsd_attr_edges(symbol_id);

scripts/ingest-xsd/ingest.ts

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,16 @@ export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<Ing
192192
baseId = id;
193193
if (inserted) stats.symbolsInserted++;
194194
else stats.symbolsExisting++;
195+
// Link to a profile so ooxml_lookup_type / lookupSymbolByTypeRef can
196+
// follow type_refs into the W3C XSD namespace.
197+
let xsdNsId = namespaceIds.get(baseQ.namespace);
198+
if (xsdNsId == null) {
199+
xsdNsId = await ensureNamespace(sql, baseQ.namespace);
200+
namespaceIds.set(baseQ.namespace, xsdNsId);
201+
stats.namespacesEnsured++;
202+
}
203+
const linked = await linkSymbolToProfile(sql, id, profileId, xsdNsId, sourceId);
204+
if (linked) stats.profileMembershipsInserted++;
195205
}
196206
if (baseId == null) {
197207
stats.inheritanceUnresolved++;
@@ -473,7 +483,15 @@ async function handleElement(
473483
const r = resolveQNameAttr(a.type, ctx.prefixMap, ctx.ownerDecl.namespace);
474484
typeRef = r.resolved ? `{${r.qname.namespace}}${r.qname.localName}` : a.type;
475485
}
476-
const key = symbolKey(ctx.ownerDecl.vocabularyId, a.name, "element");
486+
// Local elements are scoped per-owner: the same name in two different
487+
// complexTypes is not the same symbol (e.g. WML's tblGrid is
488+
// CT_TblGridBase inside CT_TblGridChange but CT_TblGrid inside CT_Tbl).
489+
const key = symbolKey(
490+
ctx.ownerDecl.vocabularyId,
491+
a.name,
492+
"element",
493+
ctx.ownerSymbolId,
494+
);
477495
let id = ctx.symbolIds.get(key);
478496
if (id == null) {
479497
const res = await upsertSymbol(
@@ -482,6 +500,7 @@ async function handleElement(
482500
a.name,
483501
"element",
484502
typeRef,
503+
ctx.ownerSymbolId,
485504
);
486505
ctx.symbolIds.set(key, res.id);
487506
if (res.inserted) {
@@ -611,11 +630,12 @@ async function upsertSymbol(
611630
localName: string,
612631
kind: string,
613632
typeRef: string | null = null,
633+
parentSymbolId: number | null = null,
614634
): Promise<{ id: number; inserted: boolean }> {
615635
const [row] = await sql`
616-
INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref)
617-
VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef})
618-
ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE
636+
INSERT INTO xsd_symbols (vocabulary_id, local_name, kind, type_ref, parent_symbol_id)
637+
VALUES (${vocabularyId}, ${localName}, ${kind}, ${typeRef}, ${parentSymbolId})
638+
ON CONFLICT ON CONSTRAINT xsd_symbols_canonical_key DO UPDATE
619639
SET type_ref = COALESCE(EXCLUDED.type_ref, xsd_symbols.type_ref)
620640
RETURNING id, (xmax = 0) AS inserted
621641
`;
@@ -916,8 +936,8 @@ function stripPrefixLocal(tag: string | null): string | null {
916936
return colon < 0 ? tag : tag.slice(colon + 1);
917937
}
918938

919-
function symbolKey(vocab: string, local: string, kind: string): string {
920-
return `${vocab}|${local}|${kind}`;
939+
function symbolKey(vocab: string, local: string, kind: string, parentId: number | null = null): string {
940+
return `${vocab}|${local}|${kind}|${parentId ?? ""}`;
921941
}
922942

923943
// --- CLI -----------------------------------------------------------------

scripts/ingest-xsd/qname.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
*
1010
* 2. QName-valued attributes (ref, type, base, substitutionGroup, etc.) hold a
1111
* "prefix:localName" string. Resolution uses the document's xmlns:* declarations.
12-
* `resolveQNameAttr` returns either a resolved tuple or "unresolved" we never
12+
* `resolveQNameAttr` returns either a resolved tuple or "unresolved" - we never
1313
* invent a namespace for an unknown prefix.
1414
*/
1515

tests/ingest-xsd/fixtures/main.xsd

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,5 +95,18 @@
9595
<xsd:complexType name="CT_NestedAttrUser">
9696
<xsd:attributeGroup ref="AG_Outer"/>
9797
</xsd:complexType>
98+
<!-- Mirrors the WML tblGrid case: two complexTypes both declare a local
99+
element with the same name but DIFFERENT types. They must not collapse
100+
into a single symbol. -->
101+
<xsd:complexType name="CT_OuterA">
102+
<xsd:sequence>
103+
<xsd:element name="shared" type="ST_Jc"/>
104+
</xsd:sequence>
105+
</xsd:complexType>
106+
<xsd:complexType name="CT_OuterB">
107+
<xsd:sequence>
108+
<xsd:element name="shared" type="xsd:string"/>
109+
</xsd:sequence>
110+
</xsd:complexType>
98111
<xsd:element name="document" type="CT_Empty"/>
99112
</xsd:schema>

tests/ingest-xsd/ingest.test.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -223,12 +223,17 @@ test("ingest writes compositors and child edges for nested content models", asyn
223223
// CT_BaseWithChildren: sequence -> [ alpha, beta ]
224224
// CT_DerivedExtended: complexContent/extension -> sequence -> [ gamma ]
225225
// CT_NestedOrder: sequence -> [ head, choice -> [ branchA, branchB ], tail ]
226-
// Compositors: CT_Para(1) + CT_Body(2) + EG_PContent(1) + Base(1) + Derived(1) + Nested(2) = 8
227-
expect(stats.compositorsInserted).toBe(8);
226+
// Compositors: CT_Para(1) + CT_Body(2) + EG_PContent(1) + Base(1) + Derived(1) +
227+
// Nested(2) + OuterA(1) + OuterB(1) = 10
228+
expect(stats.compositorsInserted).toBe(10);
228229
expect(stats.groupRefsInserted).toBe(1);
229-
// Local element names (deduped per vocab): text, break, r, alpha, beta, gamma,
230-
// head, branchA, branchB, tail = 10.
231-
expect(stats.localElementsCreated).toBe(10);
230+
// Local element symbols are scoped per-owner now, so the two `shared` decls
231+
// in CT_OuterA and CT_OuterB count separately.
232+
// Per-owner locals: text(CT_Para), break(CT_Body), r(EG_PContent),
233+
// alpha+beta(CT_BaseWithChildren), gamma(CT_DerivedExtended),
234+
// head+branchA+branchB+tail(CT_NestedOrder), shared(CT_OuterA),
235+
// shared(CT_OuterB) = 12.
236+
expect(stats.localElementsCreated).toBe(12);
232237
expect(stats.childEdgesUnresolved).toBe(0);
233238
expect(stats.groupRefsUnresolved).toBe(0);
234239

tests/ingest-xsd/parse-schema.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,10 @@ test("declarationsByQName indexes all top-level declarations across documents",
8585
const set = await parseSchemaSet({ schemaDir: FIXTURES_DIR, entrypoints: ["main.xsd"] });
8686

8787
const counts = countByKind(set.declarationsByQName);
88-
// main.xsd: 1 element, 11 complexType, 1 simpleType, 1 group, 3 attributeGroup
88+
// main.xsd: 1 element, 13 complexType, 1 simpleType, 1 group, 3 attributeGroup
8989
// shared.xsd: 2 simpleType, 1 attribute
9090
expect(counts.element).toBe(1);
91-
expect(counts.complexType).toBe(11);
91+
expect(counts.complexType).toBe(13);
9292
expect(counts.simpleType).toBe(3);
9393
expect(counts.group).toBe(1);
9494
expect(counts.attributeGroup).toBe(3);

tests/mcp-server/ooxml-queries.test.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,51 @@ test("getChildren: nested compositor flatten preserves document order (CT_Nested
242242
expect(branchA?.compositorPath).toEqual(["sequence(1..1)", "choice(0..unbounded)"]);
243243
});
244244

245+
test("local element symbols are scoped per-owner (no cross-CT collapse)", async () => {
246+
// Mirrors the WML tblGrid case (CT_TblGridBase inside CT_TblGridChange vs
247+
// CT_TblGrid inside CT_Tbl). CT_OuterA / CT_OuterB both declare an inline
248+
// 'shared' element but with different @type. They must produce distinct
249+
// per-parent symbols, each carrying its own type_ref.
250+
const ctA = await lookupType(db.sql, WML_NS, "CT_OuterA", "transitional");
251+
const ctB = await lookupType(db.sql, WML_NS, "CT_OuterB", "transitional");
252+
if (!ctA || !ctB) throw new Error("CT_OuterA / CT_OuterB not found");
253+
254+
const aChildren = await getChildren(db.sql, ctA.id, "transitional");
255+
const bChildren = await getChildren(db.sql, ctB.id, "transitional");
256+
expect(aChildren).toHaveLength(1);
257+
expect(bChildren).toHaveLength(1);
258+
expect(aChildren[0].localName).toBe("shared");
259+
expect(bChildren[0].localName).toBe("shared");
260+
261+
// The two `shared` symbols carry different type_refs.
262+
const sharedSymbols = await db.sql`
263+
SELECT s.id, s.type_ref, s.parent_symbol_id, parent.local_name AS parent_name
264+
FROM xsd_symbols s
265+
JOIN xsd_symbols parent ON parent.id = s.parent_symbol_id
266+
WHERE s.local_name = 'shared' AND s.kind = 'element'
267+
ORDER BY parent.local_name
268+
`;
269+
expect(sharedSymbols).toHaveLength(2);
270+
expect(sharedSymbols[0].parent_name).toBe("CT_OuterA");
271+
expect(sharedSymbols[0].type_ref).toBe(`{${WML_NS}}ST_Jc`);
272+
expect(sharedSymbols[1].parent_name).toBe("CT_OuterB");
273+
expect(sharedSymbols[1].type_ref).toBe("{http://www.w3.org/2001/XMLSchema}string");
274+
});
275+
276+
test("xsd-builtin symbols have profile membership (lookupSymbolByTypeRef can follow xsd:string)", async () => {
277+
// Built-ins like xsd:string are auto-created during inheritance resolution and
278+
// must be linked to xsd_symbol_profiles, otherwise ooxml_lookup_type for
279+
// 'xsd:string' and lookupSymbolByTypeRef for {...XMLSchema}string return null.
280+
const t = await lookupSymbolByTypeRef(
281+
db.sql,
282+
"{http://www.w3.org/2001/XMLSchema}string",
283+
"transitional",
284+
);
285+
expect(t).not.toBeNull();
286+
expect(t?.localName).toBe("string");
287+
expect(t?.vocabularyId).toBe("xsd-builtin");
288+
});
289+
245290
test("getAttributes: nested attributeGroup chain unfolds (CT_NestedAttrUser -> innerAttr + outerAttr)", async () => {
246291
const ct = await lookupType(db.sql, WML_NS, "CT_NestedAttrUser", "transitional");
247292
if (!ct) throw new Error("CT_NestedAttrUser not found");

0 commit comments

Comments
 (0)