|
| 1 | +/** |
| 2 | + * Phase 3c: ingest top-level symbols and inheritance edges. |
| 3 | + * |
| 4 | + * Walks parseSchemaSet output and writes: |
| 5 | + * - xsd_profiles (bootstrap target profile, idempotent) |
| 6 | + * - xsd_namespaces (one row per unique URI seen across documents) |
| 7 | + * - xsd_symbols (canonical (vocabulary_id, local_name, kind), upsert by natural key) |
| 8 | + * - xsd_symbol_profiles (membership for the target profile, with source_id) |
| 9 | + * - xsd_inheritance_edges (extension/restriction from complexContent/simpleContent |
| 10 | + * and simpleType/restriction) |
| 11 | + * |
| 12 | + * NOT touched here (Phases 3d/3e): |
| 13 | + * - xsd_compositors, xsd_child_edges (content models) |
| 14 | + * - xsd_attr_edges (attributes) |
| 15 | + * - xsd_group_edges (group/attributeGroup refs) |
| 16 | + * - xsd_enums (simpleType enumerations) |
| 17 | + * |
| 18 | + * Idempotency: the entire ingest runs in a single transaction. Re-running |
| 19 | + * against the same source produces no new rows (UNIQUE + ON CONFLICT DO NOTHING, or a no-op DO UPDATE where the id is needed back). |
| 20 | + * Stale-row cleanup (when symbols vanish in a future edition) is deferred, |
| 21 | + * see PLAN.md "Edition flip and behavior_notes" open item. |
| 22 | + * |
| 23 | + * Usage as a library: |
| 24 | + * await ingestSchemaSet({ schemaDir, entrypoints, profileName, sourceName, db }) |
| 25 | + * |
| 26 | + * Usage as a CLI: |
| 27 | + * bun scripts/ingest-xsd/ingest.ts |
| 28 | + * bun scripts/ingest-xsd/ingest.ts --schema-dir <dir> --entrypoint wml.xsd \ |
| 29 | + * --profile transitional --source ecma-376-transitional |
| 30 | + */ |
| 31 | + |
| 32 | +import { createDbClient, type DbClient } from "../../packages/shared/src/db/index.ts"; |
| 33 | +import { nodeAttrs } from "./ast.ts"; |
| 34 | +import { parseSchemaSet } from "./parse-schema.ts"; |
| 35 | +import { resolveQNameAttr } from "./qname.ts"; |
| 36 | +import type { |
| 37 | + Declaration, |
| 38 | + ParsedSchemaSet, |
| 39 | + PreserveOrderNode, |
| 40 | +} from "./types.ts"; |
| 41 | +import { vocabularyForNamespace } from "./vocabulary.ts"; |
| 42 | + |
// Handle used by the DB helpers below as a tagged-template SQL function; it is
// the value handed to the `db.sql.begin(...)` callback. Deliberately untyped.
// biome-ignore lint/suspicious/noExplicitAny: postgres library typing is intricate; helpers stay generic.
type Sql = any;
| 45 | + |
/** Inputs for {@link ingestSchemaSet}. */
export interface IngestSchemaSetOptions {
  /** Schema directory, forwarded unchanged to `parseSchemaSet`. */
  schemaDir: string;
  /** Entrypoint schema files, forwarded unchanged to `parseSchemaSet`. */
  entrypoints: string[];
  /** Profile name to attach symbols to (e.g. "transitional"). Bootstrap if missing. */
  profileName: string;
  /** Source name in reference_sources; used for source_id on xsd_symbol_profiles. */
  sourceName: string;
  /** Existing DbClient. The ingest opens its own transaction inside. */
  db: DbClient;
}
| 56 | + |
/** Counters returned by {@link ingestSchemaSet} describing one ingest run. */
export interface IngestStats {
  /** Number of documents in the parsed schema set. */
  documents: number;
  /** Symbol upserts that created a new xsd_symbols row. */
  symbolsInserted: number;
  /** Symbol upserts that hit an already existing xsd_symbols row. */
  symbolsExisting: number;
  /** Distinct document target namespaces ensured in xsd_namespaces. */
  namespacesEnsured: number;
  /** New xsd_symbol_profiles rows created for the target profile. */
  profileMembershipsInserted: number;
  /** New xsd_inheritance_edges rows created. */
  inheritanceEdgesInserted: number;
  /** Derivations skipped because their base could not be resolved to a known symbol. */
  inheritanceUnresolved: number;
}
| 66 | + |
/**
 * Parses the schema set described by `opts` and writes its top-level symbols
 * and inheritance edges to the database inside a single transaction.
 *
 * Pass 1 ensures one namespace row per document target namespace, upserts one
 * symbol per (vocabulary, local name, kind) and links it to the target profile.
 * Pass 2 resolves each derived type's `base` QName through the declaring
 * document's prefix map and records an extension/restriction edge.
 *
 * @param opts - Schema location, profile/source names and the database client.
 * @returns Counters for what was inserted, already present, or left unresolved.
 * @throws Error if the `reference_sources` row for `opts.sourceName` is missing,
 *   or if a declaration's namespace was not registered during pass 1.
 */
export async function ingestSchemaSet(opts: IngestSchemaSetOptions): Promise<IngestStats> {
  const parseResult = await parseSchemaSet({
    schemaDir: opts.schemaDir,
    entrypoints: opts.entrypoints,
  });

  const stats: IngestStats = {
    documents: parseResult.documents.size,
    symbolsInserted: 0,
    symbolsExisting: 0,
    namespacesEnsured: 0,
    profileMembershipsInserted: 0,
    inheritanceEdgesInserted: 0,
    inheritanceUnresolved: 0,
  };

  // The `base` of an extension/restriction always names a type definition.
  // Elements, groups and attributes live in separate symbol spaces, so a
  // same-named non-type symbol must never be picked up as the base.
  const baseKinds: ReadonlyArray<Declaration["kind"]> = ["complexType", "simpleType"];

  await opts.db.sql.begin(async (sql: Sql) => {
    const profileId = await ensureProfile(sql, opts.profileName);
    const sourceId = await lookupSourceId(sql, opts.sourceName);

    // Pass 1: namespaces, symbols, profile memberships.
    const namespaceIds = new Map<string, number>();
    const symbolIds = new Map<string, number>(); // canonical (vocab|local|kind) -> id

    for (const doc of parseResult.documents.values()) {
      if (namespaceIds.has(doc.targetNamespace)) continue;
      const id = await ensureNamespace(sql, doc.targetNamespace);
      namespaceIds.set(doc.targetNamespace, id);
      stats.namespacesEnsured++;
    }

    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const key = symbolKey(decl.vocabularyId, decl.localName, decl.kind);
        // The same canonical symbol may be declared more than once; write it once.
        if (symbolIds.has(key)) continue;
        const { id, inserted } = await upsertSymbol(
          sql,
          decl.vocabularyId,
          decl.localName,
          decl.kind,
        );
        symbolIds.set(key, id);
        if (inserted) stats.symbolsInserted++;
        else stats.symbolsExisting++;

        const nsId = namespaceIds.get(decl.namespace);
        // Explicit null check: "missing" is the condition, not "falsy id".
        if (nsId == null) {
          throw new Error(
            `Internal: missing namespace id for ${decl.namespace} (decl ${decl.localName})`,
          );
        }
        const linked = await linkSymbolToProfile(sql, id, profileId, nsId, sourceId);
        if (linked) stats.profileMembershipsInserted++;
      }
    }

    // Pass 2: inheritance edges. Resolve base qname through the document's
    // prefix map; ensure built-in xsd:* placeholders exist on demand.
    for (const decls of parseResult.declarationsByQName.values()) {
      for (const decl of decls) {
        const inherit = findInheritance(decl);
        if (!inherit) continue;

        const prefixMap = parseResult.namespaceByPrefix.get(decl.documentPath);
        if (!prefixMap) {
          // Without a prefix map the base cannot be resolved; report it
          // instead of dropping the derivation silently.
          stats.inheritanceUnresolved++;
          continue;
        }
        const resolved = resolveQNameAttr(inherit.baseQName, prefixMap, decl.namespace);
        if (!resolved.resolved) {
          stats.inheritanceUnresolved++;
          continue;
        }
        const baseQ = resolved.qname;
        if (!baseQ.vocabularyId) {
          stats.inheritanceUnresolved++;
          continue;
        }

        // Look up existing symbol; for xsd-builtin, ensure on demand.
        let baseId: number | null = null;
        for (const kind of baseKinds) {
          const id = symbolIds.get(symbolKey(baseQ.vocabularyId, baseQ.localName, kind));
          if (id != null) {
            baseId = id;
            break;
          }
        }
        if (baseId == null && baseQ.vocabularyId === "xsd-builtin") {
          // NOTE(review): every built-in placeholder is stored as "simpleType",
          // including names such as anyType — confirm this is intended.
          const { id, inserted } = await upsertSymbol(
            sql,
            "xsd-builtin",
            baseQ.localName,
            "simpleType",
          );
          symbolIds.set(symbolKey("xsd-builtin", baseQ.localName, "simpleType"), id);
          baseId = id;
          if (inserted) stats.symbolsInserted++;
          else stats.symbolsExisting++;
        }
        if (baseId == null) {
          stats.inheritanceUnresolved++;
          continue;
        }

        const childId = symbolIds.get(symbolKey(decl.vocabularyId, decl.localName, decl.kind));
        if (childId == null) continue;

        const inserted = await insertInheritance(sql, childId, baseId, profileId, inherit.relation);
        if (inserted) stats.inheritanceEdgesInserted++;
      }
    }
  });

  return stats;
}
| 189 | + |
| 190 | +// --- DB helpers ---------------------------------------------------------- |
| 191 | + |
/**
 * Returns the id of the xsd_profiles row named `name`, creating it if absent.
 * The no-op DO UPDATE makes RETURNING yield the id on conflict as well.
 */
async function ensureProfile(sql: Sql, name: string): Promise<number> {
  const result = await sql`
    INSERT INTO xsd_profiles (name) VALUES (${name})
    ON CONFLICT (name) DO UPDATE SET name = EXCLUDED.name
    RETURNING id
  `;
  const profile = result[0];
  return profile.id;
}
| 200 | + |
/**
 * Looks up the id of the reference_sources row named `name`.
 *
 * @throws Error when no such row exists.
 */
async function lookupSourceId(sql: Sql, name: string): Promise<number> {
  const matches = await sql`SELECT id FROM reference_sources WHERE name = ${name} LIMIT 1`;
  const source = matches[0];
  if (source) return source.id;
  throw new Error(`reference_sources row not found for name='${name}'. Run db:sync-sources first.`);
}
| 206 | + |
/**
 * Returns the id of the xsd_namespaces row for `uri`, creating it if absent.
 * The no-op DO UPDATE makes RETURNING yield the id on conflict as well.
 */
async function ensureNamespace(sql: Sql, uri: string): Promise<number> {
  const result = await sql`
    INSERT INTO xsd_namespaces (uri) VALUES (${uri})
    ON CONFLICT (uri) DO UPDATE SET uri = EXCLUDED.uri
    RETURNING id
  `;
  const namespace = result[0];
  return namespace.id;
}
| 215 | + |
/**
 * Upserts an xsd_symbols row by its natural key (vocabulary_id, local_name, kind).
 *
 * @returns The row id and the `inserted` flag computed by the query as `xmax = 0`.
 */
async function upsertSymbol(
  sql: Sql,
  vocabularyId: string,
  localName: string,
  kind: string,
): Promise<{ id: number; inserted: boolean }> {
  const result = await sql`
    INSERT INTO xsd_symbols (vocabulary_id, local_name, kind)
    VALUES (${vocabularyId}, ${localName}, ${kind})
    ON CONFLICT (vocabulary_id, local_name, kind) DO UPDATE SET kind = EXCLUDED.kind
    RETURNING id, (xmax = 0) AS inserted
  `;
  const { id, inserted } = result[0];
  return { id, inserted };
}
| 230 | + |
/**
 * Records that a symbol belongs to a profile, tagged with namespace and source.
 *
 * @returns `true` when a new membership row was created, `false` when the
 *   (symbol_id, profile_id) pair already existed.
 */
async function linkSymbolToProfile(
  sql: Sql,
  symbolId: number,
  profileId: number,
  namespaceId: number,
  sourceId: number,
): Promise<boolean> {
  const created = await sql`
    INSERT INTO xsd_symbol_profiles (symbol_id, profile_id, namespace_id, source_id)
    VALUES (${symbolId}, ${profileId}, ${namespaceId}, ${sourceId})
    ON CONFLICT (symbol_id, profile_id) DO NOTHING
    RETURNING id
  `;
  // DO NOTHING returns no row on conflict, so a non-empty result means "inserted".
  return created.length >= 1;
}
| 246 | + |
/**
 * Records an extension/restriction edge from a symbol to its base symbol
 * within a profile.
 *
 * @returns `true` when a new edge row was created, `false` when an edge for
 *   the (symbol_id, profile_id) pair already existed.
 */
async function insertInheritance(
  sql: Sql,
  symbolId: number,
  baseSymbolId: number,
  profileId: number,
  relation: "extension" | "restriction",
): Promise<boolean> {
  const created = await sql`
    INSERT INTO xsd_inheritance_edges (symbol_id, base_symbol_id, profile_id, relation)
    VALUES (${symbolId}, ${baseSymbolId}, ${profileId}, ${relation})
    ON CONFLICT (symbol_id, profile_id) DO NOTHING
    RETURNING id
  `;
  // DO NOTHING returns no row on conflict, so a non-empty result means "inserted".
  return created.length >= 1;
}
| 262 | + |
| 263 | +// --- Inheritance discovery from AST ------------------------------------- |
| 264 | + |
/** A derivation found on a type declaration: the raw `base` QName and how it is derived. */
interface InheritanceFinding {
  baseQName: string;
  relation: "extension" | "restriction";
}

/**
 * Finds the first derivation carrying a `base` attribute on a type declaration.
 *
 * - complexType: looks inside complexContent/simpleContent children for an
 *   extension or restriction element.
 * - simpleType: looks for a direct restriction child.
 *
 * Tag prefixes are ignored when matching element names.
 *
 * @returns The finding, or `null` for other kinds or when no `base` is present.
 */
function findInheritance(decl: Declaration): InheritanceFinding | null {
  switch (decl.kind) {
    case "complexType": {
      for (const content of nodeChildrenLocal(decl.node)) {
        const contentTag = stripPrefixLocal(nodeTagLocal(content));
        if (contentTag === "complexContent" || contentTag === "simpleContent") {
          for (const derivation of nodeChildrenLocal(content)) {
            const relation = stripPrefixLocal(nodeTagLocal(derivation));
            if (relation === "extension" || relation === "restriction") {
              const baseQName = nodeAttrs(derivation).base;
              if (baseQName) return { baseQName, relation };
            }
          }
        }
      }
      return null;
    }
    case "simpleType": {
      for (const candidate of nodeChildrenLocal(decl.node)) {
        if (stripPrefixLocal(nodeTagLocal(candidate)) === "restriction") {
          const baseQName = nodeAttrs(candidate).base;
          if (baseQName) return { baseQName, relation: "restriction" };
        }
      }
      return null;
    }
    default:
      return null;
  }
}
| 294 | + |
/** Returns the first key of `node` other than the ":@" key, or `null` if there is none. */
function nodeTagLocal(node: PreserveOrderNode): string | null {
  const tagKey = Object.keys(node).find((key) => key !== ":@");
  return tagKey ?? null;
}
/**
 * Returns the array stored under the node's tag key, or an empty array when
 * the node has no tag key or the value is not an array.
 */
function nodeChildrenLocal(node: PreserveOrderNode): PreserveOrderNode[] {
  const tagName = nodeTagLocal(node);
  const payload = tagName ? node[tagName] : undefined;
  if (Array.isArray(payload)) return payload as PreserveOrderNode[];
  return [];
}
/**
 * Drops everything up to and including the first ":" of a tag name.
 * Returns `null` for a null or empty tag; a tag without ":" is returned as is.
 */
function stripPrefixLocal(tag: string | null): string | null {
  if (!tag) return null;
  // indexOf yields -1 when there is no ":", so the slice then starts at 0.
  return tag.slice(tag.indexOf(":") + 1);
}
| 310 | + |
/** Builds the in-memory cache key for a symbol: `vocab|local|kind`. */
function symbolKey(vocab: string, local: string, kind: string): string {
  return [vocab, local, kind].join("|");
}
| 314 | + |
| 315 | +// --- CLI ----------------------------------------------------------------- |
| 316 | + |
/** Options accepted by the ingest CLI. */
interface CliArgs {
  schemaDir: string;
  entrypoints: string[];
  profileName: string;
  sourceName: string;
}

/**
 * Parses `process.argv` into {@link CliArgs}.
 *
 * Recognised flags: `--schema-dir`, `--entrypoint` (repeatable), `--profile`,
 * `--source`. A flag given without a value keeps its default; unknown
 * arguments are ignored. When no entrypoint is supplied, `wml.xsd` is used.
 *
 * @returns The parsed arguments with defaults applied.
 */
function parseCliArgs(): CliArgs {
  const argv = process.argv.slice(2);
  let schemaDir = "./data/xsd-cache/ecma-376-transitional";
  const entrypoints: string[] = [];
  let profileName = "transitional";
  let sourceName = "ecma-376-transitional";
  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case "--schema-dir":
        schemaDir = argv[++i] ?? schemaDir;
        break;
      case "--entrypoint": {
        const value = argv[++i];
        // A trailing `--entrypoint` (or an empty value) must not register ""
        // as a schema path; skip it so the default below still applies.
        if (value) entrypoints.push(value);
        break;
      }
      case "--profile":
        profileName = argv[++i] ?? profileName;
        break;
      case "--source":
        sourceName = argv[++i] ?? sourceName;
        break;
      default:
        break;
    }
  }
  if (entrypoints.length === 0) entrypoints.push("wml.xsd");
  return { schemaDir, entrypoints, profileName, sourceName };
}
| 340 | + |
/**
 * CLI driver: parses arguments, requires DATABASE_URL, runs the ingest and
 * prints the run parameters, counters and elapsed time. The database client
 * is closed whether or not the ingest succeeds.
 */
async function main() {
  const args = parseCliArgs();
  const databaseUrl = process.env.DATABASE_URL;
  if (!databaseUrl) {
    console.error("Missing DATABASE_URL");
    process.exit(1);
  }
  const db = createDbClient(databaseUrl);

  const startedAt = Date.now();
  try {
    const stats = await ingestSchemaSet({ ...args, db });
    const elapsedMs = Date.now() - startedAt;
    // One [label, value] pair per output line, in print order.
    const report: Array<[string, string | number]> = [
      ["schemaDir", args.schemaDir],
      ["entrypoints", args.entrypoints.join(", ")],
      ["profile", args.profileName],
      ["source", args.sourceName],
      ["documents", stats.documents],
      ["symbols inserted", stats.symbolsInserted],
      ["symbols existing", stats.symbolsExisting],
      ["namespaces ensured", stats.namespacesEnsured],
      ["profile memberships", stats.profileMembershipsInserted],
      ["inheritance edges", stats.inheritanceEdgesInserted],
      ["inheritance unres.", stats.inheritanceUnresolved],
      ["elapsed", `${elapsedMs}ms`],
    ];
    for (const [label, value] of report) {
      console.log(`${label}: ${value}`);
    }
  } finally {
    await db.close();
  }
}
| 370 | + |
// Run the CLI only when this module's path equals Bun's main path, i.e. the
// file was executed directly rather than imported as a library.
const isEntrypoint = import.meta.path === Bun.main;
if (isEntrypoint) {
  void main().then(undefined, (failure) => {
    console.error("ingest failed:", failure);
    process.exit(1);
  });
}
0 commit comments