From 83d49bf87cf0e4d6db716db7116dc2ce0a723583 Mon Sep 17 00:00:00 2001 From: Jadenzzz <94533693+Jadenzzz@users.noreply.github.com> Date: Thu, 14 May 2026 17:14:08 +1000 Subject: [PATCH 1/2] first commit --- src/commands/import.js | 122 +++++++++++++++++++++++++++++++---------- src/prompts/index.js | 6 +- 2 files changed, 97 insertions(+), 31 deletions(-) diff --git a/src/commands/import.js b/src/commands/import.js index 9e1c76d..c728940 100644 --- a/src/commands/import.js +++ b/src/commands/import.js @@ -87,9 +87,10 @@ export async function importDocs(options) { const { llms, llmsUrl } = await timePhase('fetch llms.txt', async () => { for (const candidate of llmsCandidates) { - const res = await fetchLlmsTxt(candidate) + const res = await fetchLlmsTxt(candidate, sourceUrl) if (res.ok) return { llms: res, llmsUrl: candidate } - styles.info(styles.dim(` ${candidate} → ${res.status ? `HTTP ${res.status}` : res.error || 'failed'}`)) + const reason = res.error || (res.status ? `HTTP ${res.status}` : 'failed') + styles.info(styles.dim(` ${candidate} → ${reason}`)) } return { llms: null, llmsUrl: null } }) @@ -316,7 +317,7 @@ export async function importDocs(options) { } else if (llms) { const fastPath = sectionsLookUsable(llms.parsed.sections) styles.info(`Organizing with Claude (${styles.bold(options.model)}, ${fastPath ? 'fast path: icons only' : 'full reorg'})...`) - organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model)) + organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model, sourceUrl)) } else { // Sitemap-only fallback: synthesize categories by clustering the URL // paths. 
clusterByUrlPath returns null when the URLs don't split cleanly, @@ -2054,11 +2055,11 @@ function sectionsLookUsable(sections) { return usableSections(sections).length >= 3 } -async function organizeWithClaude(parsed, model) { +async function organizeWithClaude(parsed, model, sourceUrl) { if (sectionsLookUsable(parsed.sections)) { return organizeFromSections(parsed, model) } - return organizeFromScratch(parsed, model) + return organizeFromScratch(parsed, model, sourceUrl) } /** @@ -2097,7 +2098,7 @@ async function organizeFromSections(parsed, model) { return { title: parsed.title || null, categories } } -async function organizeFromScratch(parsed, model) { +async function organizeFromScratch(parsed, model, sourceUrl) { const items = parsed.sections.flatMap((s) => s.items.map((i) => ({ section: s.title, @@ -2110,6 +2111,7 @@ async function organizeFromScratch(parsed, model) { const { systemPrompt, userPrompt } = organizeFromScratchPrompt({ siteTitle: parsed.title, items, + sourceUrl: sourceUrl ? sourceUrl.toString() : undefined, }) const raw = await runJsonQuery({ systemPrompt, userPrompt, model }) @@ -2242,10 +2244,10 @@ function buildLlmsCandidates(sourceUrl) { } /** - * Best-effort fetch of a site's /llms.txt. Returns { ok, status, error, parsed } - * where parsed is { title, sections: [{ title, items: [{ text, url, description }] }] }. + * Fetch and parse a /llms.txt. Intended to reject text/html responses, since SPAs + * commonly return their index for unmatched paths. NOTE(review): no Content-Type check is visible in this hunk — confirm. 
*/ -async function fetchLlmsTxt(llmsUrl) { +async function fetchLlmsTxt(llmsUrl, sourceUrl) { try { const res = await fetch(llmsUrl, { redirect: 'follow', @@ -2253,50 +2255,112 @@ async function fetchLlmsTxt(llmsUrl) { }) if (!res.ok) return { ok: false, status: res.status } const text = await res.text() - return { ok: true, status: res.status, parsed: parseLlmsTxt(text) } + return { ok: true, status: res.status, parsed: parseLlmsTxt(text, { sourceUrl }) } } catch (e) { return { ok: false, error: e.message } } } /** - * Parse the llms.txt format. `##` headings become sections; - * `- [text](url): description` bullets become items. Items before any `##` - * land in an implicit "Resources" section. + * Parse the llms.txt format. + * + * Two link patterns are recognized: + * - Line-start bullets (`- `, `* `, `+ ` — CommonMark allows all three). + * - Inline `[text](url)` anywhere in prose. Some files (e.g. Shopify) keep + * most refs in paragraphs rather than bullet lists, so the inline pass + * is needed for usable coverage. + * + * Headings: `##`–`####`. Treating only `##` collapses deeply-nested files + * into a single mega-section. + * + * When `options.sourceUrl` is provided, all items must share its origin. 
*/ -function parseLlmsTxt(body) { +function parseLlmsTxt(body, options = {}) { const lines = body.split(/\r?\n/) let title = null const sections = [] let current = null + let inCodeFence = false + const seenUrls = new Set() + + const headingRe = /^#{2,4}\s+(.+)$/ + const itemRe = /^\s*[-*+]\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/ + const inlineLinkRe = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g + + let originFilter = null + if (options.sourceUrl) { + try { originFilter = new URL(options.sourceUrl).origin } catch {} + } - const itemRe = /^\s*-\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/ + const cleanUrl = (u) => u.replace(/[.,;]+$/, '') + + // Peels paired markdown emphasis (`**x**`, `*x*`, `__x__`, `_x_`, and + // combined forms). `\1` backref forces matched wrappers; asymmetric + // input is left untouched. + const stripEmphasis = (text) => { + let t = text.trim() + for (let i = 0; i < 3; i++) { + const m = t.match(/^(\*{1,3}|_{1,3})([\s\S]+?)\1$/) + if (!m) break + t = m[2].trim() + } + return t + } + + const passesOriginFilter = (url) => { + if (!originFilter) return true + try { return new URL(url).origin === originFilter } catch { return false } + } + + const pushItem = ({ text, url, description }) => { + if (!passesOriginFilter(url)) return + if (seenUrls.has(url)) return + seenUrls.add(url) + if (!current) { + current = { title: 'Resources', items: [] } + sections.push(current) + } + current.items.push({ + text: stripEmphasis(text), + url, + description: description || null, + }) + } for (const line of lines) { + if (/^\s*```/.test(line)) { + inCodeFence = !inCodeFence + continue + } + if (inCodeFence) continue + const h1 = line.match(/^#\s+(.+)$/) if (h1 && !title) { - title = h1[1].trim() + title = stripEmphasis(h1[1]) continue } - const h2 = line.match(/^##\s+(.+)$/) - if (h2) { - current = { title: h2[1].trim(), items: [] } + const heading = line.match(headingRe) + if (heading) { + current = { title: 
stripEmphasis(heading[1]), items: [] } sections.push(current) continue } - const item = line.match(itemRe) - if (item) { - if (!current) { - current = { title: 'Resources', items: [] } - sections.push(current) - } - current.items.push({ - text: item[1].trim(), - url: item[2].replace(/[.,;]+$/, ''), - description: item[3] ? item[3].trim() : null, + const bullet = line.match(itemRe) + if (bullet) { + pushItem({ + text: bullet[1].trim(), + url: cleanUrl(bullet[2]), + description: bullet[3] ? bullet[3].trim() : null, }) + // Bullet's first link already captured; further links on this line + // belong to its description, not as separate items. + continue + } + + for (const m of line.matchAll(inlineLinkRe)) { + pushItem({ text: m[1].trim(), url: cleanUrl(m[2]), description: null }) } } diff --git a/src/prompts/index.js b/src/prompts/index.js index 19fd234..4c67e2a 100644 --- a/src/prompts/index.js +++ b/src/prompts/index.js @@ -173,7 +173,7 @@ export const organizeFromScratchSystemPrompt = [ * Pre-flattened page list. The id used in the prompt is the array index. 
* @returns {string} */ -export function buildOrganizeFromScratchUserPrompt({ siteTitle, items }) { +export function buildOrganizeFromScratchUserPrompt({ siteTitle, items, sourceUrl }) { const compactLines = items.map((it, idx) => { let relPath = it.url; try { relPath = new URL(it.url).pathname + new URL(it.url).search; } catch {} @@ -181,7 +181,9 @@ export function buildOrganizeFromScratchUserPrompt({ siteTitle, items }) { }); let origin = '(unknown)'; - if (items.length > 0) { + if (sourceUrl) { + try { origin = new URL(sourceUrl).origin; } catch {} + } else if (items.length > 0) { try { origin = new URL(items[0].url).origin; } catch {} } From 57f7834b48737fc2e2384fe7ed4e42c79242f66f Mon Sep 17 00:00:00 2001 From: Xavier Andueza Date: Fri, 15 May 2026 07:43:26 +1000 Subject: [PATCH 2/2] fix: ensure that the urls from llms.txt don't have anchors, query params etc --- src/commands/import.js | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/src/commands/import.js b/src/commands/import.js index 1a954a3..3fbf9df 100644 --- a/src/commands/import.js +++ b/src/commands/import.js @@ -143,11 +143,9 @@ export async function importDocs(options) { const rawKnownUrls = llms.parsed.sections.flatMap((s) => s.items.map((i) => ({ title: i.text, url: i.url, description: i.description }))) - // Dedupe llms.txt entries by pathname. Some sites (zod.dev, fumadocs) list - // every in-page anchor as its own llms.txt row (`/v4?id=wrapping-up`, - // `/v4?id=metadata`, …) even though they all live on one rendered page. - // We prefer the "cleanest" URL per path — the shortest one, which is - // usually the one without a query string or hash. + // Dedupe llms.txt entries by pathname. Some sites list every in-page anchor + // as its own llms.txt row (`/custom-data#anchor1`, `/custom-data#anchor2`, …) + // even though they all live on one rendered page. 
const byKnownPath = new Map() for (const p of rawKnownUrls) { const key = normalizePath(p.url) @@ -159,6 +157,18 @@ export async function importDocs(options) { if (dropped > 0) { styles.info(`${styles.dim(`Collapsed ${dropped} anchor/query duplicates → ${knownUrls.length} unique pages.`)}`) } + + const seenSectionPaths = new Set() + for (const section of llms.parsed.sections) { + section.items = section.items.filter((it) => { + const key = normalizePath(it.url) + const kept = byKnownPath.get(key) + if (!kept || it.url !== kept.url) return false + if (seenSectionPaths.has(key)) return false + seenSectionPaths.add(key) + return true + }) + } } else if (sitemapKnownUrls.length > 0) { knownUrls = sitemapKnownUrls } @@ -1730,6 +1740,7 @@ function slotOrphansByPath(scraped, knownPages) { matched.add(norm) } else { orphans.push(p) + matched.add(norm) } } return orphans @@ -2053,18 +2064,27 @@ function normalizePath(url) { * Returns the orphans Claude couldn't slot. */ async function slotOrphansWithClaude(scraped, orphans, model) { + const seenPaths = new Set(scraped.categories.flatMap((c) => c.pages.map((p) => normalizePath(p.url)))) + const dedupedOrphans = [] + for (const p of orphans) { + const norm = normalizePath(p.url) + if (seenPaths.has(norm)) continue + seenPaths.add(norm) + dedupedOrphans.push(p) + } + const { systemPrompt, userPrompt } = slotOrphansPrompt({ categories: scraped.categories, - orphans, + orphans: dedupedOrphans, }) const raw = await runJsonQuery({ systemPrompt, userPrompt, model }) if (!Array.isArray(raw)) { // If the model didn't cooperate, return all orphans unassigned. - return orphans + return dedupedOrphans } const leftover = [] - orphans.forEach((p, i) => { + dedupedOrphans.forEach((p, i) => { const idx = Number.isInteger(raw[i]) ? raw[i] : -1 if (idx >= 0 && idx < scraped.categories.length) { scraped.categories[idx].pages.push({