From 83d49bf87cf0e4d6db716db7116dc2ce0a723583 Mon Sep 17 00:00:00 2001
From: Jadenzzz <94533693+Jadenzzz@users.noreply.github.com>
Date: Thu, 14 May 2026 17:14:08 +1000
Subject: [PATCH 1/2] feat: parse inline links and nested headings in llms.txt

---
 src/commands/import.js | 122 +++++++++++++++++++++++++++++++----------
 src/prompts/index.js   |   6 +-
 2 files changed, 97 insertions(+), 31 deletions(-)

diff --git a/src/commands/import.js b/src/commands/import.js
index 9e1c76d..c728940 100644
--- a/src/commands/import.js
+++ b/src/commands/import.js
@@ -87,9 +87,10 @@ export async function importDocs(options) {
 
   const { llms, llmsUrl } = await timePhase('fetch llms.txt', async () => {
     for (const candidate of llmsCandidates) {
-      const res = await fetchLlmsTxt(candidate)
+      const res = await fetchLlmsTxt(candidate, sourceUrl)
       if (res.ok) return { llms: res, llmsUrl: candidate }
-      styles.info(styles.dim(`  ${candidate} → ${res.status ? `HTTP ${res.status}` : res.error || 'failed'}`))
+      const reason = res.error || (res.status ? `HTTP ${res.status}` : 'failed')
+      styles.info(styles.dim(`  ${candidate} → ${reason}`))
     }
     return { llms: null, llmsUrl: null }
   })
@@ -316,7 +317,7 @@ export async function importDocs(options) {
   } else if (llms) {
     const fastPath = sectionsLookUsable(llms.parsed.sections)
     styles.info(`Organizing with Claude (${styles.bold(options.model)}, ${fastPath ? 'fast path: icons only' : 'full reorg'})...`)
-    organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model))
+    organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model, sourceUrl))
   } else {
     // Sitemap-only fallback: synthesize categories by clustering the URL
     // paths. clusterByUrlPath returns null when the URLs don't split cleanly,
@@ -2054,11 +2055,11 @@ function sectionsLookUsable(sections) {
   return usableSections(sections).length >= 3
 }
 
-async function organizeWithClaude(parsed, model) {
+async function organizeWithClaude(parsed, model, sourceUrl) {
   if (sectionsLookUsable(parsed.sections)) {
     return organizeFromSections(parsed, model)
   }
-  return organizeFromScratch(parsed, model)
+  return organizeFromScratch(parsed, model, sourceUrl)
 }
 
 /**
@@ -2097,7 +2098,7 @@ async function organizeFromSections(parsed, model) {
   return { title: parsed.title || null, categories }
 }
 
-async function organizeFromScratch(parsed, model) {
+async function organizeFromScratch(parsed, model, sourceUrl) {
   const items = parsed.sections.flatMap((s) =>
     s.items.map((i) => ({
       section: s.title,
@@ -2110,6 +2111,7 @@ async function organizeFromScratch(parsed, model) {
   const { systemPrompt, userPrompt } = organizeFromScratchPrompt({
     siteTitle: parsed.title,
     items,
+    sourceUrl: sourceUrl ? sourceUrl.toString() : undefined,
   })
   const raw = await runJsonQuery({ systemPrompt, userPrompt, model })
 
@@ -2242,10 +2244,10 @@ function buildLlmsCandidates(sourceUrl) {
 }
 
 /**
- * Best-effort fetch of a site's /llms.txt. Returns { ok, status, error, parsed }
- * where parsed is { title, sections: [{ title, items: [{ text, url, description }] }] }.
+ * Fetch and parse a /llms.txt. Meant to reject text/html responses since SPAs
+ * commonly return their index for unmatched paths (TODO: check not in this patch).
  */
-async function fetchLlmsTxt(llmsUrl) {
+async function fetchLlmsTxt(llmsUrl, sourceUrl) {
   try {
     const res = await fetch(llmsUrl, {
       redirect: 'follow',
@@ -2253,50 +2255,112 @@ async function fetchLlmsTxt(llmsUrl) {
     })
     if (!res.ok) return { ok: false, status: res.status }
     const text = await res.text()
-    return { ok: true, status: res.status, parsed: parseLlmsTxt(text) }
+    return { ok: true, status: res.status, parsed: parseLlmsTxt(text, { sourceUrl }) }
   } catch (e) {
     return { ok: false, error: e.message }
   }
 }
 
 /**
- * Parse the llms.txt format. `##` headings become sections;
- * `- [text](url): description` bullets become items. Items before any `##`
- * land in an implicit "Resources" section.
+ * Parse the llms.txt format; links before any heading go to "Resources".
+ *
+ * Two link patterns are recognized:
+ *   - Line-start bullets (`- `, `* `, `+ ` — CommonMark allows all three).
+ *   - Inline `[text](url)` anywhere in prose. Some files (e.g. Shopify) keep
+ *     most refs in paragraphs rather than bullet lists, so the inline pass
+ *     is needed for usable coverage.
+ *
+ * Headings: `##`–`####`. Treating only `##` collapses deeply-nested files
+ * into a single mega-section.
+ *
+ * When `options.sourceUrl` is provided, items from other origins are dropped.
  */
-function parseLlmsTxt(body) {
+function parseLlmsTxt(body, options = {}) {
   const lines = body.split(/\r?\n/)
   let title = null
   const sections = []
   let current = null
+  let inCodeFence = false
+  const seenUrls = new Set()
+
+  const headingRe = /^#{2,4}\s+(.+)$/
+  const itemRe = /^\s*[-*+]\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/
+  const inlineLinkRe = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g
+
+  let originFilter = null
+  if (options.sourceUrl) {
+    try { originFilter = new URL(options.sourceUrl).origin } catch {}
+  }
 
-  const itemRe = /^\s*-\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/
+  const cleanUrl = (u) => u.replace(/[.,;]+$/, '')
+
+  // Peels paired markdown emphasis (`**x**`, `*x*`, `__x__`, `_x_`, and
+  // combined forms). `\1` backref requires the same marker run at both ends;
+  // a partially matched wrapper loses only that layer (`**x*` becomes `*x`).
+  const stripEmphasis = (text) => {
+    let t = text.trim()
+    for (let i = 0; i < 3; i++) {
+      const m = t.match(/^(\*{1,3}|_{1,3})([\s\S]+?)\1$/)
+      if (!m) break
+      t = m[2].trim()
+    }
+    return t
+  }
+
+  const passesOriginFilter = (url) => {
+    if (!originFilter) return true
+    try { return new URL(url).origin === originFilter } catch { return false }
+  }
+
+  const pushItem = ({ text, url, description }) => {
+    if (!passesOriginFilter(url)) return
+    if (seenUrls.has(url)) return
+    seenUrls.add(url)
+    if (!current) {
+      current = { title: 'Resources', items: [] }
+      sections.push(current)
+    }
+    current.items.push({
+      text: stripEmphasis(text),
+      url,
+      description: description || null,
+    })
+  }
 
   for (const line of lines) {
+    if (/^\s*```/.test(line)) {
+      inCodeFence = !inCodeFence
+      continue
+    }
+    if (inCodeFence) continue
+
     const h1 = line.match(/^#\s+(.+)$/)
     if (h1 && !title) {
-      title = h1[1].trim()
+      title = stripEmphasis(h1[1])
       continue
     }
 
-    const h2 = line.match(/^##\s+(.+)$/)
-    if (h2) {
-      current = { title: h2[1].trim(), items: [] }
+    const heading = line.match(headingRe)
+    if (heading) {
+      current = { title: stripEmphasis(heading[1]), items: [] }
       sections.push(current)
       continue
     }
 
-    const item = line.match(itemRe)
-    if (item) {
-      if (!current) {
-        current = { title: 'Resources', items: [] }
-        sections.push(current)
-      }
-      current.items.push({
-        text: item[1].trim(),
-        url: item[2].replace(/[.,;]+$/, ''),
-        description: item[3] ? item[3].trim() : null,
+    const bullet = line.match(itemRe)
+    if (bullet) {
+      pushItem({
+        text: bullet[1].trim(),
+        url: cleanUrl(bullet[2]),
+        description: bullet[3] ? bullet[3].trim() : null,
       })
+      // Bullet's first link already captured; further links on this line
+      // belong to its description, not as separate items.
+      continue
+    }
+
+    for (const m of line.matchAll(inlineLinkRe)) {
+      pushItem({ text: m[1].trim(), url: cleanUrl(m[2]), description: null })
     }
   }
 
diff --git a/src/prompts/index.js b/src/prompts/index.js
index 19fd234..4c67e2a 100644
--- a/src/prompts/index.js
+++ b/src/prompts/index.js
@@ -173,7 +173,7 @@ export const organizeFromScratchSystemPrompt = [
  *        Pre-flattened page list. The id used in the prompt is the array index.
  * @returns {string}
  */
-export function buildOrganizeFromScratchUserPrompt({ siteTitle, items }) {
+export function buildOrganizeFromScratchUserPrompt({ siteTitle, items, sourceUrl }) {
   const compactLines = items.map((it, idx) => {
     let relPath = it.url;
     try { relPath = new URL(it.url).pathname + new URL(it.url).search; } catch {}
@@ -181,7 +181,9 @@ export function buildOrganizeFromScratchUserPrompt({ siteTitle, items }) {
   });
 
   let origin = '(unknown)';
-  if (items.length > 0) {
+  if (sourceUrl) {
+    try { origin = new URL(sourceUrl).origin; } catch {}
+  } else if (items.length > 0) {
     try { origin = new URL(items[0].url).origin; } catch {}
   }
 

From 57f7834b48737fc2e2384fe7ed4e42c79242f66f Mon Sep 17 00:00:00 2001
From: Xavier Andueza <xavier@lyratechnologies.com.au>
Date: Fri, 15 May 2026 07:43:26 +1000
Subject: [PATCH 2/2] fix: ensure that the urls from llms.txt don't have
 anchors, query params etc

---
 src/commands/import.js | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/src/commands/import.js b/src/commands/import.js
index 1a954a3..3fbf9df 100644
--- a/src/commands/import.js
+++ b/src/commands/import.js
@@ -143,11 +143,9 @@ export async function importDocs(options) {
 
     const rawKnownUrls = llms.parsed.sections.flatMap((s) => s.items.map((i) => ({ title: i.text, url: i.url, description: i.description })))
 
-    // Dedupe llms.txt entries by pathname. Some sites (zod.dev, fumadocs) list
-    // every in-page anchor as its own llms.txt row (`/v4?id=wrapping-up`,
-    // `/v4?id=metadata`, …) even though they all live on one rendered page.
-    // We prefer the "cleanest" URL per path — the shortest one, which is
-    // usually the one without a query string or hash.
+    // Dedupe llms.txt entries by pathname. Some sites list every in-page anchor
+    // as its own llms.txt row (`/custom-data#anchor1`, `/custom-data#anchor2`),
+    // even though they all live on one rendered page.
     const byKnownPath = new Map()
     for (const p of rawKnownUrls) {
       const key = normalizePath(p.url)
@@ -159,6 +157,18 @@ export async function importDocs(options) {
     if (dropped > 0) {
       styles.info(`${styles.dim(`Collapsed ${dropped} anchor/query duplicates → ${knownUrls.length} unique pages.`)}`)
     }
+
+    const seenSectionPaths = new Set()
+    for (const section of llms.parsed.sections) {
+      section.items = section.items.filter((it) => {
+        const key = normalizePath(it.url)
+        const kept = byKnownPath.get(key)
+        if (!kept || it.url !== kept.url) return false
+        if (seenSectionPaths.has(key)) return false
+        seenSectionPaths.add(key)
+        return true
+      })
+    }
   } else if (sitemapKnownUrls.length > 0) {
     knownUrls = sitemapKnownUrls
   }
@@ -1730,6 +1740,7 @@ function slotOrphansByPath(scraped, knownPages) {
       matched.add(norm)
     } else {
       orphans.push(p)
+      matched.add(norm)
     }
   }
   return orphans
@@ -2053,18 +2064,27 @@ function normalizePath(url) {
  * Returns the orphans Claude couldn't slot.
  */
 async function slotOrphansWithClaude(scraped, orphans, model) {
+  const seenPaths = new Set(scraped.categories.flatMap((c) => c.pages.map((p) => normalizePath(p.url))))
+  const dedupedOrphans = []
+  for (const p of orphans) {
+    const norm = normalizePath(p.url)
+    if (seenPaths.has(norm)) continue
+    seenPaths.add(norm)
+    dedupedOrphans.push(p)
+  }
+
   const { systemPrompt, userPrompt } = slotOrphansPrompt({
     categories: scraped.categories,
-    orphans,
+    orphans: dedupedOrphans,
   })
   const raw = await runJsonQuery({ systemPrompt, userPrompt, model })
   if (!Array.isArray(raw)) {
     // If the model didn't cooperate, return all orphans unassigned.
-    return orphans
+    return dedupedOrphans
   }
 
   const leftover = []
-  orphans.forEach((p, i) => {
+  dedupedOrphans.forEach((p, i) => {
     const idx = Number.isInteger(raw[i]) ? raw[i] : -1
     if (idx >= 0 && idx < scraped.categories.length) {
       scraped.categories[idx].pages.push({