Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 121 additions & 37 deletions src/commands/import.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,10 @@ export async function importDocs(options) {

const { llms, llmsUrl } = await timePhase('fetch llms.txt', async () => {
for (const candidate of llmsCandidates) {
const res = await fetchLlmsTxt(candidate)
const res = await fetchLlmsTxt(candidate, sourceUrl)
if (res.ok) return { llms: res, llmsUrl: candidate }
styles.info(styles.dim(` ${candidate} → ${res.status ? `HTTP ${res.status}` : res.error || 'failed'}`))
const reason = res.error || (res.status ? `HTTP ${res.status}` : 'failed')
styles.info(styles.dim(` ${candidate} → ${reason}`))
}
return { llms: null, llmsUrl: null }
})
Expand Down Expand Up @@ -142,11 +143,9 @@ export async function importDocs(options) {

const rawKnownUrls = llms.parsed.sections.flatMap((s) => s.items.map((i) => ({ title: i.text, url: i.url, description: i.description })))

// Dedupe llms.txt entries by pathname. Some sites (zod.dev, fumadocs) list
// every in-page anchor as its own llms.txt row (`/v4?id=wrapping-up`,
// `/v4?id=metadata`, …) even though they all live on one rendered page.
// We prefer the "cleanest" URL per path — the shortest one, which is
// usually the one without a query string or hash.
// Dedupe llms.txt entries by pathname. Some sites list every in-page anchor
// as its own llms.txt row (`/custom-data#anchor1`, `/custom-data#anchor2`, …)
// even though they all live on one rendered page.
const byKnownPath = new Map()
for (const p of rawKnownUrls) {
const key = normalizePath(p.url)
Expand All @@ -158,6 +157,18 @@ export async function importDocs(options) {
if (dropped > 0) {
styles.info(`${styles.dim(`Collapsed ${dropped} anchor/query duplicates → ${knownUrls.length} unique pages.`)}`)
}

const seenSectionPaths = new Set()
for (const section of llms.parsed.sections) {
section.items = section.items.filter((it) => {
const key = normalizePath(it.url)
const kept = byKnownPath.get(key)
if (!kept || it.url !== kept.url) return false
if (seenSectionPaths.has(key)) return false
seenSectionPaths.add(key)
return true
})
}
} else if (sitemapKnownUrls.length > 0) {
knownUrls = sitemapKnownUrls
}
Expand Down Expand Up @@ -316,7 +327,7 @@ export async function importDocs(options) {
} else if (llms) {
const fastPath = sectionsLookUsable(llms.parsed.sections)
styles.info(`Organizing with Claude (${styles.bold(options.model)}, ${fastPath ? 'fast path: icons only' : 'full reorg'})...`)
organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model))
organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model, sourceUrl))
} else {
// Sitemap-only fallback: synthesize categories by clustering the URL
// paths. clusterByUrlPath returns null when the URLs don't split cleanly,
Expand Down Expand Up @@ -1729,6 +1740,7 @@ function slotOrphansByPath(scraped, knownPages) {
matched.add(norm)
} else {
orphans.push(p)
matched.add(norm)
}
}
return orphans
Expand Down Expand Up @@ -2052,18 +2064,27 @@ function normalizePath(url) {
* Returns the orphans Claude couldn't slot.
*/
async function slotOrphansWithClaude(scraped, orphans, model) {
const seenPaths = new Set(scraped.categories.flatMap((c) => c.pages.map((p) => normalizePath(p.url))))
const dedupedOrphans = []
for (const p of orphans) {
const norm = normalizePath(p.url)
if (seenPaths.has(norm)) continue
seenPaths.add(norm)
dedupedOrphans.push(p)
}

const { systemPrompt, userPrompt } = slotOrphansPrompt({
categories: scraped.categories,
orphans,
orphans: dedupedOrphans,
})
const raw = await runJsonQuery({ systemPrompt, userPrompt, model })
if (!Array.isArray(raw)) {
// If the model didn't cooperate, return all orphans unassigned.
return orphans
return dedupedOrphans
}

const leftover = []
orphans.forEach((p, i) => {
dedupedOrphans.forEach((p, i) => {
const idx = Number.isInteger(raw[i]) ? raw[i] : -1
if (idx >= 0 && idx < scraped.categories.length) {
scraped.categories[idx].pages.push({
Expand Down Expand Up @@ -2126,11 +2147,11 @@ function sectionsLookUsable(sections) {
return usableSections(sections).length >= 3
}

/**
 * Choose how a parsed llms.txt gets organized into categories.
 *
 * When `sectionsLookUsable` accepts the file's own sections, they are handed
 * to `organizeFromSections`; otherwise everything is passed to
 * `organizeFromScratch`.
 *
 * @param {{ title?: string, sections: Array<object> }} parsed - Parsed llms.txt.
 * @param {string} model - Model identifier forwarded to the chosen organizer.
 * @param {string|URL} [sourceUrl] - Site URL; only forwarded on the
 *   from-scratch path. Optional, so two-argument callers keep working.
 * @returns {Promise<object>} Whatever the chosen organizer resolves to.
 */
async function organizeWithClaude(parsed, model, sourceUrl) {
  if (sectionsLookUsable(parsed.sections)) {
    return organizeFromSections(parsed, model)
  }
  return organizeFromScratch(parsed, model, sourceUrl)
}

/**
Expand Down Expand Up @@ -2169,7 +2190,7 @@ async function organizeFromSections(parsed, model) {
return { title: parsed.title || null, categories }
}

async function organizeFromScratch(parsed, model) {
async function organizeFromScratch(parsed, model, sourceUrl) {
const items = parsed.sections.flatMap((s) =>
s.items.map((i) => ({
section: s.title,
Expand All @@ -2182,6 +2203,7 @@ async function organizeFromScratch(parsed, model) {
const { systemPrompt, userPrompt } = organizeFromScratchPrompt({
siteTitle: parsed.title,
items,
sourceUrl: sourceUrl ? sourceUrl.toString() : undefined,
})
const raw = await runJsonQuery({ systemPrompt, userPrompt, model })

Expand Down Expand Up @@ -2314,61 +2336,123 @@ function buildLlmsCandidates(sourceUrl) {
}

/**
 * Best-effort fetch and parse of a site's /llms.txt.
 *
 * Never throws: HTTP failures, rejected bodies, and thrown errors are all
 * reported through the returned object.
 *
 * Rejects text/html responses since SPAs commonly return their index page
 * (with a 200) for unmatched paths.
 *
 * @param {string|URL} llmsUrl - Candidate llms.txt URL to request.
 * @param {string|URL} [sourceUrl] - Forwarded to `parseLlmsTxt` as
 *   `options.sourceUrl`.
 * @returns {Promise<{ ok: boolean, status?: number, error?: string, parsed?: object }>}
 *   `parsed` is present only when `ok` is true. `error` is set for rejected
 *   HTML bodies (alongside `status`) and for thrown failures.
 */
async function fetchLlmsTxt(llmsUrl, sourceUrl) {
  try {
    const res = await fetch(llmsUrl, {
      redirect: 'follow',
      headers: { 'User-Agent': 'readme-cli-import' },
    })
    if (!res.ok) return { ok: false, status: res.status }

    // A 2xx HTML page is an SPA fallback, not an llms.txt. Return an explicit
    // error so the caller can show a reason more useful than "HTTP 200".
    const contentType = res.headers.get('content-type') || ''
    if (/^\s*text\/html\b/i.test(contentType)) {
      return { ok: false, status: res.status, error: 'not an llms.txt (text/html response)' }
    }

    const text = await res.text()
    return { ok: true, status: res.status, parsed: parseLlmsTxt(text, { sourceUrl }) }
  } catch (e) {
    // Non-Error throws have no `.message`; stringify them instead of
    // reporting `undefined`.
    return { ok: false, error: e instanceof Error ? e.message : String(e) }
  }
}

/**
* Parse the llms.txt format. `##` headings become sections;
* `- [text](url): description` bullets become items. Items before any `##`
* land in an implicit "Resources" section.
* Parse the llms.txt format.
*
* Two link patterns are recognized:
* - Line-start bullets (`- `, `* `, `+ ` — CommonMark allows all three).
* - Inline `[text](url)` anywhere in prose. Some files (e.g. Shopify) keep
* most refs in paragraphs rather than bullet lists, so the inline pass
* is needed for usable coverage.
*
* Headings: `##`–`####`. Treating only `##` collapses deeply-nested files
* into a single mega-section.
*
* When `options.sourceUrl` is provided, all items must share its origin.
*/
function parseLlmsTxt(body) {
function parseLlmsTxt(body, options = {}) {
const lines = body.split(/\r?\n/)
let title = null
const sections = []
let current = null
let inCodeFence = false
const seenUrls = new Set()

const headingRe = /^#{2,4}\s+(.+)$/
const itemRe = /^\s*[-*+]\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/
const inlineLinkRe = /\[([^\]]+)\]\((https?:\/\/[^)\s]+)\)/g

const itemRe = /^\s*-\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/
let originFilter = null
if (options.sourceUrl) {
try { originFilter = new URL(options.sourceUrl).origin } catch {}
}

const cleanUrl = (u) => u.replace(/[.,;]+$/, '')

// Peels paired markdown emphasis (`**x**`, `*x*`, `__x__`, `_x_`, and
// combined forms). `\1` backref forces matched wrappers; asymmetric
// input is left untouched.
const stripEmphasis = (text) => {
let t = text.trim()
for (let i = 0; i < 3; i++) {
const m = t.match(/^(\*{1,3}|_{1,3})([\s\S]+?)\1$/)
if (!m) break
t = m[2].trim()
}
return t
}

const passesOriginFilter = (url) => {
if (!originFilter) return true
try { return new URL(url).origin === originFilter } catch { return false }
}

const pushItem = ({ text, url, description }) => {
if (!passesOriginFilter(url)) return
if (seenUrls.has(url)) return
seenUrls.add(url)
if (!current) {
current = { title: 'Resources', items: [] }
sections.push(current)
}
current.items.push({
text: stripEmphasis(text),
url,
description: description || null,
})
}

for (const line of lines) {
if (/^\s*```/.test(line)) {
inCodeFence = !inCodeFence
continue
}
if (inCodeFence) continue

const h1 = line.match(/^#\s+(.+)$/)
if (h1 && !title) {
title = h1[1].trim()
title = stripEmphasis(h1[1])
continue
}

const h2 = line.match(/^##\s+(.+)$/)
if (h2) {
current = { title: h2[1].trim(), items: [] }
const heading = line.match(headingRe)
if (heading) {
current = { title: stripEmphasis(heading[1]), items: [] }
sections.push(current)
continue
}

const item = line.match(itemRe)
if (item) {
if (!current) {
current = { title: 'Resources', items: [] }
sections.push(current)
}
current.items.push({
text: item[1].trim(),
url: item[2].replace(/[.,;]+$/, ''),
description: item[3] ? item[3].trim() : null,
const bullet = line.match(itemRe)
if (bullet) {
pushItem({
text: bullet[1].trim(),
url: cleanUrl(bullet[2]),
description: bullet[3] ? bullet[3].trim() : null,
})
// Bullet's first link already captured; further links on this line
// belong to its description, not as separate items.
continue
}

for (const m of line.matchAll(inlineLinkRe)) {
pushItem({ text: m[1].trim(), url: cleanUrl(m[2]), description: null })
}
}

Expand Down
6 changes: 4 additions & 2 deletions src/prompts/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -173,15 +173,17 @@ export const organizeFromScratchSystemPrompt = [
* Pre-flattened page list. The id used in the prompt is the array index.
* @returns {string}
*/
export function buildOrganizeFromScratchUserPrompt({ siteTitle, items }) {
export function buildOrganizeFromScratchUserPrompt({ siteTitle, items, sourceUrl }) {
const compactLines = items.map((it, idx) => {
let relPath = it.url;
try { relPath = new URL(it.url).pathname + new URL(it.url).search; } catch {}
return `${idx}\t${it.title}\t${relPath}`;
});

let origin = '(unknown)';
if (items.length > 0) {
if (sourceUrl) {
try { origin = new URL(sourceUrl).origin; } catch {}
} else if (items.length > 0) {
try { origin = new URL(items[0].url).origin; } catch {}
}

Expand Down