diff --git a/src/helpers/get-page-urls.ts b/src/helpers/get-page-urls.ts index 90a86bc..2c2a137 100644 --- a/src/helpers/get-page-urls.ts +++ b/src/helpers/get-page-urls.ts @@ -238,9 +238,13 @@ async function discoverSitemapUrls(ctx: CheckContext, originOverride?: string): return candidates; } +export type DiscoverySource = 'llms-txt' | 'sitemap' | 'fallback'; + export interface PageUrlResult { urls: string[]; warnings: string[]; + /** Which discovery methods contributed to the final URL set. */ + sources: DiscoverySource[]; } function isGzipped(url: string): boolean { @@ -740,19 +744,40 @@ export function filterByPathPrefix(urls: string[], baseUrl: string): string[] { return urls.filter((url) => matchesPathPrefix(url, baseUrlPath)); } +/** + * Merge two URL arrays, preserving order. Primary URLs come first; + * secondary URLs are appended only if not already present. + */ +function mergeUrlSets(primary: string[], secondary: string[]): string[] { + const seen = new Set(primary); + const merged = [...primary]; + for (const url of secondary) { + if (!seen.has(url)) { + merged.push(url); + seen.add(url); + } + } + return merged; +} + /** * Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl. * * Priority: - * 1. llms.txt links (from previous check results) + * 1. llms.txt links (from previous check results or direct fetch) * 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback) * 3. baseUrl fallback * + * When llms.txt produces URLs but fewer than `maxLinksToTest`, sitemap + * URLs are merged in (deduped) so the sample covers a broader surface. + * The `sources` field records which discovery methods contributed. + * * All discovered URLs are filtered to the baseUrl's path prefix so that * docs at a subpath (e.g. `/docs`) don't include unrelated site content. 
*/ export async function getPageUrls(ctx: CheckContext): Promise { const warnings: string[] = []; + const sources: DiscoverySource[] = []; const locale = ctx.options.preferredLocale ?? extractLocaleFromUrl(ctx.baseUrl); const version = ctx.options.preferredVersion ?? extractVersionFromUrl(ctx.baseUrl); @@ -770,22 +795,43 @@ export async function getPageUrls(ctx: CheckContext): Promise { // 1. Try llms.txt links from cached results (if llms-txt-exists ran) const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx); - const scopedCachedUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase)); - if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings }; + let llmsTxtUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase)); // 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run) - if (!ctx.previousResults.has('llms-txt-exists')) { + if (llmsTxtUrls.length === 0 && !ctx.previousResults.has('llms-txt-exists')) { const fetchedUrls = await fetchLlmsTxtUrls(ctx); - const scopedFetchedUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase)); - if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings }; + llmsTxtUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase)); + } + + if (llmsTxtUrls.length > 0) { + sources.push('llms-txt'); + + // If llms.txt meets the requested sample size, no need for sitemap + if (llmsTxtUrls.length >= ctx.options.maxLinksToTest) { + return { urls: llmsTxtUrls, warnings, sources }; + } + + // llms.txt is thin — try sitemap to fill the gap + const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase }); + if (sitemapUrls.length > 0) { + sources.push('sitemap'); + return { urls: mergeUrlSets(llmsTxtUrls, sitemapUrls), warnings, sources }; + } + + // Sitemap had nothing; return llms.txt URLs alone + return { urls: llmsTxtUrls, warnings, sources }; } // 3. 
Try sitemap (path, locale, and version filtering applied inside) const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase }); - if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings }; + if (sitemapUrls.length > 0) { + sources.push('sitemap'); + return { urls: sitemapUrls, warnings, sources }; + } // 4. Fallback - return { urls: [ctx.baseUrl], warnings }; + sources.push('fallback'); + return { urls: [ctx.baseUrl], warnings, sources }; } export interface SampledPages { @@ -795,6 +841,8 @@ export interface SampledPages { warnings: string[]; /** When curated pages have tags, maps page URL to tag label. */ urlTags?: Record; + /** Which discovery methods contributed to the page URL set. */ + sources?: DiscoverySource[]; } /** @@ -888,6 +936,12 @@ export async function discoverAndSamplePages(ctx: CheckContext): Promise; + /** Which discovery methods contributed to the page URL set. */ + discoverySources?: DiscoverySource[]; } export interface AgentDocsConfig { diff --git a/test/helpers/mock-sitemap-not-found.ts b/test/helpers/mock-sitemap-not-found.ts new file mode 100644 index 0000000..2b30fda --- /dev/null +++ b/test/helpers/mock-sitemap-not-found.ts @@ -0,0 +1,32 @@ +import { http, HttpResponse } from 'msw'; +import type { SetupServerApi } from 'msw/node'; + +/** + * Register MSW handlers that return 404 for robots.txt and sitemap.xml + * at the given base URL's origin (and subpath if present). + * + * Call this after creating a test context with llms.txt content so that + * the discovery fallback (thin llms.txt → try sitemap) fails fast + * instead of timing out on unmocked test domains. 
+ */ +export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): void { + const parsed = new URL(baseUrl); + const handlers = [ + http.get(`${parsed.origin}/robots.txt`, () => new HttpResponse('', { status: 404 })), + http.get(`${parsed.origin}/sitemap.xml`, () => new HttpResponse('', { status: 404 })), + ]; + const subpath = parsed.pathname.replace(/\/$/, ''); + if (subpath && subpath !== '') { + handlers.push( + http.get( + `${parsed.origin}${subpath}/sitemap.xml`, + () => new HttpResponse('', { status: 404 }), + ), + http.get( + `${parsed.origin}${subpath}/sitemap-index.xml`, + () => new HttpResponse('', { status: 404 }), + ), + ); + } + server.use(...handlers); +} diff --git a/test/integration/check-pipeline.test.ts b/test/integration/check-pipeline.test.ts index 27b6869..12cbb01 100644 --- a/test/integration/check-pipeline.test.ts +++ b/test/integration/check-pipeline.test.ts @@ -11,6 +11,7 @@ import { http, HttpResponse } from 'msw'; import { setupServer } from 'msw/node'; import { runChecks } from '../../src/runner.js'; import '../../src/checks/index.js'; +import { mockSitemapNotFound } from '../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -51,10 +52,16 @@ function setupSite( http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })), ); + // Sitemap discovery: return 404 so the fallback doesn't time out + handlers.push( + http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })), + http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })), + ); + const defaultCacheHeaders = opts.cacheControl ? { 'Cache-Control': opts.cacheControl } : {}; for (const page of opts.pages) { - // HTML version + // HTML version (GET and HEAD) handlers.push( http.get(`http://${host}${page.path}`, ({ request }) => { const accept = request.headers.get('accept') ?? 
''; @@ -72,9 +79,11 @@ function setupSite( { status: 200, headers: { 'Content-Type': 'text/html', ...defaultCacheHeaders } }, ); }), + // HEAD handler for llms-txt-links-resolve — mirrors GET status + http.head(`http://${host}${page.path}`, () => new HttpResponse(null, { status: 200 })), ); - // .md URL + // .md URL (both /page.md and /page/index.md candidates from toMdUrls) if (page.md) { handlers.push( http.get( @@ -85,10 +94,22 @@ function setupSite( headers: { 'Content-Type': 'text/markdown', ...defaultCacheHeaders }, }), ), + http.get( + `http://${host}${page.path}/index.md`, + () => + new HttpResponse(page.md!, { + status: 200, + headers: { 'Content-Type': 'text/markdown', ...defaultCacheHeaders }, + }), + ), ); } else { handlers.push( http.get(`http://${host}${page.path}.md`, () => new HttpResponse(null, { status: 404 })), + http.get( + `http://${host}${page.path}/index.md`, + () => new HttpResponse(null, { status: 404 }), + ), ); } @@ -566,11 +587,16 @@ describe('check pipeline: HTML fetch cache shared across checks', () => { 'http://pipe-htmlcache.local/docs/guide.md', () => new HttpResponse(null, { status: 404 }), ), + http.get( + 'http://pipe-htmlcache.local/docs/guide/index.md', + () => new HttpResponse(null, { status: 404 }), + ), http.get( 'http://pipe-htmlcache.local/docs/guide-afdocs-nonexistent-8f3a', () => new HttpResponse('Not Found', { status: 404 }), ), ); + mockSitemapNotFound(server, 'http://pipe-htmlcache.local'); const report = await runChecks('http://pipe-htmlcache.local', { checkIds: ['llms-txt-exists', 'page-size-html', 'tabbed-content-serialization'], @@ -655,11 +681,16 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () = 'http://pipe-auth-llms.local/docs/page.md', () => new HttpResponse(null, { status: 404 }), ), + http.get( + 'http://pipe-auth-llms.local/docs/page/index.md', + () => new HttpResponse(null, { status: 404 }), + ), http.get( 'http://pipe-auth-llms.local/docs/page-afdocs-nonexistent-8f3a', () 
=> new HttpResponse('Not Found', { status: 404 }), ), ); + mockSitemapNotFound(server, 'http://pipe-auth-llms.local'); const report = await runChecks('http://pipe-auth-llms.local', { checkIds: ['llms-txt-exists', 'auth-gate-detection', 'auth-alternative-access'], @@ -733,6 +764,14 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () = 'http://pipe-auth-md.local/docs/private.md', () => new HttpResponse(null, { status: 403 }), ), + http.get( + 'http://pipe-auth-md.local/docs/public/index.md', + () => new HttpResponse(null, { status: 404 }), + ), + http.get( + 'http://pipe-auth-md.local/docs/private/index.md', + () => new HttpResponse(null, { status: 404 }), + ), http.get( 'http://pipe-auth-md.local/docs/public-afdocs-nonexistent-8f3a', () => new HttpResponse('Not Found', { status: 404 }), @@ -742,6 +781,7 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () = () => new HttpResponse('Not Found', { status: 404 }), ), ); + mockSitemapNotFound(server, 'http://pipe-auth-md.local'); const report = await runChecks('http://pipe-auth-md.local', { checkIds: [ @@ -802,12 +842,18 @@ describe('check pipeline: rendering-strategy → tabbed-content-serialization', 'http://pipe-spa-tab.local/docs/guide.md', () => new HttpResponse(null, { status: 404 }), ), + http.get( + 'http://pipe-spa-tab.local/docs/guide/index.md', + () => new HttpResponse(null, { status: 404 }), + ), http.get( 'http://pipe-spa-tab.local/docs/guide-afdocs-nonexistent-8f3a', () => new HttpResponse('Not Found', { status: 404 }), ), ); + mockSitemapNotFound(server, 'http://pipe-spa-tab.local'); + const report = await runChecks('http://pipe-spa-tab.local', { checkIds: ['llms-txt-exists', 'rendering-strategy', 'tabbed-content-serialization'], requestDelay: 0, @@ -866,12 +912,18 @@ describe('check pipeline: tabbed-content-serialization → section-header-qualit 'http://pipe-tab-hdr.local/docs/install.md', () => new HttpResponse(null, { status: 404 }), ), + http.get( + 
'http://pipe-tab-hdr.local/docs/install/index.md', + () => new HttpResponse(null, { status: 404 }), + ), http.get( 'http://pipe-tab-hdr.local/docs/install-afdocs-nonexistent-8f3a', () => new HttpResponse('Not Found', { status: 404 }), ), ); + mockSitemapNotFound(server, 'http://pipe-tab-hdr.local'); + const report = await runChecks('http://pipe-tab-hdr.local', { checkIds: ['llms-txt-exists', 'tabbed-content-serialization', 'section-header-quality'], requestDelay: 0, @@ -921,6 +973,10 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow' headers: { 'Content-Type': 'text/html' }, }), ), + http.head( + 'http://pipe-llms-md.local/docs/guide', + () => new HttpResponse(null, { status: 200 }), + ), // .md URL returns markdown http.get( 'http://pipe-llms-md.local/docs/guide.md', @@ -930,8 +986,18 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow' headers: { 'Content-Type': 'text/markdown' }, }), ), + http.head( + 'http://pipe-llms-md.local/docs/guide.md', + () => new HttpResponse(null, { status: 200, headers: { 'Content-Type': 'text/markdown' } }), + ), + http.get( + 'http://pipe-llms-md.local/docs/guide/index.md', + () => new HttpResponse(null, { status: 404 }), + ), ); + mockSitemapNotFound(server, 'http://pipe-llms-md.local'); + const report = await runChecks('http://pipe-llms-md.local', { checkIds: ['llms-txt-exists', 'llms-txt-links-markdown'], requestDelay: 0, @@ -958,6 +1024,8 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow' ), ); + mockSitemapNotFound(server, 'http://pipe-llms-md-nollms.local'); + const report = await runChecks('http://pipe-llms-md-nollms.local', { checkIds: ['llms-txt-exists', 'llms-txt-links-markdown'], requestDelay: 0, diff --git a/test/unit/checks/auth-gate-detection.test.ts b/test/unit/checks/auth-gate-detection.test.ts index 8a81eda..92b5f08 100644 --- a/test/unit/checks/auth-gate-detection.test.ts +++ b/test/unit/checks/auth-gate-detection.test.ts @@ 
-5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('auth-gate-detection', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', diff --git a/test/unit/checks/cache-header-hygiene.test.ts b/test/unit/checks/cache-header-hygiene.test.ts index 5a0243c..122608b 100644 --- a/test/unit/checks/cache-header-hygiene.test.ts +++ b/test/unit/checks/cache-header-hygiene.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('cache-header-hygiene', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, `http://${host}`); return ctx; } @@ -309,6 +311,7 @@ describe('cache-header-hygiene', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://chh-llms.local'); const result = await check.run(ctx); expect(result.status).toBe('pass'); diff --git a/test/unit/checks/content-negotiation.test.ts b/test/unit/checks/content-negotiation.test.ts index cfb201a..93d75fd 100644 --- a/test/unit/checks/content-negotiation.test.ts +++ b/test/unit/checks/content-negotiation.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import 
'../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -30,6 +31,7 @@ describe('content-negotiation', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', @@ -190,6 +192,7 @@ describe('content-negotiation', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.details?.totalPages).toBe(5); @@ -260,6 +263,7 @@ describe('content-negotiation', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.details?.sampled).toBe(true); diff --git a/test/unit/checks/content-start-position.test.ts b/test/unit/checks/content-start-position.test.ts index a96e3bb..c98f7fc 100644 --- a/test/unit/checks/content-start-position.test.ts +++ b/test/unit/checks/content-start-position.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -30,6 +31,7 @@ describe('content-start-position', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', @@ -391,6 +393,7 @@ describe('content-start-position', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const 
result = await check.run(ctx); expect(result.details?.totalPages).toBe(5); diff --git a/test/unit/checks/http-status-codes.test.ts b/test/unit/checks/http-status-codes.test.ts index ebcee6f..4307328 100644 --- a/test/unit/checks/http-status-codes.test.ts +++ b/test/unit/checks/http-status-codes.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('http-status-codes', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', diff --git a/test/unit/checks/llms-txt-directive.test.ts b/test/unit/checks/llms-txt-directive.test.ts index 3f443b9..e3aa584 100644 --- a/test/unit/checks/llms-txt-directive.test.ts +++ b/test/unit/checks/llms-txt-directive.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('llms-txt-directive', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', diff --git a/test/unit/checks/llms-txt-links-markdown.test.ts b/test/unit/checks/llms-txt-links-markdown.test.ts index 19ed51d..f600e16 100644 --- a/test/unit/checks/llms-txt-links-markdown.test.ts +++ b/test/unit/checks/llms-txt-links-markdown.test.ts @@ -79,6 +79,7 @@ Just 
text, no links here. }), ), http.head('http://test.local/page1.md', () => new HttpResponse(null, { status: 404 })), + http.head('http://test.local/page1/index.md', () => new HttpResponse(null, { status: 404 })), http.head( 'http://test.local/page2', () => @@ -88,6 +89,7 @@ Just text, no links here. }), ), http.head('http://test.local/page2.md', () => new HttpResponse(null, { status: 404 })), + http.head('http://test.local/page2/index.md', () => new HttpResponse(null, { status: 404 })), ); const content = `# Test diff --git a/test/unit/checks/markdown-code-fence-validity.test.ts b/test/unit/checks/markdown-code-fence-validity.test.ts index 52b5446..d0cefaa 100644 --- a/test/unit/checks/markdown-code-fence-validity.test.ts +++ b/test/unit/checks/markdown-code-fence-validity.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -53,6 +54,9 @@ describe('markdown-code-fence-validity', () => { message: llmsTxtFiles ? 'Found' : 'Not found', details: { discoveredFiles: llmsTxtFiles ?? 
[] }, }); + if (llmsTxtFiles) { + mockSitemapNotFound(server, 'http://test.local'); + } return ctx; } @@ -265,7 +269,12 @@ describe('markdown-code-fence-validity', () => { headers: { 'Content-Type': 'text/markdown' }, }), ), + http.get( + 'http://mcfv-standalone.local/docs/page/index.md', + () => new HttpResponse(null, { status: 404 }), + ), ); + mockSitemapNotFound(server, 'http://mcfv-standalone.local'); // No dependency results set — standalone mode const ctx = createContext('http://mcfv-standalone.local', { requestDelay: 0 }); diff --git a/test/unit/checks/markdown-url-support.test.ts b/test/unit/checks/markdown-url-support.test.ts index 15ed19a..2351aba 100644 --- a/test/unit/checks/markdown-url-support.test.ts +++ b/test/unit/checks/markdown-url-support.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('markdown-url-support', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', @@ -231,6 +233,7 @@ describe('markdown-url-support', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.details?.totalPages).toBe(5); @@ -312,6 +315,7 @@ describe('markdown-url-support', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.details?.sampled).toBe(true); diff --git a/test/unit/checks/page-size-html.test.ts b/test/unit/checks/page-size-html.test.ts index 
62c6d9e..58dea4d 100644 --- a/test/unit/checks/page-size-html.test.ts +++ b/test/unit/checks/page-size-html.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -30,6 +31,7 @@ describe('page-size-html', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', @@ -95,6 +97,7 @@ describe('page-size-html', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.status).toBe('fail'); @@ -163,6 +166,7 @@ describe('page-size-html', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.details?.totalPages).toBe(5); @@ -247,6 +251,7 @@ describe('page-size-html', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); const result = await check.run(ctx); expect(result.status).toBe('warn'); diff --git a/test/unit/checks/page-size-markdown.test.ts b/test/unit/checks/page-size-markdown.test.ts index 165e205..211aa37 100644 --- a/test/unit/checks/page-size-markdown.test.ts +++ b/test/unit/checks/page-size-markdown.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server 
= setupServer(); @@ -30,6 +31,7 @@ describe('page-size-markdown', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } return ctx; @@ -163,7 +165,7 @@ describe('page-size-markdown', () => { }), ), http.get( - 'http://ps-md-standalone.local.md', + 'http://ps-md-standalone.local/.md', () => new HttpResponse('Not found', { status: 404 }), ), http.get( @@ -196,7 +198,7 @@ describe('page-size-markdown', () => { headers: { 'Content-Type': 'text/html' }, }), ), - http.get('http://ps-md-nomd.local.md', () => new HttpResponse('Not found', { status: 404 })), + http.get('http://ps-md-nomd.local/.md', () => new HttpResponse('Not found', { status: 404 })), http.get( 'http://ps-md-nomd.local/index.md', () => new HttpResponse('Not found', { status: 404 }), diff --git a/test/unit/checks/redirect-behavior.test.ts b/test/unit/checks/redirect-behavior.test.ts index 8eebf50..fb02bc1 100644 --- a/test/unit/checks/redirect-behavior.test.ts +++ b/test/unit/checks/redirect-behavior.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -35,6 +36,7 @@ describe('redirect-behavior', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', diff --git a/test/unit/checks/rendering-strategy.test.ts b/test/unit/checks/rendering-strategy.test.ts index 75dda68..5239a47 100644 --- a/test/unit/checks/rendering-strategy.test.ts +++ b/test/unit/checks/rendering-strategy.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from 
'../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -28,6 +29,7 @@ describe('rendering-strategy', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, `http://${domain}`); return ctx; } diff --git a/test/unit/checks/tabbed-content-serialization.test.ts b/test/unit/checks/tabbed-content-serialization.test.ts index 68d43a1..c34a3c0 100644 --- a/test/unit/checks/tabbed-content-serialization.test.ts +++ b/test/unit/checks/tabbed-content-serialization.test.ts @@ -5,6 +5,7 @@ import { createContext } from '../../../src/runner.js'; import { getCheck } from '../../../src/checks/registry.js'; import '../../../src/checks/index.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -30,6 +31,7 @@ describe('tabbed-content-serialization', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + mockSitemapNotFound(server, 'http://test.local'); } else { ctx.previousResults.set('llms-txt-exists', { id: 'llms-txt-exists', diff --git a/test/unit/helpers/get-page-urls.test.ts b/test/unit/helpers/get-page-urls.test.ts index 8bc6e12..6843453 100644 --- a/test/unit/helpers/get-page-urls.test.ts +++ b/test/unit/helpers/get-page-urls.test.ts @@ -17,6 +17,7 @@ import { import { MAX_SITEMAP_URLS } from '../../../src/constants.js'; import { createContext } from '../../../src/runner.js'; import type { DiscoveredFile } from '../../../src/types.js'; +import { mockSitemapNotFound } from '../../helpers/mock-sitemap-not-found.js'; const server = setupServer(); @@ -582,6 +583,8 @@ describe('getPageUrls', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + + mockSitemapNotFound(server, baseUrl); } 
else { // Mark llms-txt-exists as having run (but failed) so getPageUrls // skips the direct llms.txt fetch and falls through to sitemap. @@ -597,13 +600,14 @@ describe('getPageUrls', () => { return ctx; } - it('returns llms.txt links when available (no sitemap fetch)', async () => { + it('returns llms.txt links when available', async () => { const content = `# Docs\n> Summary\n## Links\n- [Page](http://test.local/docs/page): A page\n`; const ctx = makeCtx('http://test.local', content); const result = await getPageUrls(ctx); expect(result.urls).toEqual(['http://test.local/docs/page']); expect(result.warnings).toEqual([]); + expect(result.sources).toContain('llms-txt'); }); it('fetches and parses sitemap.xml when no llms.txt links', async () => { @@ -632,6 +636,7 @@ describe('getPageUrls', () => { 'http://sitemap-test.local/docs/intro', 'http://sitemap-test.local/docs/guide', ]); + expect(result.sources).toEqual(['sitemap']); }); it('handles sitemap index files (follows sub-sitemaps)', async () => { @@ -708,6 +713,7 @@ describe('getPageUrls', () => { const ctx = makeCtx('http://empty-test.local'); const result = await getPageUrls(ctx); expect(result.urls).toEqual(['http://empty-test.local']); + expect(result.sources).toEqual(['fallback']); }); it('handles malformed sitemap XML gracefully', async () => { @@ -1032,6 +1038,130 @@ describe('getPageUrls', () => { expect(result.warnings[0]).toContain('sitemap-docs.xml.gz'); }); + // ── Discovery source fallback: merge llms.txt + sitemap (#27) ── + + it('falls back to sitemap when llms.txt has fewer URLs than maxLinksToTest', async () => { + const content = `# Docs\n## Links\n- [A](http://merge-test.local/docs/a): Page A\n- [B](http://merge-test.local/docs/b): Page B\n`; + const ctx = makeCtx('http://merge-test.local', content); + ctx.options.maxLinksToTest = 10; + + // Register sitemap AFTER makeCtx so it takes precedence over the default 404 handlers + server.use( + http.get( + 'http://merge-test.local/sitemap.xml', + () 
=> + new HttpResponse( + ` + + http://merge-test.local/docs/c + http://merge-test.local/docs/d + http://merge-test.local/docs/e +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const result = await getPageUrls(ctx); + // llms.txt URLs come first, then sitemap fills the gap + expect(result.urls).toEqual([ + 'http://merge-test.local/docs/a', + 'http://merge-test.local/docs/b', + 'http://merge-test.local/docs/c', + 'http://merge-test.local/docs/d', + 'http://merge-test.local/docs/e', + ]); + expect(result.sources).toEqual(['llms-txt', 'sitemap']); + }); + + it('does not fall back to sitemap when llms.txt meets maxLinksToTest', async () => { + const content = `# Docs\n## Links\n- [A](http://no-merge.local/docs/a): A\n- [B](http://no-merge.local/docs/b): B\n- [C](http://no-merge.local/docs/c): C\n`; + const ctx = makeCtx('http://no-merge.local', content); + ctx.options.maxLinksToTest = 3; + + const result = await getPageUrls(ctx); + expect(result.urls).toEqual([ + 'http://no-merge.local/docs/a', + 'http://no-merge.local/docs/b', + 'http://no-merge.local/docs/c', + ]); + expect(result.sources).toEqual(['llms-txt']); + }); + + it('deduplicates URLs when merging llms.txt and sitemap', async () => { + const content = `# Docs\n## Links\n- [A](http://dedup-merge.local/docs/a): A\n- [B](http://dedup-merge.local/docs/b): B\n`; + const ctx = makeCtx('http://dedup-merge.local', content); + ctx.options.maxLinksToTest = 10; + + server.use( + http.get( + 'http://dedup-merge.local/sitemap.xml', + () => + new HttpResponse( + ` + + http://dedup-merge.local/docs/a + http://dedup-merge.local/docs/b + http://dedup-merge.local/docs/c +`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const result = await getPageUrls(ctx); + // Overlapping URLs should not be duplicated + expect(result.urls).toEqual([ + 'http://dedup-merge.local/docs/a', + 'http://dedup-merge.local/docs/b', + 'http://dedup-merge.local/docs/c', + 
]); + expect(result.sources).toEqual(['llms-txt', 'sitemap']); + }); + + it('applies path-prefix filtering when merging llms.txt and sitemap', async () => { + const content = `# Docs\n## Links\n- [A](http://merge-scope.local/docs/a): A\n- [Blog](http://merge-scope.local/blog/post): Blog\n`; + const ctx = makeCtx('http://merge-scope.local/docs', content); + ctx.options.maxLinksToTest = 10; + + // Register sitemap AFTER makeCtx so it takes precedence over the default 404 handlers + server.use( + http.get( + 'http://merge-scope.local/sitemap.xml', + () => + new HttpResponse( + `<?xml version="1.0" encoding="UTF-8"?> +<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url><loc>http://merge-scope.local/docs/b</loc></url> + <url><loc>http://merge-scope.local/docs/c</loc></url> + <url><loc>http://merge-scope.local/blog/other</loc></url> +</urlset>`, + { status: 200, headers: { 'Content-Type': 'application/xml' } }, + ), + ), + ); + + const result = await getPageUrls(ctx); + // llms.txt /blog/post filtered out by path prefix, sitemap /blog/other filtered too + expect(result.urls).toEqual([ + 'http://merge-scope.local/docs/a', + 'http://merge-scope.local/docs/b', + 'http://merge-scope.local/docs/c', + ]); + expect(result.sources).toEqual(['llms-txt', 'sitemap']); + }); + + it('reports only llms-txt source when sitemap is empty during merge attempt', async () => { + const content = `# Docs\n## Links\n- [A](http://thin-empty.local/docs/a): A\n`; + const ctx = makeCtx('http://thin-empty.local', content); + ctx.options.maxLinksToTest = 10; + // makeCtx already mocks sitemap as 404 — no sitemap URLs to merge + + const result = await getPageUrls(ctx); + expect(result.urls).toEqual(['http://thin-empty.local/docs/a']); + expect(result.sources).toEqual(['llms-txt']); + }); + // ── Progressive disclosure: walking aggregate .txt files ── it('walks aggregate .txt files linked from llms.txt (Cloudflare pattern)', async () => { @@ -1179,6 +1309,8 @@ describe('getPageUrls', () => { 'http://direct-llms.local/docs/llms.txt', () => new HttpResponse('Not found', { status: 404 }), ), + http.get('http://direct-llms.local/robots.txt', () => new 
HttpResponse('', { status: 404 })), + http.get('http://direct-llms.local/sitemap.xml', () => new HttpResponse('', { status: 404 })), ); // No llms-txt-exists in previousResults → standalone mode @@ -1188,6 +1320,7 @@ describe('getPageUrls', () => { 'http://direct-llms.local/docs/intro', 'http://direct-llms.local/docs/guide', ]); + expect(result.sources).toContain('llms-txt'); }); it('skips llms.txt with non-text content-type in standalone mode', async () => { @@ -1675,6 +1808,9 @@ describe('discoverAndSamplePages', () => { message: 'Found', details: { discoveredFiles: discovered }, }); + + mockSitemapNotFound(server, baseUrl); + return ctx; } @@ -1687,6 +1823,7 @@ describe('discoverAndSamplePages', () => { expect(result.totalPages).toBe(2); expect(result.sampled).toBe(false); expect(result.warnings).toEqual([]); + expect(result.sources).toContain('llms-txt'); }); it('samples down to maxLinksToTest when over limit', async () => { diff --git a/test/unit/runner.test.ts b/test/unit/runner.test.ts index b687eeb..2b557cc 100644 --- a/test/unit/runner.test.ts +++ b/test/unit/runner.test.ts @@ -202,7 +202,7 @@ describe('runner', () => { headers: { 'Content-Type': 'text/html' }, }), ), - http.get('http://standalone.local.md', () => new HttpResponse('Not found', { status: 404 })), + http.get('http://standalone.local/.md', () => new HttpResponse('Not found', { status: 404 })), http.get( 'http://standalone.local/index.md', () => new HttpResponse('Not found', { status: 404 }), @@ -296,6 +296,23 @@ describe('runner', () => { }); it('includes timestamp and url in report', async () => { + server.use( + http.get('http://meta.local/llms.txt', () => new HttpResponse(null, { status: 404 })), + http.get('http://meta.local/docs/llms.txt', () => new HttpResponse(null, { status: 404 })), + http.get('http://meta.local/robots.txt', () => new HttpResponse('', { status: 404 })), + http.get('http://meta.local/sitemap.xml', () => new HttpResponse('', { status: 404 })), + http.get( + 
'http://meta.local', + () => + new HttpResponse('<html><body><h1>Home</h1></body></html>', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.get('http://meta.local/.md', () => new HttpResponse(null, { status: 404 })), + http.get('http://meta.local/index.md', () => new HttpResponse(null, { status: 404 })), + ); + const report = await runChecks('http://meta.local', { checkIds: ['tabbed-content-serialization'], requestDelay: 0, @@ -304,4 +321,37 @@ describe('runner', () => { expect(report.url).toBe('http://meta.local'); expect(report.timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); }); + + it('includes discoverySources in report when page discovery runs', async () => { + server.use( + http.get('http://sources.local/llms.txt', () => + HttpResponse.text('# Docs\n## Links\n- [A](http://sources.local/docs/a): A\n'), + ), + http.get('http://sources.local/docs/llms.txt', () => new HttpResponse(null, { status: 404 })), + http.get('http://sources.local/robots.txt', () => new HttpResponse('', { status: 404 })), + http.get('http://sources.local/sitemap.xml', () => new HttpResponse('', { status: 404 })), + http.get( + 'http://sources.local/docs/a', + () => + new HttpResponse('<html><body><h1>A</h1></body></html>', { + status: 200, + headers: { 'Content-Type': 'text/html' }, + }), + ), + http.get('http://sources.local/docs/a.md', () => new HttpResponse(null, { status: 404 })), + http.get( + 'http://sources.local/docs/a/index.md', + () => new HttpResponse(null, { status: 404 }), + ), + ); + + // markdown-url-support triggers discoverAndSamplePages, which populates sources + const report = await runChecks('http://sources.local', { + checkIds: ['llms-txt-exists', 'markdown-url-support'], + requestDelay: 0, + }); + + expect(report.discoverySources).toBeDefined(); + expect(report.discoverySources).toContain('llms-txt'); + }); });