Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 63 additions & 9 deletions src/helpers/get-page-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -238,9 +238,13 @@ async function discoverSitemapUrls(ctx: CheckContext, originOverride?: string):
return candidates;
}

/**
 * Identifies a discovery method that contributed page URLs: links taken from
 * llms.txt (`'llms-txt'`), URLs obtained from a sitemap (`'sitemap'`), or the
 * base-URL fallback used when neither produced any URLs (`'fallback'`).
 */
export type DiscoverySource = 'llms-txt' | 'sitemap' | 'fallback';

/** Result of page URL discovery, as returned by `getPageUrls`. */
export interface PageUrlResult {
/** Discovered page URLs; contains only the base URL when discovery falls back. */
urls: string[];
/**
 * Warnings gathered during discovery.
 * NOTE(review): the array is handed to the sitemap lookup, which presumably
 * appends to it — confirm against `getUrlsFromSitemap`.
 */
warnings: string[];
/** Which discovery methods contributed to the final URL set. */
sources: DiscoverySource[];
}

function isGzipped(url: string): boolean {
Expand Down Expand Up @@ -740,19 +744,40 @@ export function filterByPathPrefix(urls: string[], baseUrl: string): string[] {
return urls.filter((url) => matchesPathPrefix(url, baseUrlPath));
}

/**
 * Concatenate two URL lists into a fresh array without repeating entries
 * across them.
 *
 * Every entry of `primary` is kept in its original order (duplicates that
 * already exist inside `primary` are not removed). Entries of `secondary`
 * follow, each included only the first time it is seen and only when
 * `primary` does not already contain it. Neither input array is modified.
 */
function mergeUrlSets(primary: string[], secondary: string[]): string[] {
  const known = new Set(primary);
  const additions = secondary.filter((candidate) => {
    if (known.has(candidate)) return false;
    // Remember it so a repeat later in `secondary` is dropped as well.
    known.add(candidate);
    return true;
  });
  return primary.concat(additions);
}

/**
* Discover page URLs from llms.txt links, sitemap, or fall back to baseUrl.
*
* Priority:
* 1. llms.txt links (from previous check results)
* 1. llms.txt links (from previous check results or direct fetch)
* 2. Sitemap URLs (robots.txt Sitemap directives, then /sitemap.xml fallback)
* 3. baseUrl fallback
*
* When llms.txt produces URLs but fewer than `maxLinksToTest`, sitemap
* URLs are merged in (deduped) so the sample covers a broader surface.
* The `sources` field records which discovery methods contributed.
*
* All discovered URLs are filtered to the baseUrl's path prefix so that
* docs at a subpath (e.g. `/docs`) don't include unrelated site content.
*/
export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {
const warnings: string[] = [];
const sources: DiscoverySource[] = [];

const locale = ctx.options.preferredLocale ?? extractLocaleFromUrl(ctx.baseUrl);
const version = ctx.options.preferredVersion ?? extractVersionFromUrl(ctx.baseUrl);
Expand All @@ -770,22 +795,43 @@ export async function getPageUrls(ctx: CheckContext): Promise<PageUrlResult> {

// 1. Try llms.txt links from cached results (if llms-txt-exists ran)
const cachedUrls = await getUrlsFromCachedLlmsTxt(ctx);
const scopedCachedUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase));
if (scopedCachedUrls.length > 0) return { urls: scopedCachedUrls, warnings };
let llmsTxtUrls = refineUrls(filterByPathPrefix(cachedUrls, filterBase));

// 2. Try fetching llms.txt directly (standalone mode, llms-txt-exists didn't run)
if (!ctx.previousResults.has('llms-txt-exists')) {
if (llmsTxtUrls.length === 0 && !ctx.previousResults.has('llms-txt-exists')) {
const fetchedUrls = await fetchLlmsTxtUrls(ctx);
const scopedFetchedUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase));
if (scopedFetchedUrls.length > 0) return { urls: scopedFetchedUrls, warnings };
llmsTxtUrls = refineUrls(filterByPathPrefix(fetchedUrls, filterBase));
}

if (llmsTxtUrls.length > 0) {
sources.push('llms-txt');

// If llms.txt meets the requested sample size, no need for sitemap
if (llmsTxtUrls.length >= ctx.options.maxLinksToTest) {
return { urls: llmsTxtUrls, warnings, sources };
}

// llms.txt is thin — try sitemap to fill the gap
const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase });
if (sitemapUrls.length > 0) {
sources.push('sitemap');
return { urls: mergeUrlSets(llmsTxtUrls, sitemapUrls), warnings, sources };
}

// Sitemap had nothing; return llms.txt URLs alone
return { urls: llmsTxtUrls, warnings, sources };
}

// 3. Try sitemap (path, locale, and version filtering applied inside)
const sitemapUrls = await getUrlsFromSitemap(ctx, warnings, { pathFilterBase: filterBase });
if (sitemapUrls.length > 0) return { urls: sitemapUrls, warnings };
if (sitemapUrls.length > 0) {
sources.push('sitemap');
return { urls: sitemapUrls, warnings, sources };
}

// 4. Fallback
return { urls: [ctx.baseUrl], warnings };
sources.push('fallback');
return { urls: [ctx.baseUrl], warnings, sources };
}

export interface SampledPages {
Expand All @@ -795,6 +841,8 @@ export interface SampledPages {
warnings: string[];
/** When curated pages have tags, maps page URL to tag label. */
urlTags?: Record<string, string>;
/** Which discovery methods contributed to the page URL set. */
sources?: DiscoverySource[];
}

/**
Expand Down Expand Up @@ -888,6 +936,12 @@ export async function discoverAndSamplePages(ctx: CheckContext): Promise<Sampled
}
}

ctx._sampledPages = { urls, totalPages, sampled, warnings: discovery.warnings };
ctx._sampledPages = {
urls,
totalPages,
sampled,
warnings: discovery.warnings,
sources: discovery.sources,
};
return ctx._sampledPages;
}
2 changes: 2 additions & 0 deletions src/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ export async function runChecks(
};

const urlTags = ctx._sampledPages?.urlTags;
const discoverySources = ctx._sampledPages?.sources;

return {
url: baseUrl,
Expand All @@ -133,5 +134,6 @@ export async function runChecks(
results,
summary,
...(urlTags && { urlTags }),
...(discoverySources && { discoverySources }),
};
}
4 changes: 3 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { SampledPages } from './helpers/get-page-urls.js';
import type { DiscoverySource, SampledPages } from './helpers/get-page-urls.js';

export type CheckStatus = 'pass' | 'warn' | 'fail' | 'skip' | 'error';

Expand Down Expand Up @@ -154,6 +154,8 @@ export interface ReportResult {
};
/** When curated pages have tags, maps page URL to tag label. */
urlTags?: Record<string, string>;
/** Which discovery methods contributed to the page URL set. */
discoverySources?: DiscoverySource[];
}

export interface AgentDocsConfig {
Expand Down
32 changes: 32 additions & 0 deletions test/helpers/mock-sitemap-not-found.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import { http, HttpResponse } from 'msw';
import type { SetupServerApi } from 'msw/node';

/**
 * Install MSW handlers that answer sitemap-discovery requests with an empty
 * 404 response.
 *
 * Always covered, at the origin of `baseUrl`: `/robots.txt` and
 * `/sitemap.xml`. When `baseUrl` has a non-root path, `<path>/sitemap.xml`
 * and `<path>/sitemap-index.xml` are covered too (a single trailing slash on
 * the path is ignored).
 *
 * Intended for tests that provide llms.txt content: the "thin llms.txt → try
 * sitemap" discovery step then fails fast instead of timing out on an
 * unmocked test domain.
 */
export function mockSitemapNotFound(server: SetupServerApi, baseUrl: string): void {
  const { origin, pathname } = new URL(baseUrl);
  const notFoundPaths = ['/robots.txt', '/sitemap.xml'];

  // Strip one trailing slash so a root URL ("/") yields no subpath at all.
  const subpath = pathname.replace(/\/$/, '');
  if (subpath.length > 0) {
    notFoundPaths.push(`${subpath}/sitemap.xml`, `${subpath}/sitemap-index.xml`);
  }

  server.use(
    ...notFoundPaths.map((path) =>
      http.get(`${origin}${path}`, () => new HttpResponse('', { status: 404 })),
    ),
  );
}
72 changes: 70 additions & 2 deletions test/integration/check-pipeline.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import { http, HttpResponse } from 'msw';
import { setupServer } from 'msw/node';
import { runChecks } from '../../src/runner.js';
import '../../src/checks/index.js';
import { mockSitemapNotFound } from '../helpers/mock-sitemap-not-found.js';

const server = setupServer();

Expand Down Expand Up @@ -51,10 +52,16 @@ function setupSite(
http.get(`http://${host}/docs/llms.txt`, () => new HttpResponse(null, { status: 404 })),
);

// Sitemap discovery: return 404 so the fallback doesn't time out
handlers.push(
http.get(`http://${host}/robots.txt`, () => new HttpResponse('', { status: 404 })),
http.get(`http://${host}/sitemap.xml`, () => new HttpResponse('', { status: 404 })),
);

const defaultCacheHeaders = opts.cacheControl ? { 'Cache-Control': opts.cacheControl } : {};

for (const page of opts.pages) {
// HTML version
// HTML version (GET and HEAD)
handlers.push(
http.get(`http://${host}${page.path}`, ({ request }) => {
const accept = request.headers.get('accept') ?? '';
Expand All @@ -72,9 +79,11 @@ function setupSite(
{ status: 200, headers: { 'Content-Type': 'text/html', ...defaultCacheHeaders } },
);
}),
// HEAD handler for llms-txt-links-resolve — mirrors GET status
http.head(`http://${host}${page.path}`, () => new HttpResponse(null, { status: 200 })),
);

// .md URL
// .md URL (both /page.md and /page/index.md candidates from toMdUrls)
if (page.md) {
handlers.push(
http.get(
Expand All @@ -85,10 +94,22 @@ function setupSite(
headers: { 'Content-Type': 'text/markdown', ...defaultCacheHeaders },
}),
),
http.get(
`http://${host}${page.path}/index.md`,
() =>
new HttpResponse(page.md!, {
status: 200,
headers: { 'Content-Type': 'text/markdown', ...defaultCacheHeaders },
}),
),
);
} else {
handlers.push(
http.get(`http://${host}${page.path}.md`, () => new HttpResponse(null, { status: 404 })),
http.get(
`http://${host}${page.path}/index.md`,
() => new HttpResponse(null, { status: 404 }),
),
);
}

Expand Down Expand Up @@ -566,11 +587,16 @@ describe('check pipeline: HTML fetch cache shared across checks', () => {
'http://pipe-htmlcache.local/docs/guide.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-htmlcache.local/docs/guide/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-htmlcache.local/docs/guide-afdocs-nonexistent-8f3a',
() => new HttpResponse('Not Found', { status: 404 }),
),
);
mockSitemapNotFound(server, 'http://pipe-htmlcache.local');

const report = await runChecks('http://pipe-htmlcache.local', {
checkIds: ['llms-txt-exists', 'page-size-html', 'tabbed-content-serialization'],
Expand Down Expand Up @@ -655,11 +681,16 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () =
'http://pipe-auth-llms.local/docs/page.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-auth-llms.local/docs/page/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-auth-llms.local/docs/page-afdocs-nonexistent-8f3a',
() => new HttpResponse('Not Found', { status: 404 }),
),
);
mockSitemapNotFound(server, 'http://pipe-auth-llms.local');

const report = await runChecks('http://pipe-auth-llms.local', {
checkIds: ['llms-txt-exists', 'auth-gate-detection', 'auth-alternative-access'],
Expand Down Expand Up @@ -733,6 +764,14 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () =
'http://pipe-auth-md.local/docs/private.md',
() => new HttpResponse(null, { status: 403 }),
),
http.get(
'http://pipe-auth-md.local/docs/public/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-auth-md.local/docs/private/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-auth-md.local/docs/public-afdocs-nonexistent-8f3a',
() => new HttpResponse('Not Found', { status: 404 }),
Expand All @@ -742,6 +781,7 @@ describe('check pipeline: auth-gate-detection → auth-alternative-access', () =
() => new HttpResponse('Not Found', { status: 404 }),
),
);
mockSitemapNotFound(server, 'http://pipe-auth-md.local');

const report = await runChecks('http://pipe-auth-md.local', {
checkIds: [
Expand Down Expand Up @@ -802,12 +842,18 @@ describe('check pipeline: rendering-strategy → tabbed-content-serialization',
'http://pipe-spa-tab.local/docs/guide.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-spa-tab.local/docs/guide/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-spa-tab.local/docs/guide-afdocs-nonexistent-8f3a',
() => new HttpResponse('Not Found', { status: 404 }),
),
);

mockSitemapNotFound(server, 'http://pipe-spa-tab.local');

const report = await runChecks('http://pipe-spa-tab.local', {
checkIds: ['llms-txt-exists', 'rendering-strategy', 'tabbed-content-serialization'],
requestDelay: 0,
Expand Down Expand Up @@ -866,12 +912,18 @@ describe('check pipeline: tabbed-content-serialization → section-header-qualit
'http://pipe-tab-hdr.local/docs/install.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-tab-hdr.local/docs/install/index.md',
() => new HttpResponse(null, { status: 404 }),
),
http.get(
'http://pipe-tab-hdr.local/docs/install-afdocs-nonexistent-8f3a',
() => new HttpResponse('Not Found', { status: 404 }),
),
);

mockSitemapNotFound(server, 'http://pipe-tab-hdr.local');

const report = await runChecks('http://pipe-tab-hdr.local', {
checkIds: ['llms-txt-exists', 'tabbed-content-serialization', 'section-header-quality'],
requestDelay: 0,
Expand Down Expand Up @@ -921,6 +973,10 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow'
headers: { 'Content-Type': 'text/html' },
}),
),
http.head(
'http://pipe-llms-md.local/docs/guide',
() => new HttpResponse(null, { status: 200 }),
),
// .md URL returns markdown
http.get(
'http://pipe-llms-md.local/docs/guide.md',
Expand All @@ -930,8 +986,18 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow'
headers: { 'Content-Type': 'text/markdown' },
}),
),
http.head(
'http://pipe-llms-md.local/docs/guide.md',
() => new HttpResponse(null, { status: 200, headers: { 'Content-Type': 'text/markdown' } }),
),
http.get(
'http://pipe-llms-md.local/docs/guide/index.md',
() => new HttpResponse(null, { status: 404 }),
),
);

mockSitemapNotFound(server, 'http://pipe-llms-md.local');

const report = await runChecks('http://pipe-llms-md.local', {
checkIds: ['llms-txt-exists', 'llms-txt-links-markdown'],
requestDelay: 0,
Expand All @@ -958,6 +1024,8 @@ describe('check pipeline: llms-txt-exists → llms-txt-links-markdown data flow'
),
);

mockSitemapNotFound(server, 'http://pipe-llms-md-nollms.local');

const report = await runChecks('http://pipe-llms-md-nollms.local', {
checkIds: ['llms-txt-exists', 'llms-txt-links-markdown'],
requestDelay: 0,
Expand Down
Loading