Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 1 addition & 21 deletions src/checks/content-discoverability/llms-txt-directive.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { registerCheck } from '../registry.js';
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
import { toHtmlUrl } from '../../helpers/to-md-urls.js';
import type { CheckContext, CheckResult } from '../../types.js';

interface DirectiveResult {
Expand Down Expand Up @@ -52,27 +53,6 @@ function extractBody(html: string): { body: string; offset: number } {
return { body: html.slice(bodyStart, bodyEnd), offset: bodyStart };
}

/**
 * Map a markdown URL onto the HTML page URL it corresponds to.
 *
 * Only paths ending in `.md` are rewritten (`.mdx` is left untouched): the
 * `.md` extension, or a whole `/index.md` suffix, is removed. When the
 * remaining path contains no `.` at all, a trailing slash is appended so the
 * result looks like a directory URL. Anything that cannot be parsed as a URL,
 * or whose path does not end in `.md`, is returned exactly as given.
 */
function toHtmlUrl(url: string): string {
  try {
    const target = new URL(url);
    if (!target.pathname.endsWith('.md')) {
      return url;
    }

    const stripped = target.pathname.replace(/(?:\/index)?\.md$/, '');
    // An empty path means the URL pointed at the site root.
    target.pathname = stripped === '' ? '/' : stripped;

    // Re-read the pathname after assignment so the check sees the normalized value.
    const looksLikeDirectory = target.pathname !== '/' && !target.pathname.includes('.');
    if (looksLikeDirectory) {
      target.pathname = target.pathname.replace(/\/?$/, '/');
    }
    return target.toString();
  } catch {
    // Unparseable input: hand it back unchanged.
    return url;
  }
}

function searchContent(
content: string,
pattern: RegExp,
Expand Down
41 changes: 37 additions & 4 deletions src/checks/markdown-availability/content-negotiation.ts
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
import { registerCheck } from '../registry.js';
import { looksLikeMarkdown, looksLikeHtml } from '../../helpers/detect-markdown.js';
import { isSoft404Body } from '../../helpers/detect-soft-404.js';
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
import { isNonPageUrl } from '../../helpers/to-md-urls.js';
import { isNonPageUrl, isMdUrl, toHtmlUrl } from '../../helpers/to-md-urls.js';
import type { CheckContext, CheckResult } from '../../types.js';

/**
 * Outcome assigned to each page probed with `Accept: text/markdown`.
 * `'html'` doubles as the fallback value: it is also used for skipped URLs,
 * soft-404 bodies, and fetches that threw.
 * NOTE(review): the two markdown variants presumably distinguish whether the
 * response Content-Type was `text/markdown` — confirm against the classifier.
 */
type Classification = 'markdown-with-correct-type' | 'markdown-with-wrong-type' | 'html';

interface PageResult {
url: string;
/** The URL actually fetched (may differ from url if .md was normalized). */
testedUrl?: string;
classification: Classification;
skipped?: boolean;
softError?: boolean;
contentType: string;
status: number;
error?: string;
Expand Down Expand Up @@ -37,12 +41,34 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
if (isNonPageUrl(url)) {
return { url, classification: 'html', skipped: true, contentType: '', status: 0 };
}

// Pre-request: normalize .md/.mdx URLs to their canonical HTML form (#33).
// Testing content negotiation against a .md URL is meaningless because the
// server already serves markdown at that path by definition.
const fetchUrl = isMdUrl(url) ? toHtmlUrl(url) : url;
const testedUrl = fetchUrl !== url ? fetchUrl : undefined;

try {
const response = await ctx.http.fetch(url, {
const response = await ctx.http.fetch(fetchUrl, {
headers: { Accept: 'text/markdown' },
});
const body = await response.text();
const contentType = response.headers.get('content-type') ?? '';

// Post-response: reject soft-404 error pages (#29).
// Some servers return 200 with text/markdown for error pages
// (e.g. "# Page Not Found"), which would inflate scores.
if (isSoft404Body(body)) {
return {
url,
testedUrl,
classification: 'html',
softError: true,
contentType,
status: response.status,
};
}

const isMarkdownType = contentType.includes('text/markdown');
const isMarkdownBody = looksLikeMarkdown(body);

Expand All @@ -68,10 +94,11 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
classification = 'html';
}

return { url, classification, contentType, status: response.status };
return { url, testedUrl, classification, contentType, status: response.status };
} catch (err) {
return {
url,
testedUrl,
classification: 'html',
contentType: '',
status: 0,
Expand All @@ -85,6 +112,8 @@ async function check(ctx: CheckContext): Promise<CheckResult> {

const testedResults = results.filter((r) => !r.skipped);
const skippedCount = results.length - testedResults.length;
const normalizedCount = testedResults.filter((r) => r.testedUrl).length;
const softErrorCount = testedResults.filter((r) => r.softError).length;
const markdownWithCorrectType = testedResults.filter(
(r) => r.classification === 'markdown-with-correct-type',
).length;
Expand All @@ -102,12 +131,16 @@ async function check(ctx: CheckContext): Promise<CheckResult> {
const pageLabel = wasSampled ? 'sampled pages' : 'pages';
const suffix =
(fetchErrors > 0 ? `; ${fetchErrors} failed to fetch` : '') +
(rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : '');
(rateLimited > 0 ? `; ${rateLimited} rate-limited (HTTP 429)` : '') +
(softErrorCount > 0 ? `; ${softErrorCount} returned error pages` : '') +
(normalizedCount > 0 ? `; ${normalizedCount} .md URLs normalized` : '');

const details: Record<string, unknown> = {
totalPages,
testedPages: testedResults.length,
skippedPages: skippedCount,
normalizedMdUrls: normalizedCount,
softErrorPages: softErrorCount,
sampled: wasSampled,
markdownWithCorrectType,
markdownWithWrongType,
Expand Down
22 changes: 1 addition & 21 deletions src/checks/observability/markdown-content-parity.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { parse } from 'node-html-parser';
import { registerCheck } from '../registry.js';
import { fetchPage } from '../../helpers/fetch-page.js';
import { toHtmlUrl } from '../../helpers/to-md-urls.js';
import type { CheckContext, CheckResult, CheckStatus } from '../../types.js';

/** Thresholds for the percentage of HTML segments not found in markdown. */
Expand Down Expand Up @@ -582,27 +583,6 @@ function computeParity(
};
}

/**
 * Recover the HTML page URL from a cached markdown page URL.
 *
 * Rewrites applied to the path (query and fragment are left alone):
 *   /docs/guide.md        -> /docs/guide
 *   /docs/guide.mdx       -> /docs/guide
 *   /docs/guide/index.md  -> /docs/guide/
 *   /docs/guide/index.mdx -> /docs/guide/
 * The `/index` form is matched case-sensitively; the plain extension is
 * matched case-insensitively. URLs without a `.md`/`.mdx` extension are
 * returned unchanged. Throws if `url` is not a valid absolute URL.
 */
function toHtmlUrl(url: string): string {
  const target = new URL(url);
  const path = target.pathname;

  let htmlPath: string | undefined;
  if (path.endsWith('/index.md') || path.endsWith('/index.mdx')) {
    // Directory index file: keep the directory, including its trailing slash.
    htmlPath = path.replace(/\/index\.mdx?$/, '/');
  } else if (/\.mdx?$/i.test(path)) {
    // Plain markdown file: drop only the extension.
    htmlPath = path.replace(/\.mdx?$/i, '');
  }

  if (htmlPath === undefined) {
    return url;
  }
  target.pathname = htmlPath;
  return target.toString();
}

function worstStatus(statuses: CheckStatus[]): CheckStatus {
if (statuses.includes('fail')) return 'fail';
if (statuses.includes('warn')) return 'warn';
Expand Down
3 changes: 1 addition & 2 deletions src/checks/url-stability/http-status-codes.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { registerCheck } from '../registry.js';
import { discoverAndSamplePages } from '../../helpers/get-page-urls.js';
import { SOFT_404_PATTERNS } from '../../helpers/detect-soft-404.js';
import type { CheckContext, CheckResult } from '../../types.js';

interface StatusCodeResult {
Expand All @@ -21,8 +22,6 @@ function makeBadUrl(pageUrl: string): string {
return u.toString();
}

// Case-insensitive matcher for "not found"-style wording in a response body.
// The bare `404` alternative matches anywhere, including inside longer numbers.
const SOFT_404_PATTERNS = /not\s*found|page\s*not\s*found|404|does\s*not\s*exist/i;

async function check(ctx: CheckContext): Promise<CheckResult> {
const id = 'http-status-codes';
const category = 'url-stability';
Expand Down
35 changes: 35 additions & 0 deletions src/helpers/detect-soft-404.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/**
 * Broad soft-404 detection pattern.
 *
 * Matches common "not found" text in response bodies. Used by http-status-codes
 * as a hint on pages already suspected of being soft-404s (fabricated bad URLs
 * that returned 200).
 *
 * The pattern is case-insensitive and has no `g`/`y` flag, so `.test()` keeps
 * no state between calls. The bare `404` alternative matches anywhere in the
 * text, including inside longer numbers.
 */
export const SOFT_404_PATTERNS = /not\s*found|page\s*not\s*found|404|does\s*not\s*exist/i;

/**
 * Returns true if a markdown response body looks like an error page rather than
 * real content. This is stricter than SOFT_404_PATTERNS because it runs on
 * legitimate page URLs where documentation might naturally mention "404".
 *
 * Detection strategy:
 *   1. If the first markdown heading found in the leading 500 characters
 *      contains an error pattern, it's an error page. Real error pages say
 *      "# Page Not Found"; real docs don't lead with that.
 *   2. If the body is very short (< 500 chars), scan it entirely. Terse error
 *      responses like "Not found" should still be caught.
 *
 * @param body - Raw response body text.
 * @returns `true` when the body matches either rule above, otherwise `false`
 *   (including for an empty body).
 */
export function isSoft404Body(body: string): boolean {
  const scanLimit = 500;

  // Check the first markdown heading (e.g. "# Page Not Found"). The separator
  // after the hashes is restricted to spaces/tabs so the match cannot spill
  // over a line break and treat the following paragraph as heading text.
  const headingMatch = /^#{1,6}[ \t]+(.+)/m.exec(body.slice(0, scanLimit));
  const headingText = headingMatch?.[1];
  if (headingText !== undefined && SOFT_404_PATTERNS.test(headingText)) {
    return true;
  }

  // For very short bodies, scan the whole thing. A real page has substantial
  // content; a terse error message like "Not found" or "404" is short.
  if (body.length < scanLimit) {
    return SOFT_404_PATTERNS.test(body);
  }

  return false;
}
37 changes: 37 additions & 0 deletions src/helpers/to-md-urls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,43 @@ export function isNonPageUrl(url: string): boolean {
);
}

/**
 * Convert a .md or .mdx URL back to its canonical HTML equivalent.
 * Inverts the transforms from toMdUrls():
 *   /docs/guide.md       -> /docs/guide
 *   /docs/guide/index.md -> /docs/guide/
 *   /docs/guide.mdx      -> /docs/guide
 * Extension and `index` matching are both case-insensitive, consistent with
 * isMdUrl(). Only the path is rewritten; query string and fragment are kept.
 * If the URL doesn't end in .md/.mdx, or cannot be parsed, return it unchanged.
 *
 * @param url - Absolute URL to convert.
 * @returns The HTML-form URL, or `url` itself when no rewrite applies.
 */
export function toHtmlUrl(url: string): string {
  const indexSuffix = /\/index\.mdx?$/i;
  const mdExtension = /\.mdx?$/i;

  try {
    const parsed = new URL(url);
    const { pathname } = parsed;

    // Directory index files map to the directory itself (trailing slash kept).
    // This must be checked first, since the generic extension rule also matches.
    if (indexSuffix.test(pathname)) {
      parsed.pathname = pathname.replace(indexSuffix, '/');
      return parsed.toString();
    }
    if (mdExtension.test(pathname)) {
      parsed.pathname = pathname.replace(mdExtension, '');
      return parsed.toString();
    }
  } catch {
    // Fall through to return original
  }
  return url;
}

/**
 * Tells whether a URL's path ends in a `.md` or `.mdx` extension
 * (case-insensitive). Query string and fragment are ignored because only the
 * parsed pathname is inspected. Input that cannot be parsed as a URL yields
 * `false`.
 */
export function isMdUrl(url: string): boolean {
  let pathname: string;
  try {
    ({ pathname } = new URL(url));
  } catch {
    return false;
  }
  return /\.mdx?$/i.test(pathname);
}

/**
* Generate candidate .md URLs for a page URL.
* If the URL already ends in .md, return it as-is.
Expand Down
Loading