From cd52669f1ffa8116a0f61959d73eff113de457ab Mon Sep 17 00:00:00 2001 From: Pamela Chia Date: Tue, 9 Jun 2026 17:49:35 +0800 Subject: [PATCH] fix(docs): negotiate /guides/* markdown via shared helper (#45432) ## Summary This brings docs `/guides/*` to full content negotiation for AI agents (GROWTH-811): RFC 9110 q-value parsing instead of a `.includes('text/markdown')` substring match, a 406 when the client rejects every type the route can produce, and markdown rewrites for known LLM user agents. I implemented it by extracting the negotiation into a shared `common/markdown-negotiation` module consumed by both `apps/docs/middleware.ts` and `apps/www/middleware.ts`, rather than duplicating the helpers into docs and keeping them in sync by hand with www (#45394). Single source of truth, no re-sync burden. www is refactored onto the shared helper with no behavior change. ## Changes ### docs `/guides/*` content negotiation (GROWTH-811) - Replace the `.includes('text/markdown')` substring match with RFC 9110 q-value parsing. - Return 406 (`Cache-Control: no-store`, `Vary: Accept`) when Accept excludes every type the route serves. Bypassed for LLM user agents, the `.md` suffix, and clients sending no Accept. - Rewrite to `/api/guides-md/` for LLM user agents (Claude-User, Claude-Web, ChatGPT-User, PerplexityBot) regardless of Accept. - Preserve the existing `.md` suffix routing and the entire `/reference/*` block. ### Shared negotiation helper - New `packages/common/markdown-negotiation.ts`: `negotiateMarkdown(signals, route)` returns `'markdown' | 'not-acceptable' | 'pass'`. Internalizes q-value parsing, the LLM user-agent match, the UA-length cap, and the markdown-vs-html preference. - `apps/www/middleware.ts`: refactored to consume the shared helper; its duplicated copy of the negotiation helpers (added in #45394) is removed. `.md` early-return, changelog routing, and first-referrer cookie stamping are unchanged (no behavior change, covered by its existing tests). 
### Tests - New `apps/docs/middleware.test.ts`: q-value priority, the 406 path, `.md` suffix, LLM UA override, browser default Accept, training-crawler and substring-embed exclusion, and the `/reference/*` exemption. - New `packages/common/markdown-negotiation.test.ts`: the same decision matrix at the unit level (q-values, 406, LLM UAs, `.md`, `*/*`, training crawlers, OWS, out-of-range q). ## Testing (Vercel preview) After Vercel posts a preview URL, save it once then run the probe set. ```bash echo 'PREVIEW_HOST' > /tmp/growth-811-host.txt HOST=$(cat /tmp/growth-811-host.txt) # 1) Browser-style Accept -> HTML 200 curl -sI -A "Mozilla/5.0" \ -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \ "https://$HOST/docs/guides/auth" # 2) Accept: text/markdown -> markdown 200 curl -sI -H 'Accept: text/markdown' "https://$HOST/docs/guides/auth" # 3) text/html;q=1.0, text/markdown;q=0.5 -> HTML 200 curl -sI -H 'Accept: text/html;q=1.0, text/markdown;q=0.5' "https://$HOST/docs/guides/auth" # 4) unsupported Accept -> 406 + Cache-Control: no-store + Vary: Accept curl -sI -H 'Accept: application/x-content-negotiation-probe' "https://$HOST/docs/guides/auth" # 5) User-Agent: Claude-User/1.0 (any Accept) -> markdown 200 curl -sI -A 'Claude-User/1.0' "https://$HOST/docs/guides/auth" ``` ### After merge Run [acceptmarkdown.com/readiness-check](https://acceptmarkdown.com/readiness-check) against `https://supabase.com/docs/guides/auth`: expect 100/100. 
## Linear - fixes GROWTH-811 --- apps/docs/middleware.test.ts | 150 +++++++++++++++++ apps/docs/middleware.ts | 44 +++-- apps/www/middleware.ts | 78 ++------- packages/common/markdown-negotiation.test.ts | 167 +++++++++++++++++++ packages/common/markdown-negotiation.ts | 86 ++++++++++ 5 files changed, 443 insertions(+), 82 deletions(-) create mode 100644 apps/docs/middleware.test.ts create mode 100644 packages/common/markdown-negotiation.test.ts create mode 100644 packages/common/markdown-negotiation.ts diff --git a/apps/docs/middleware.test.ts b/apps/docs/middleware.test.ts new file mode 100644 index 00000000000..9e447e4afb8 --- /dev/null +++ b/apps/docs/middleware.test.ts @@ -0,0 +1,150 @@ +import { NextRequest } from 'next/server' +import { describe, expect, it } from 'vitest' + +import { middleware } from './middleware' + +// BASE_PATH defaults to '/docs' when NEXT_PUBLIC_BASE_PATH is unset, so test +// paths include the /docs prefix to match the middleware's GUIDES_PATH check. +function makeRequest( + path: string, + { accept, userAgent }: { accept?: string; userAgent?: string } = {} +): NextRequest { + const headers: Record = {} + if (accept) headers.accept = accept + if (userAgent) headers['user-agent'] = userAgent + return new NextRequest(new URL(path, 'https://supabase.com'), { headers }) +} + +const REWRITE_HEADER = 'x-middleware-rewrite' +const GUIDES_MD_REWRITE = (slug: string) => `https://supabase.com/docs/api/guides-md/${slug}` + +describe('docs middleware — /guides/* content negotiation', () => { + it('rewrites to /api/guides-md/ when Accept includes text/markdown', () => { + const req = makeRequest('/docs/guides/auth', { accept: 'text/markdown' }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + }) + + it('rewrites /.md to /api/guides-md/ regardless of Accept', () => { + for (const accept of [undefined, 'text/html']) { + const req = makeRequest('/docs/guides/auth.md', accept ? 
{ accept } : {}) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + } + }) + + it('serves HTML for browser-style Accept', () => { + const req = makeRequest('/docs/guides/auth', { + accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBeNull() + }) + + it('serves markdown when md q-value beats html', () => { + const req = makeRequest('/docs/guides/auth', { + accept: 'text/html;q=0.5, text/markdown;q=1.0', + }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + }) + + it('falls through to HTML when no Accept header is sent', () => { + expect(middleware(makeRequest('/docs/guides/auth')).headers.get(REWRITE_HEADER)).toBeNull() + }) + + it('falls through to HTML for bare Accept: */* (no explicit md preference)', () => { + const req = makeRequest('/docs/guides/auth', { accept: '*/*' }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBeNull() + }) + + it('does not serve markdown when client explicitly rejects it (q=0)', () => { + const req = makeRequest('/docs/guides/auth', { + accept: 'text/markdown;q=0, text/html;q=1.0', + }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBeNull() + }) + + it('tolerates OWS in q-params and clamps out-of-range q-values', () => { + // OWS around q: html wins. + const ows = makeRequest('/docs/guides/auth', { + accept: 'text/html ; q = 1.0, text/markdown ; q = 0.5', + }) + expect(middleware(ows).headers.get(REWRITE_HEADER)).toBeNull() + // q=2.0 is out-of-range, falls back to default 1.0; tie -> markdown. 
+ const oor = makeRequest('/docs/guides/auth', { + accept: 'text/html;q=2.0, text/markdown;q=1.0', + }) + expect(middleware(oor).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + }) + + it('returns 406 with Cache-Control: no-store and Vary: Accept when Accept excludes every type', () => { + const req = makeRequest('/docs/guides/auth', { + accept: 'application/x-content-negotiation-probe', + }) + const res = middleware(req) + expect(res.status).toBe(406) + expect(res.headers.get('Cache-Control')).toBe('no-store') + expect(res.headers.get('Vary')).toBe('Accept') + }) + + it('does not 406 for LLM UAs or .md suffix paths even with a probe Accept', () => { + const llm = makeRequest('/docs/guides/auth', { + accept: 'application/x-content-negotiation-probe', + userAgent: 'Claude-User/1.0', + }) + expect(middleware(llm).status).not.toBe(406) + expect(middleware(llm).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + + const md = makeRequest('/docs/guides/auth.md', { + accept: 'application/x-content-negotiation-probe', + }) + expect(middleware(md).status).not.toBe(406) + expect(middleware(md).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + }) + + it('does not 406 on /reference/* (negotiation contract is /guides/* only)', () => { + const req = makeRequest('/docs/reference/javascript/introduction', { + accept: 'application/x-content-negotiation-probe', + }) + expect(middleware(req).status).not.toBe(406) + }) + + it('rewrites for each LLM user agent', () => { + for (const ua of [ + 'Claude-User (claude-code/2.1.119; +https://support.anthropic.com/)', + 'Claude-Web/1.0', + 'Mozilla/5.0 (compatible; ChatGPT-User/1.0)', + 'PerplexityBot/1.0', + ]) { + const req = makeRequest('/docs/guides/auth', { userAgent: ua }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + } + }) + + it('LLM UA overrides an Accept header that prefers HTML', () => { + const req = makeRequest('/docs/guides/auth', { + accept: 
'text/html;q=1.0, text/markdown;q=0.1', + userAgent: 'Claude-User/1.0', + }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBe(GUIDES_MD_REWRITE('auth')) + }) + + it('falls through for non-LLM UAs (browsers, training crawlers, substring embeds)', () => { + for (const ua of [ + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36', + 'GPTBot/1.0', + 'ClaudeBot/1.0', + 'CCBot/2.0', + 'chatgpt-userscript/2.0', + 'NotPerplexityBot', + ]) { + const req = makeRequest('/docs/guides/auth', { userAgent: ua }) + expect(middleware(req).headers.get(REWRITE_HEADER)).toBeNull() + } + }) + + it('does not apply LLM UA rewrite to /reference/* (guides-only)', () => { + const req = makeRequest('/docs/reference/javascript/introduction', { + userAgent: 'Claude-User/1.0', + }) + const rewrite = middleware(req).headers.get(REWRITE_HEADER) ?? '' + expect(rewrite).not.toContain('/api/guides-md/') + }) +}) diff --git a/apps/docs/middleware.ts b/apps/docs/middleware.ts index 34cf26138b5..42817c237c7 100644 --- a/apps/docs/middleware.ts +++ b/apps/docs/middleware.ts @@ -1,33 +1,47 @@ +import { clientSdkIds } from '~/content/navigation.references' +import { BASE_PATH } from '~/lib/constants' +import { negotiateMarkdown } from 'common/markdown-negotiation' import { isbot } from 'isbot' import { NextResponse, type NextRequest } from 'next/server' -import { clientSdkIds } from '~/content/navigation.references' -import { BASE_PATH } from '~/lib/constants' - const REFERENCE_PATH = `${BASE_PATH ?? ''}/reference` - const GUIDES_PATH = `${BASE_PATH ?? 
''}/guides` export function middleware(request: NextRequest) { const url = new URL(request.url) + const { pathname } = url - const requestsMarkdown = - request.headers.get('Accept')?.includes('text/markdown') || url.pathname.endsWith('.md') + if (pathname.startsWith(GUIDES_PATH + '/')) { + const isMdSuffix = pathname.endsWith('.md') + const slug = pathname.replace(`${GUIDES_PATH}/`, '').replace(/\.md$/, '') + const decision = negotiateMarkdown( + { + acceptHeader: request.headers.get('accept') ?? '', + userAgent: request.headers.get('user-agent') ?? '', + }, + { hasMarkdownVariant: true, isMarkdownSuffix: isMdSuffix } + ) - // Serve pre-generated .md files before the [[...slug]] page route can intercept them - if (url.pathname.startsWith(GUIDES_PATH + '/') && requestsMarkdown) { - const slug = url.pathname.replace(`${GUIDES_PATH}/`, '').replace(/\.md$/, '') - const rewriteUrl = new URL(url) - rewriteUrl.pathname = `${BASE_PATH ?? ''}/api/guides-md/${slug}` - return NextResponse.rewrite(rewriteUrl) + if (decision === 'not-acceptable') { + return new NextResponse('Not Acceptable', { + status: 406, + headers: { 'Cache-Control': 'no-store', Vary: 'Accept' }, + }) + } + + if (decision === 'markdown') { + const rewriteUrl = new URL(url) + rewriteUrl.pathname = `${BASE_PATH ?? ''}/api/guides-md/${slug}` + return NextResponse.rewrite(rewriteUrl) + } } - if (!url.pathname.startsWith(REFERENCE_PATH)) { + if (!pathname.startsWith(REFERENCE_PATH)) { return NextResponse.next() } if (isbot(request.headers.get('user-agent'))) { - let [, lib, maybeVersion, ...slug] = url.pathname.replace(REFERENCE_PATH, '').split('/') + let [, lib, maybeVersion, ...slug] = pathname.replace(REFERENCE_PATH, '').split('/') if (clientSdkIds.includes(lib)) { const version = /v\d+/.test(maybeVersion) ? 
maybeVersion : undefined @@ -43,7 +57,7 @@ export function middleware(request: NextRequest) { } } - const [, lib, maybeVersion] = url.pathname.replace(REFERENCE_PATH, '').split('/') + const [, lib, maybeVersion] = pathname.replace(REFERENCE_PATH, '').split('/') if (lib === 'cli') { const rewritePath = [REFERENCE_PATH, 'cli'].join('/') diff --git a/apps/www/middleware.ts b/apps/www/middleware.ts index 095fa5f3a67..cef09208638 100644 --- a/apps/www/middleware.ts +++ b/apps/www/middleware.ts @@ -1,56 +1,9 @@ import { stampFirstReferrerCookie } from 'common/first-referrer-cookie' +import { negotiateMarkdown } from 'common/markdown-negotiation' import { NextResponse, type NextRequest } from 'next/server' import { MD_PAGES } from './app/api-v2/md/content.generated' -// Live-fetch agents only. Training crawlers (GPTBot, ClaudeBot, CCBot) are -// governed by robots.txt; serving them content that differs from the HTML -// page risks SEO and cloaking penalties. -const LLM_USER_AGENT = /\bClaude-User\b|\bClaude-Web\b|\bChatGPT-User\b|\bPerplexityBot\b/i - -// Media ranges (RFC 9110 §5.3.2) ordered most to least specific. -const RANGES = ['text/markdown', 'text/html', 'text/*', '*/*'] as const -type Range = (typeof RANGES)[number] - -const Q_PARAM = /^\s*q\s*=\s*([\d.]+)\s*$/i - -function isRange(s: string): s is Range { - return (RANGES as readonly string[]).includes(s) -} - -function parseQ(params: string[]): number { - for (const p of params) { - const q = parseFloat(p.match(Q_PARAM)?.[1] ?? '') - if (Number.isFinite(q) && q >= 0 && q <= 1) return q - } - return 1 -} - -// `markdownExplicit` lets the caller avoid flipping a bare `Accept: */*` to -// markdown — generic clients sending */* aren't expressing a preference. 
-function parseAccept(header: string) { - const seen = new Map() - - for (const entry of header.toLowerCase().split(',')) { - const [rawType, ...params] = entry.trim().split(';') - const range = rawType.trim() - if (!isRange(range)) continue - seen.set(range, Math.max(seen.get(range) ?? -1, parseQ(params))) - } - - return { - html: seen.get('text/html') ?? seen.get('text/*') ?? seen.get('*/*') ?? 0, - markdown: seen.get('text/markdown') ?? seen.get('text/*') ?? seen.get('*/*') ?? 0, - markdownExplicit: seen.has('text/markdown') || seen.has('text/*'), - } -} - -function shouldServeMarkdown(accept: ReturnType): boolean { - if (accept.markdown === 0) return false - if (accept.markdown > accept.html) return true - return accept.markdown === accept.html && accept.markdownExplicit -} - export function middleware(request: NextRequest) { const { pathname } = request.nextUrl @@ -61,37 +14,28 @@ export function middleware(request: NextRequest) { } } - const acceptHeader = request.headers.get('accept') ?? '' - // Cap UA length before regex test to bound CPU on the edge hot path. - const userAgent = (request.headers.get('user-agent') ?? '').slice(0, 512) - const isLlmAgent = LLM_USER_AGENT.test(userAgent) - const accept = acceptHeader ? parseAccept(acceptHeader) : null - // Strip trailing slash so /auth/ and /auth resolve to the same allowlist // entry — NextURL preserves trailing-slash style on rewrite targets. const slug = (pathname === '/' ? 'homepage' : pathname.slice(1)).replace(/\/$/, '') const isMdEligible = MD_PAGES.has(slug) const isChangelogEntry = slug === 'changelog' || /^changelog\/\d+/.test(slug) - const hasMdVariant = isMdEligible || isChangelogEntry - // 406 when Accept rejects every type we can produce. Skip for LLM UAs - // (always served markdown) and clients with no Accept (browser default). 
- if ( - hasMdVariant && - !isLlmAgent && - accept !== null && - accept.markdown === 0 && - accept.html === 0 - ) { + const decision = negotiateMarkdown( + { + acceptHeader: request.headers.get('accept') ?? '', + userAgent: request.headers.get('user-agent') ?? '', + }, + { hasMarkdownVariant: isMdEligible || isChangelogEntry } + ) + + if (decision === 'not-acceptable') { return new NextResponse('Not Acceptable', { status: 406, headers: { 'Cache-Control': 'no-store', Vary: 'Accept' }, }) } - const wantsMarkdown = isLlmAgent || (accept !== null && shouldServeMarkdown(accept)) - - if (wantsMarkdown) { + if (decision === 'markdown') { if (isMdEligible) { return NextResponse.rewrite(new URL(`/api-v2/md/${slug}`, request.nextUrl)) } diff --git a/packages/common/markdown-negotiation.test.ts b/packages/common/markdown-negotiation.test.ts new file mode 100644 index 00000000000..cc43407abb9 --- /dev/null +++ b/packages/common/markdown-negotiation.test.ts @@ -0,0 +1,167 @@ +import { describe, expect, it } from 'vitest' + +import { negotiateMarkdown } from './markdown-negotiation' + +const BROWSER_ACCEPT = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' + +describe('negotiateMarkdown', () => { + describe('hasMarkdownVariant gate', () => { + it('passes when the route has no markdown variant, regardless of other signals', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/markdown', userAgent: 'Claude-User/1.0' }, + { hasMarkdownVariant: false, isMarkdownSuffix: true } + ) + ).toBe('pass') + }) + }) + + describe('forced markdown', () => { + it('returns markdown for LLM user agents even when Accept rejects everything', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'application/x-content-negotiation-probe', userAgent: 'Claude-User/1.0' }, + { hasMarkdownVariant: true } + ) + ).toBe('markdown') + }) + + it('returns markdown for an explicit .md suffix even with an HTML-only Accept', () => { + expect( + negotiateMarkdown( + { 
acceptHeader: 'text/html', userAgent: '' }, + { hasMarkdownVariant: true, isMarkdownSuffix: true } + ) + ).toBe('markdown') + }) + + it.each([ + 'Claude-User (claude-code/2.1.119; +https://support.anthropic.com/)', + 'Claude-Web/1.0', + 'Mozilla/5.0 (compatible; ChatGPT-User/1.0)', + 'PerplexityBot/1.0', + ])('treats %s as an LLM agent', (userAgent) => { + expect(negotiateMarkdown({ acceptHeader: '', userAgent }, { hasMarkdownVariant: true })).toBe( + 'markdown' + ) + }) + + it.each([ + 'GPTBot/1.0', + 'ClaudeBot/1.0', + 'CCBot/2.0', + 'chatgpt-userscript/2.0', + 'NotPerplexityBot', + ])('does not treat %s (training crawler / substring embed) as an LLM agent', (userAgent) => { + expect(negotiateMarkdown({ acceptHeader: '', userAgent }, { hasMarkdownVariant: true })).toBe( + 'pass' + ) + }) + + it('caps user-agent length before matching', () => { + const padded = 'x'.repeat(600) + 'Claude-User' + expect( + negotiateMarkdown({ acceptHeader: '', userAgent: padded }, { hasMarkdownVariant: true }) + ).toBe('pass') + }) + }) + + describe('no Accept header', () => { + it('passes (serves HTML) when no Accept header is sent', () => { + expect( + negotiateMarkdown({ acceptHeader: '', userAgent: '' }, { hasMarkdownVariant: true }) + ).toBe('pass') + }) + }) + + describe('406 not-acceptable', () => { + it('returns not-acceptable when Accept excludes every type we serve', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'application/x-content-negotiation-probe', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('not-acceptable') + }) + + it('does not 406 for bare */*', () => { + expect( + negotiateMarkdown({ acceptHeader: '*/*', userAgent: '' }, { hasMarkdownVariant: true }) + ).toBe('pass') + }) + }) + + describe('q-value negotiation', () => { + it('serves HTML for browser-style Accept', () => { + expect( + negotiateMarkdown( + { acceptHeader: BROWSER_ACCEPT, userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('pass') + }) + + it('serves markdown 
when explicitly requested', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/markdown', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('markdown') + }) + + it('serves markdown when its q-value beats html', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/html;q=0.5, text/markdown;q=1.0', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('markdown') + }) + + it('serves HTML when its q-value beats markdown', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/html;q=1.0, text/markdown;q=0.5', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('pass') + }) + + it('breaks an explicit md/html tie toward markdown', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/markdown, text/html, */*', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('markdown') + }) + + it('does not serve markdown when the client rejects it (q=0)', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/markdown;q=0, text/html;q=1.0', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('pass') + }) + + it('tolerates OWS around the q parameter (RFC 9110)', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/html ; q = 1.0, text/markdown ; q = 0.5', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('pass') + }) + + it('ignores out-of-range q-values (falls back to 1.0; tie -> markdown)', () => { + expect( + negotiateMarkdown( + { acceptHeader: 'text/html;q=2.0, text/markdown;q=1.0', userAgent: '' }, + { hasMarkdownVariant: true } + ) + ).toBe('markdown') + }) + }) +}) diff --git a/packages/common/markdown-negotiation.ts b/packages/common/markdown-negotiation.ts new file mode 100644 index 00000000000..28f8416815a --- /dev/null +++ b/packages/common/markdown-negotiation.ts @@ -0,0 +1,86 @@ +// Live-fetch agents only. 
// Training crawlers (GPTBot, ClaudeBot, CCBot) are deliberately not matched:
// they are governed by robots.txt, and serving them content that differs from
// the HTML page risks SEO and cloaking penalties.
const LLM_USER_AGENT = /\bClaude-User\b|\bClaude-Web\b|\bChatGPT-User\b|\bPerplexityBot\b/i

// Media ranges this module understands (RFC 9110 §12.5.1), ordered most to
// least specific. Every other range in an Accept header is ignored.
const RANGES = ['text/markdown', 'text/html', 'text/*', '*/*'] as const
type Range = (typeof RANGES)[number]

// Matches a single `q=<value>` media-range parameter, tolerating optional
// whitespace around the name, the `=`, and the value.
const Q_PARAM = /^\s*q\s*=\s*([\d.]+)\s*$/i

// Cap UA length before the regex test to bound CPU on the edge hot path.
const MAX_UA_LENGTH = 512

/** Effective q-values for the two representations a route can produce. */
interface AcceptPreferences {
  /** q-value that applies to `text/html` (0 when no listed range covers it). */
  html: number
  /** q-value that applies to `text/markdown` (0 when no listed range covers it). */
  markdown: number
  /** True when the header named `text/markdown` or `text/*`, not just `*\/*`. */
  markdownExplicit: boolean
}

/** Type guard: is `s` one of the media ranges in `RANGES`? */
function isRange(s: string): s is Range {
  return (RANGES as readonly string[]).includes(s)
}

/**
 * Returns the first in-range (0..1) q-value found in a media range's
 * parameters. Missing, malformed, or out-of-range values are skipped, and the
 * default weight of 1 is returned when no usable `q` is present.
 */
function parseQ(params: readonly string[]): number {
  for (const p of params) {
    const q = parseFloat(p.match(Q_PARAM)?.[1] ?? '')
    if (Number.isFinite(q) && q >= 0 && q <= 1) return q
  }
  return 1
}

/**
 * Parses an Accept header into the q-values that apply to HTML and markdown.
 *
 * The most specific matching range wins (`text/html` over `text/*` over
 * `*\/*`); when a range is repeated, its highest q-value is kept.
 * `markdownExplicit` lets callers avoid flipping a bare `Accept: *\/*` to
 * markdown — generic clients sending `*\/*` aren't expressing a preference.
 */
function parseAccept(header: string): AcceptPreferences {
  const seen = new Map<Range, number>()

  for (const entry of header.toLowerCase().split(',')) {
    // Default keeps `rawType` a string under `noUncheckedIndexedAccess`.
    const [rawType = '', ...params] = entry.trim().split(';')
    const range = rawType.trim()
    if (!isRange(range)) continue
    seen.set(range, Math.max(seen.get(range) ?? -1, parseQ(params)))
  }

  return {
    html: seen.get('text/html') ?? seen.get('text/*') ?? seen.get('*/*') ?? 0,
    markdown: seen.get('text/markdown') ?? seen.get('text/*') ?? seen.get('*/*') ?? 0,
    markdownExplicit: seen.has('text/markdown') || seen.has('text/*'),
  }
}

/**
 * Markdown is served when it is acceptable (q > 0) and either strictly
 * preferred over HTML, or tied with HTML while having been named explicitly.
 */
function shouldServeMarkdown(accept: AcceptPreferences): boolean {
  if (accept.markdown === 0) return false
  if (accept.markdown > accept.html) return true
  return accept.markdown === accept.html && accept.markdownExplicit
}

export type MarkdownDecision = 'markdown' | 'not-acceptable' | 'pass'

/**
 * Content negotiation for routes that can serve either HTML or markdown.
 *
 * `hasMarkdownVariant` is false for paths with no markdown representation (they
 * never negotiate). `isMarkdownSuffix` forces markdown for an explicit `.md`
 * request; callers that handle `.md` upstream can leave it false.
 *
 * @returns `'markdown'` when the markdown variant should be served,
 *   `'not-acceptable'` when the Accept header rules out both HTML and markdown
 *   (the caller should answer 406), and `'pass'` when the caller should carry
 *   on with its default (HTML) handling.
 */
export function negotiateMarkdown(
  { acceptHeader, userAgent }: { acceptHeader: string; userAgent: string },
  {
    hasMarkdownVariant,
    isMarkdownSuffix = false,
  }: { hasMarkdownVariant: boolean; isMarkdownSuffix?: boolean }
): MarkdownDecision {
  if (!hasMarkdownVariant) return 'pass'

  // LLM agents and an explicit `.md` request always get markdown.
  if (LLM_USER_AGENT.test(userAgent.slice(0, MAX_UA_LENGTH)) || isMarkdownSuffix) {
    return 'markdown'
  }

  // No Accept header = browser/default client: serve HTML, never 406.
  if (!acceptHeader) return 'pass'

  const accept = parseAccept(acceptHeader)

  // 406 when Accept rejects every type this route can produce. Only reached for
  // non-LLM, non-`.md` clients that sent an Accept header (guards above), so a
  // deliberate `Accept: application/json` gets a clean 406 instead of HTML.
  if (accept.markdown === 0 && accept.html === 0) return 'not-acceptable'

  return shouldServeMarkdown(accept) ? 'markdown' : 'pass'
}