Files
supabase/apps/www/internals/generate-sitemap.mjs
Pamela Chia 95d1e8abe8 fix(www): drop malformed legal URLs from sitemap_www.xml (#45775)
## Summary
Google Search Console flagged 4 "URL not allowed" errors on
`sitemap_www.xml` — malformed URLs like
`https://supabase.comdata/legal/terms/v1` (missing slash, non-existent
path). The generator was globbing `data/**/*.mdx`, picking up the 4
content-source MDX files under `data/legal/` that are imported by
`pages/terms.tsx` and `pages/enterprise-terms.tsx` but are not
themselves routed. With no path replacement mapping `data/...` to a
route and no leading slash, the URL template concatenated to garbage.
The real `/terms` and `/enterprise-terms` URLs come from the
`pages/*.tsx` glob and are unaffected.

## Changes
- Remove `data/**/*.mdx` glob (and its companion `!data/*.mdx` exclude)
from the sitemap generator. `apps/www/data/` has no routed MDX, only
content sources imported into pages.
- Anchor the `pages` prefix replace: `.replace('pages', '')` →
`.replace(/^pages/, '')`. String-form replace is first-occurrence and
would mangle any future filename containing `pages` as a non-prefix
substring (e.g., `_blog/about-pages.mdx` → `/blog/about-`). No current
files trigger this; defensive hardening.

## Testing
Regenerated the sitemap locally and verified:
- [x] `grep -c "supabase.comdata" public/sitemap_www.xml` → `0` (was 4)
- [x] `<loc>https://supabase.com/terms</loc>` and
`<loc>https://supabase.com/enterprise-terms</loc>` still present
- [x] Every `<loc>` matches
`^<loc>https://supabase\.com(/[a-zA-Z0-9].*)?</loc>$` (no malformed URLs
of any kind)
- [x] Total loc count stable across both commits (regression-free for
the anchor change)

Local count is lower than prod (527 vs 906) because
`.next/server/pages/**` partner/expert/feature HTML globs only resolve
after a full build — runs correctly via `postbuild` on Vercel.

After deploy lands, resubmit `sitemap_www.xml` in Google Search Console
to force a re-crawl (otherwise daily-ish). Expect status to flip from "4
errors" to "Success" and Discovered pages: 906 → 902.

## Linear
- fixes GROWTH-837

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

## Release Notes

* **Chores**
  * Improved sitemap generation to properly index specific content
    sections (blog, case studies, customers, events, and alternatives) with
    refined route path processing for better search engine discoverability.

[![Review Change
Stack](https://storage.googleapis.com/coderabbit_public_assets/review-stack-in-coderabbit-ui.svg)](https://app.coderabbit.ai/change-stack/supabase/supabase/pull/45775)

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-05-11 15:14:36 +08:00

176 lines
4.9 KiB
JavaScript

import { readFileSync, writeFileSync } from 'fs'
import { globby } from 'globby'
import prettier from 'prettier'
/**
 * Number of leading filename characters dropped from dated content slugs.
 * Presumably the length of a `YYYY-MM-DD-` prefix; the strip is fixed-length,
 * not pattern-based, so it applies whether or not a date is actually present.
 */
const DATE_PREFIX_LENGTH = 11

/**
 * Content directories whose MDX files are published under a route prefix.
 *
 * - `dir`: source directory as it appears at the start of a globbed path.
 * - `prefix`: route prefix that replaces the directory name.
 * - `hasDatePrefix`: drop the first `DATE_PREFIX_LENGTH` characters of the slug.
 * - `skipDoubleUnderscore`: leave out files whose name contains `__`.
 */
const CONTENT_SOURCES = [
  { dir: '_blog', prefix: '/blog', hasDatePrefix: true, skipDoubleUnderscore: false },
  { dir: '_case-studies', prefix: '/case-studies', hasDatePrefix: false, skipDoubleUnderscore: false },
  { dir: '_customers', prefix: '/customers', hasDatePrefix: false, skipDoubleUnderscore: false },
  { dir: '_events', prefix: '/events', hasDatePrefix: true, skipDoubleUnderscore: true },
  { dir: '_alternatives', prefix: '/alternatives', hasDatePrefix: false, skipDoubleUnderscore: false },
]

/** Non-dynamic routes that are deliberately kept out of the sitemap. */
const EXCLUDED_ROUTES = new Set(['/launch-week/ticket-image'])

/**
 * Convert a globbed file path into the site route it is served at.
 *
 * Every replacement is anchored to the start or end of the path so a filename
 * that merely contains `pages`, `_blog`, `.tsx`, `index`, ... is not mangled.
 *
 * @param {string} page - Path as returned by globby, e.g. `pages/pricing.tsx`.
 * @returns {string | null} The route (`''` for the home page), or `null` when
 *   the file must not appear in the sitemap.
 */
function pageToRoute(page) {
  const source = CONTENT_SOURCES.find(({ dir }) => page.startsWith(`${dir}/`))
  const mapped = source ? `${source.prefix}${page.slice(source.dir.length)}` : page

  const path = mapped
    .replace(/^\.next\/server\/pages/, '')
    .replace(/^pages/, '')
    // `.js` is included because the glob list contains `pages/*.js`
    .replace(/\.(?:html|tsx|mdx|js)$/, '')
    // replace /{directory}/index with /{directory}; `$` keeps names such as
    // `index-foo` intact
    .replace(/\/([^/]+)\/index$/, '/$1')

  const route = path === '/index' ? '' : path

  // A `[param]` segment is a dynamic route template, never a real URL.
  if (route.includes('[') || EXCLUDED_ROUTES.has(route)) return null

  // Only files that came from a content directory get slug post-processing;
  // routed files under e.g. `pages/blog/` keep their path untouched.
  if (!source || !route.startsWith(`${source.prefix}/`)) return route

  const slug = route.slice(source.prefix.length + 1)
  if (source.skipDoubleUnderscore && slug.includes('__')) return null

  return source.hasDatePrefix ? `${source.prefix}/${slug.slice(DATE_PREFIX_LENGTH)}` : route
}

/**
 * Render one `<url>` sitemap entry.
 *
 * @param {string} loc - Absolute URL placed inside `<loc>`.
 * @returns {string} XML fragment, surrounded by newlines.
 */
function renderUrlEntry(loc) {
  return [
    '',
    '<url>',
    `<loc>${loc}</loc>`,
    '<changefreq>weekly</changefreq>',
    '<priority>0.5</priority>',
    '</url>',
    '',
  ].join('\n')
}

/**
 * Collect the unique changelog detail URLs found in `<link>` elements of an
 * RSS document. Only links of the form `https://supabase.com/changelog/<digits>...`
 * are returned, in first-seen order.
 *
 * @param {string} rss - Raw RSS XML text.
 * @returns {string[]} De-duplicated URLs.
 */
function extractChangelogUrls(rss) {
  const linkPattern = /<link>(https:\/\/supabase\.com\/changelog\/\d+[^<]*)<\/link>/g
  return [...new Set(Array.from(rss.matchAll(linkPattern), (match) => match[1]))]
}

/**
 * Build `public/sitemap_www.xml` (all www URLs) and `public/sitemap.xml`
 * (the index pointing at the www and docs sitemaps).
 *
 * @returns {Promise<void>} Resolves once both files have been written.
 */
async function generate() {
  const prettierConfig = await prettier.resolveConfig('./.prettierrc.js')

  const pages = await globby([
    'pages/*.js',
    'pages/*.tsx',
    'pages/*.mdx',
    'pages/**/*.tsx',
    '_blog/*.mdx',
    '_case-studies/*.mdx',
    '_customers/*.mdx',
    '_events/*.mdx',
    '_alternatives/*.mdx',
    '!pages/_*.js',
    '!pages/_*.tsx',
    '!pages/api',
    '!pages/404.tsx',
    '.next/server/pages/partners/integrations/*.html',
    '.next/server/pages/partners/experts/*.html',
    '.next/server/pages/features/*.html',
  ])
  pages.sort((a, b) => a.localeCompare(b))

  // Generate URLs for static pages. The home page route is the empty string,
  // so filter on `null` rather than on truthiness.
  const staticUrls = pages
    .map((page) => pageToRoute(page))
    .filter((route) => route !== null)
    .map((route) => renderUrlEntry(`https://supabase.com${route}`))

  // Changelog detail pages are dynamic routes; include them from generated changelog RSS links.
  // Best effort: if the RSS file cannot be read the sitemap is still produced without them.
  let changelogDetailUrls = []
  try {
    const rss = readFileSync('public/changelog-rss.xml', 'utf-8')
    changelogDetailUrls = extractChangelogUrls(rss).map((url) => renderUrlEntry(url))
  } catch (error) {
    console.warn(`[sitemap] skipping changelog detail URLs: ${error.message}`)
  }

  const sitemap = `
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
${[...staticUrls, ...changelogDetailUrls].join('')}
</urlset>
`

  const formatted = await prettier.format(sitemap, {
    ...prettierConfig,
    parser: 'html',
  })

  /**
   * generate sitemap router
   *
   * this points to www and docs sitemaps
   */
  const sitemapRouter = `<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
<sitemap>
<loc>https://supabase.com/sitemap_www.xml</loc>
</sitemap>
<sitemap>
<loc>https://supabase.com/docs/sitemap.xml</loc>
</sitemap>
</sitemapindex>
`

  /**
   * write sitemaps
   */
  writeFileSync('public/sitemap.xml', sitemapRouter)
  writeFileSync('public/sitemap_www.xml', formatted)
}
// Run the generator when this module is executed. The promise is handled
// explicitly so a failure is logged and reported through a non-zero exit code
// instead of being left as a floating rejection.
generate().catch((error) => {
  console.error(error)
  process.exitCode = 1
})