Files
supabase/apps/docs/scripts/search/sources/markdown.ts
Charis e20038e2f2 refactor: move auth guides into content directory (#20777)
Begin the process of moving our MDX files into their own content directory.

Fixed a few minor bugs re: ToC and tabs not rerendering consistently on page navigation. (The ToC thing wasn't a problem before the refactor, the tabs thing is a problem on prod.)

Moved MDX files can't import their own components, so everything they require needs to be back in the component prop for mdx-remote's serializer. Cleaned this up a bit and lazy-loaded heavy/rare stuff. Also, the component prop doesn't take arbitrary objects (only actual components), so imported data has to be wrapped in a component.
2024-02-07 17:25:08 +00:00

251 lines
6.5 KiB
TypeScript

import { createHash } from 'crypto'
import { ObjectExpression } from 'estree'
import { readFile } from 'fs/promises'
import GithubSlugger from 'github-slugger'
import matter from 'gray-matter'
import { Content, Root } from 'mdast'
import { fromMarkdown } from 'mdast-util-from-markdown'
import { MdxjsEsm, mdxFromMarkdown } from 'mdast-util-mdx'
import { toMarkdown } from 'mdast-util-to-markdown'
import { toString } from 'mdast-util-to-string'
import { mdxjs } from 'micromark-extension-mdxjs'
import { u } from 'unist-builder'
import { filter } from 'unist-util-filter'
import { BaseLoader, BaseSource, Json, Section } from './base'
/**
* Extracts ES literals from an `estree` `ObjectExpression`
* into a plain JavaScript object.
*/
export function getObjectFromExpression(node: ObjectExpression) {
return node.properties.reduce<
Record<string, string | number | bigint | true | RegExp | undefined>
>((object, property) => {
if (property.type !== 'Property') {
return object
}
const key = (property.key.type === 'Identifier' && property.key.name) || undefined
const value = (property.value.type === 'Literal' && property.value.value) || undefined
if (!key) {
return object
}
return {
...object,
[key]: value,
}
}, {})
}
/**
* Extracts the `meta` ESM export from the MDX file.
*
* This info is akin to frontmatter.
*/
export function extractMetaExport(mdxTree: Root) {
const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => {
return (
node.type === 'mdxjsEsm' &&
node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
node.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
node.data.estree.body[0].declaration.declarations[0].id.name === 'meta'
)
})
if (!metaExportNode) {
return undefined
}
const objectExpression =
(metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type ===
'ObjectExpression' &&
metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
undefined
if (!objectExpression) {
return undefined
}
return getObjectFromExpression(objectExpression)
}
/**
* Splits a `mdast` tree into multiple trees based on
* a predicate function. Will include the splitting node
* at the beginning of each tree.
*
* Useful to split a markdown file into smaller sections.
*/
export function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) {
return tree.children.reduce<Root[]>((trees, node) => {
const [lastTree] = trees.slice(-1)
if (!lastTree || predicate(node)) {
const tree: Root = u('root', [node])
return trees.concat(tree)
}
lastTree.children.push(node)
return trees
}, [])
}
/**
* Parses a markdown heading which can optionally
* contain a custom anchor in the format:
*
* ```markdown
* ### My Heading [#my-custom-anchor]
* ```
*/
export function parseHeading(heading: string): { heading: string; customAnchor?: string } {
const match = heading.match(/(.*) *\[#(.*)\]/)
if (match) {
const [, heading, customAnchor] = match
return { heading, customAnchor }
}
return { heading }
}
/**
* Processes MDX content for search indexing.
* It extracts metadata, strips it of all JSX,
* and splits it into sub-sections based on criteria.
*/
export function processMdxForSearch(_content: string, options?: { yaml?: boolean }): ProcessedMdx {
const checksum = createHash('sha256').update(_content).digest('base64')
let frontmatter: Record<string, unknown> = {}
let content = _content
if (options?.yaml) {
const parsed = matter(_content)
frontmatter = parsed.data
content = parsed.content
}
const mdxTree = fromMarkdown(content, {
extensions: [mdxjs()],
mdastExtensions: [mdxFromMarkdown()],
})
let meta: Record<string, unknown>
if (options?.yaml) {
meta = frontmatter
} else {
meta = extractMetaExport(mdxTree)
}
const serializableMeta: Json = meta && JSON.parse(JSON.stringify(meta))
// Remove all MDX elements from markdown
const mdTree = filter(
mdxTree,
(node) =>
![
'mdxjsEsm',
'mdxJsxFlowElement',
'mdxJsxTextElement',
'mdxFlowExpression',
'mdxTextExpression',
].includes(node.type)
)
if (!mdTree) {
return {
checksum,
meta: serializableMeta,
sections: [],
}
}
const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading')
const slugger = new GithubSlugger()
const sections = sectionTrees.map((tree) => {
const [firstNode] = tree.children
const content = toMarkdown(tree)
const rawHeading: string = firstNode.type === 'heading' ? toString(firstNode) : undefined
if (!rawHeading) {
return { content }
}
const { heading, customAnchor } = parseHeading(rawHeading)
const slug = slugger.slug(customAnchor ?? heading)
return {
content,
heading,
slug,
}
})
return {
checksum,
meta: serializableMeta,
sections,
}
}
export type ProcessedMdx = {
checksum: string
meta: Json
sections: Section[]
}
export class MarkdownLoader extends BaseLoader {
type = 'markdown' as const
constructor(
source: string,
public filePath: string,
public options?: { yaml?: boolean }
) {
const path = filePath.replace(/^(pages|content)/, '').replace(/\.mdx?$/, '')
super(source, path)
}
async load() {
const contents = await readFile(this.filePath, 'utf8')
return [new MarkdownSource(this.source, this.path, contents, this.options)]
}
}
export class MarkdownSource extends BaseSource {
type = 'markdown' as const
constructor(
source: string,
path: string,
public contents: string,
public options?: { yaml?: boolean }
) {
super(source, path)
}
process() {
const { checksum, meta, sections } = processMdxForSearch(this.contents, this.options)
this.checksum = checksum
this.meta = meta
this.sections = sections
return { checksum, meta, sections }
}
extractIndexedContent(): string {
const sections = this.sections ?? []
return sections.map(({ content }) => content).join('\n\n')
}
}