From 0c1ab7e2e50df5b433e27e657b4af0192d2edb41 Mon Sep 17 00:00:00 2001 From: Charis <26616127+charislam@users.noreply.github.com> Date: Tue, 4 Jun 2024 14:00:24 -0400 Subject: [PATCH] refactor: factor out markdown processing utils (#26729) --- apps/docs/scripts/helpers.mdx.ts | 212 ++++++++++++++++++ .../scripts/search/generate-embeddings.ts | 2 +- apps/docs/scripts/search/sources/base.ts | 11 +- apps/docs/scripts/search/sources/markdown.ts | 207 +---------------- .../search/sources/partner-integrations.ts | 4 +- .../scripts/search/sources/reference-doc.ts | 3 +- 6 files changed, 221 insertions(+), 218 deletions(-) create mode 100644 apps/docs/scripts/helpers.mdx.ts diff --git a/apps/docs/scripts/helpers.mdx.ts b/apps/docs/scripts/helpers.mdx.ts new file mode 100644 index 00000000000..05a6d844b32 --- /dev/null +++ b/apps/docs/scripts/helpers.mdx.ts @@ -0,0 +1,212 @@ +import { createHash } from 'crypto' +import { ObjectExpression } from 'estree' +import GithubSlugger from 'github-slugger' +import matter from 'gray-matter' +import { type Content, type Root } from 'mdast' +import { fromMarkdown } from 'mdast-util-from-markdown' +import { toMarkdown } from 'mdast-util-to-markdown' +import { mdxFromMarkdown, type MdxjsEsm } from 'mdast-util-mdx' +import { toString } from 'mdast-util-to-string' +import { mdxjs } from 'micromark-extension-mdxjs' +import { u } from 'unist-builder' +import { filter } from 'unist-util-filter' + +type Json = Record + +type Section = { + content: string + heading?: string + slug?: string +} + +export type ProcessedMdx = { + checksum: string + meta: Json + sections: Section[] +} + +/** + * Process MDX content. + * + * Splits MDX content into sections based on headings, and calculates checksum. + */ +function processMdx(content: string, options?: { yaml?: boolean }): ProcessedMdx { + const checksum = createHash('sha256').update(content).digest('base64') + + let frontmatter: Record = {} + if (options?.yaml) { + const parsed = matter(content) + frontmatter = parsed.data + content = parsed.content + } + + const mdxTree = fromMarkdown(content, { + extensions: [mdxjs()], + mdastExtensions: [mdxFromMarkdown()], + }) + + let meta: Record + if (options?.yaml) { + meta = frontmatter + } else { + meta = extractMetaExport(mdxTree) + } + + const serializableMeta: Json = meta && JSON.parse(JSON.stringify(meta)) + + // Remove all MDX elements from markdown + const mdTree = filter( + mdxTree, + (node) => + ![ + 'mdxjsEsm', + 'mdxJsxFlowElement', + 'mdxJsxTextElement', + 'mdxFlowExpression', + 'mdxTextExpression', + ].includes(node.type) + ) + + if (!mdTree) { + return { + checksum, + meta: serializableMeta, + sections: [], + } + } + + const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading') + + const slugger = new GithubSlugger() + + const sections = sectionTrees.map((tree) => { + const [firstNode] = tree.children + const content = toMarkdown(tree) + + const rawHeading: string = firstNode.type === 'heading' ? toString(firstNode) : undefined + + if (!rawHeading) { + return { content } + } + + const { heading, customAnchor } = parseHeading(rawHeading) + + const slug = slugger.slug(customAnchor ?? heading) + + return { + content, + heading, + slug, + } + }) + + return { + checksum, + meta: serializableMeta, + sections, + } +} + +/** + * Extracts the `meta` ESM export from the MDX file. + * + * This info is akin to frontmatter. + */ +function extractMetaExport(mdxTree: Root) { + const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => { + return ( + node.type === 'mdxjsEsm' && + node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' && + node.data.estree.body[0].declaration?.type === 'VariableDeclaration' && + node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' && + node.data.estree.body[0].declaration.declarations[0].id.name === 'meta' + ) + }) + + if (!metaExportNode) { + return undefined + } + + const objectExpression = + (metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' && + metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' && + metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' && + metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' && + metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type === + 'ObjectExpression' && + metaExportNode.data.estree.body[0].declaration.declarations[0].init) || + undefined + + if (!objectExpression) { + return undefined + } + + return getObjectFromExpression(objectExpression) +} + +/** + * Extracts ES literals from an `estree` `ObjectExpression` + * into a plain JavaScript object. + */ +function getObjectFromExpression(node: ObjectExpression) { + return node.properties.reduce< + Record + >((object, property) => { + if (property.type !== 'Property') { + return object + } + + const key = (property.key.type === 'Identifier' && property.key.name) || undefined + const value = (property.value.type === 'Literal' && property.value.value) || undefined + + if (!key) { + return object + } + + return { + ...object, + [key]: value, + } + }, {}) +} + +/** + * Splits a `mdast` tree into multiple trees based on + * a predicate function. Will include the splitting node + * at the beginning of each tree. + * + * Useful to split a markdown file into smaller sections. + */ +function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) { + return tree.children.reduce((trees, node) => { + const [lastTree] = trees.slice(-1) + + if (!lastTree || predicate(node)) { + const tree: Root = u('root', [node]) + return trees.concat(tree) + } + + lastTree.children.push(node) + return trees + }, []) +} + +/** + * Parses a markdown heading which can optionally + * contain a custom anchor in the format: + * + * ```markdown + * ### My Heading [#my-custom-anchor] + * ``` + */ +function parseHeading(heading: string): { heading: string; customAnchor?: string } { + const match = heading.match(/(.*) *\[#(.*)\]/) + if (match) { + const [, heading, customAnchor] = match + return { heading, customAnchor } + } + return { heading } +} + +export { processMdx } +export type { Json, Section } diff --git a/apps/docs/scripts/search/generate-embeddings.ts b/apps/docs/scripts/search/generate-embeddings.ts index cfac32cae35..ad464657083 100644 --- a/apps/docs/scripts/search/generate-embeddings.ts +++ b/apps/docs/scripts/search/generate-embeddings.ts @@ -3,8 +3,8 @@ import dotenv from 'dotenv' import { parseArgs } from 'node:util' import { OpenAI } from 'openai' import { v4 as uuidv4 } from 'uuid' +import type { Json, Section } from '../helpers.mdx' import { fetchSources } from './sources' -import { Json, Section } from './sources/base' dotenv.config() diff --git a/apps/docs/scripts/search/sources/base.ts b/apps/docs/scripts/search/sources/base.ts index 931a57290f7..7416f8bdb67 100644 --- a/apps/docs/scripts/search/sources/base.ts +++ b/apps/docs/scripts/search/sources/base.ts @@ -1,13 +1,4 @@ -export type Json = Record< - string, - string | number | boolean | null | Json[] | { [key: string]: Json } -> - -export type Section = { - content: string - heading?: string - slug?: string -} +import type { Json, Section } from '../../helpers.mdx' export abstract class BaseLoader { type: string diff --git a/apps/docs/scripts/search/sources/markdown.ts b/apps/docs/scripts/search/sources/markdown.ts index 5682c094ffb..d9417d223b6 100644 --- a/apps/docs/scripts/search/sources/markdown.ts +++ b/apps/docs/scripts/search/sources/markdown.ts @@ -1,207 +1,6 @@ -import { createHash } from 'crypto' -import { ObjectExpression } from 'estree' import { readFile } from 'fs/promises' -import GithubSlugger from 'github-slugger' -import matter from 'gray-matter' -import { Content, Root } from 'mdast' -import { fromMarkdown } from 'mdast-util-from-markdown' -import { MdxjsEsm, mdxFromMarkdown } from 'mdast-util-mdx' -import { toMarkdown } from 'mdast-util-to-markdown' -import { toString } from 'mdast-util-to-string' -import { mdxjs } from 'micromark-extension-mdxjs' -import { u } from 'unist-builder' -import { filter } from 'unist-util-filter' -import { BaseLoader, BaseSource, Json, Section } from './base' - -/** - * Extracts ES literals from an `estree` `ObjectExpression` - * into a plain JavaScript object. - */ -export function getObjectFromExpression(node: ObjectExpression) { - return node.properties.reduce< - Record - >((object, property) => { - if (property.type !== 'Property') { - return object - } - - const key = (property.key.type === 'Identifier' && property.key.name) || undefined - const value = (property.value.type === 'Literal' && property.value.value) || undefined - - if (!key) { - return object - } - - return { - ...object, - [key]: value, - } - }, {}) -} - -/** - * Extracts the `meta` ESM export from the MDX file. - * - * This info is akin to frontmatter. - */ -export function extractMetaExport(mdxTree: Root) { - const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => { - return ( - node.type === 'mdxjsEsm' && - node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' && - node.data.estree.body[0].declaration?.type === 'VariableDeclaration' && - node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' && - node.data.estree.body[0].declaration.declarations[0].id.name === 'meta' - ) - }) - - if (!metaExportNode) { - return undefined - } - - const objectExpression = - (metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' && - metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' && - metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' && - metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' && - metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type === - 'ObjectExpression' && - metaExportNode.data.estree.body[0].declaration.declarations[0].init) || - undefined - - if (!objectExpression) { - return undefined - } - - return getObjectFromExpression(objectExpression) -} - -/** - * Splits a `mdast` tree into multiple trees based on - * a predicate function. Will include the splitting node - * at the beginning of each tree. - * - * Useful to split a markdown file into smaller sections. - */ -export function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) { - return tree.children.reduce((trees, node) => { - const [lastTree] = trees.slice(-1) - - if (!lastTree || predicate(node)) { - const tree: Root = u('root', [node]) - return trees.concat(tree) - } - - lastTree.children.push(node) - return trees - }, []) -} - -/** - * Parses a markdown heading which can optionally - * contain a custom anchor in the format: - * - * ```markdown - * ### My Heading [#my-custom-anchor] - * ``` - */ -export function parseHeading(heading: string): { heading: string; customAnchor?: string } { - const match = heading.match(/(.*) *\[#(.*)\]/) - if (match) { - const [, heading, customAnchor] = match - return { heading, customAnchor } - } - return { heading } -} - -/** - * Processes MDX content for search indexing. - * It extracts metadata, strips it of all JSX, - * and splits it into sub-sections based on criteria. - */ -export function processMdxForSearch(_content: string, options?: { yaml?: boolean }): ProcessedMdx { - const checksum = createHash('sha256').update(_content).digest('base64') - - let frontmatter: Record = {} - let content = _content - if (options?.yaml) { - const parsed = matter(_content) - frontmatter = parsed.data - content = parsed.content - } - - const mdxTree = fromMarkdown(content, { - extensions: [mdxjs()], - mdastExtensions: [mdxFromMarkdown()], - }) - - let meta: Record - if (options?.yaml) { - meta = frontmatter - } else { - meta = extractMetaExport(mdxTree) - } - - const serializableMeta: Json = meta && JSON.parse(JSON.stringify(meta)) - - // Remove all MDX elements from markdown - const mdTree = filter( - mdxTree, - (node) => - ![ - 'mdxjsEsm', - 'mdxJsxFlowElement', - 'mdxJsxTextElement', - 'mdxFlowExpression', - 'mdxTextExpression', - ].includes(node.type) - ) - - if (!mdTree) { - return { - checksum, - meta: serializableMeta, - sections: [], - } - } - - const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading') - - const slugger = new GithubSlugger() - - const sections = sectionTrees.map((tree) => { - const [firstNode] = tree.children - const content = toMarkdown(tree) - - const rawHeading: string = firstNode.type === 'heading' ? toString(firstNode) : undefined - - if (!rawHeading) { - return { content } - } - - const { heading, customAnchor } = parseHeading(rawHeading) - - const slug = slugger.slug(customAnchor ?? heading) - - return { - content, - heading, - slug, - } - }) - - return { - checksum, - meta: serializableMeta, - sections, - } -} - -export type ProcessedMdx = { - checksum: string - meta: Json - sections: Section[] -} +import { processMdx } from '../../helpers.mdx' +import { BaseLoader, BaseSource } from './base' export class MarkdownLoader extends BaseLoader { type = 'markdown' as const @@ -234,7 +33,7 @@ export class MarkdownSource extends BaseSource { } process() { - const { checksum, meta, sections } = processMdxForSearch(this.contents, this.options) + const { checksum, meta, sections } = processMdx(this.contents, this.options) this.checksum = checksum this.meta = meta diff --git a/apps/docs/scripts/search/sources/partner-integrations.ts b/apps/docs/scripts/search/sources/partner-integrations.ts index e675a1149a6..6f89dff783c 100644 --- a/apps/docs/scripts/search/sources/partner-integrations.ts +++ b/apps/docs/scripts/search/sources/partner-integrations.ts @@ -2,7 +2,7 @@ import { type SupabaseClient, createClient } from '@supabase/supabase-js' import { upperFirst } from 'lodash' import { BaseLoader, BaseSource } from './base' -import { processMdxForSearch } from './markdown' +import { processMdx } from '../../helpers.mdx' type PartnerData = { slug: string // The partner slug corresponding to the last part of the URL @@ -59,7 +59,7 @@ export class IntegrationSource extends BaseSource { } process() { - const { checksum, sections } = processMdxForSearch(this.partnerData.overview) + const { checksum, sections } = processMdx(this.partnerData.overview) const meta = { title: upperFirst(this.partnerData.slug), subtitle: 'Integration', diff --git a/apps/docs/scripts/search/sources/reference-doc.ts b/apps/docs/scripts/search/sources/reference-doc.ts index 5645c0cd480..8b50e898a30 100644 --- a/apps/docs/scripts/search/sources/reference-doc.ts +++ b/apps/docs/scripts/search/sources/reference-doc.ts @@ -11,7 +11,8 @@ import { import { CliCommand, CliSpec } from '../../../generator/types/CliSpec' import { flattenSections } from '../../../lib/helpers' import { enrichedOperation, gen_v3 } from '../../../lib/refGenerator/helpers' -import { BaseLoader, BaseSource, Json } from './base' +import type { Json } from '../../helpers.mdx' +import { BaseLoader, BaseSource } from './base' export abstract class ReferenceLoader extends BaseLoader { type = 'reference' as const