mirror of
https://github.com/supabase/supabase.git
synced 2026-06-20 20:16:04 +08:00
436 lines
12 KiB
TypeScript
436 lines
12 KiB
TypeScript
import { createClient } from '@supabase/supabase-js'
|
|
import { createHash } from 'crypto'
|
|
import dotenv from 'dotenv'
|
|
import { ObjectExpression } from 'estree'
|
|
import { readdir, readFile, stat } from 'fs/promises'
|
|
import GithubSlugger from 'github-slugger'
|
|
import { Content, Root } from 'mdast'
|
|
import { fromMarkdown } from 'mdast-util-from-markdown'
|
|
import { mdxFromMarkdown, MdxjsEsm } from 'mdast-util-mdx'
|
|
import { toMarkdown } from 'mdast-util-to-markdown'
|
|
import { toString } from 'mdast-util-to-string'
|
|
import { mdxjs } from 'micromark-extension-mdxjs'
|
|
import 'openai'
|
|
import { Configuration, OpenAIApi } from 'openai'
|
|
import { basename, dirname, join } from 'path'
|
|
import { u } from 'unist-builder'
|
|
import { filter } from 'unist-util-filter'
|
|
import { inspect } from 'util'
|
|
|
|
dotenv.config()
|
|
|
|
const ignoredFiles = ['pages/404.mdx']
|
|
|
|
/**
|
|
* Extracts ES literals from an `estree` `ObjectExpression`
|
|
* into a plain JavaScript object.
|
|
*/
|
|
function getObjectFromExpression(node: ObjectExpression) {
|
|
return node.properties.reduce<
|
|
Record<string, string | number | bigint | true | RegExp | undefined>
|
|
>((object, property) => {
|
|
if (property.type !== 'Property') {
|
|
return object
|
|
}
|
|
|
|
const key = (property.key.type === 'Identifier' && property.key.name) || undefined
|
|
const value = (property.value.type === 'Literal' && property.value.value) || undefined
|
|
|
|
if (!key) {
|
|
return object
|
|
}
|
|
|
|
return {
|
|
...object,
|
|
[key]: value,
|
|
}
|
|
}, {})
|
|
}
|
|
|
|
/**
|
|
* Extracts the `meta` ESM export from the MDX file.
|
|
*
|
|
* This info is akin to frontmatter.
|
|
*/
|
|
function extractMetaExport(mdxTree: Root) {
|
|
const metaExportNode = mdxTree.children.find((node): node is MdxjsEsm => {
|
|
return (
|
|
node.type === 'mdxjsEsm' &&
|
|
node.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
|
|
node.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
|
|
node.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
|
|
node.data.estree.body[0].declaration.declarations[0].id.name === 'meta'
|
|
)
|
|
})
|
|
|
|
if (!metaExportNode) {
|
|
return undefined
|
|
}
|
|
|
|
const objectExpression =
|
|
(metaExportNode.data?.estree?.body[0]?.type === 'ExportNamedDeclaration' &&
|
|
metaExportNode.data.estree.body[0].declaration?.type === 'VariableDeclaration' &&
|
|
metaExportNode.data.estree.body[0].declaration.declarations[0]?.id.type === 'Identifier' &&
|
|
metaExportNode.data.estree.body[0].declaration.declarations[0].id.name === 'meta' &&
|
|
metaExportNode.data.estree.body[0].declaration.declarations[0].init?.type ===
|
|
'ObjectExpression' &&
|
|
metaExportNode.data.estree.body[0].declaration.declarations[0].init) ||
|
|
undefined
|
|
|
|
if (!objectExpression) {
|
|
return undefined
|
|
}
|
|
|
|
return getObjectFromExpression(objectExpression)
|
|
}
|
|
|
|
/**
|
|
* Splits a `mdast` tree into multiple trees based on
|
|
* a predicate function. Will include the splitting node
|
|
* at the beginning of each tree.
|
|
*
|
|
* Useful to split a markdown file into smaller sections.
|
|
*/
|
|
function splitTreeBy(tree: Root, predicate: (node: Content) => boolean) {
|
|
return tree.children.reduce<Root[]>((trees, node) => {
|
|
const [lastTree] = trees.slice(-1)
|
|
|
|
if (!lastTree || predicate(node)) {
|
|
const tree: Root = u('root', [node])
|
|
return trees.concat(tree)
|
|
}
|
|
|
|
lastTree.children.push(node)
|
|
return trees
|
|
}, [])
|
|
}
|
|
|
|
type Meta = ReturnType<typeof extractMetaExport>
|
|
|
|
type Section = {
|
|
content: string
|
|
heading?: string
|
|
slug?: string
|
|
}
|
|
|
|
type ProcessedMdx = {
|
|
checksum: string
|
|
meta: Meta
|
|
sections: Section[]
|
|
}
|
|
|
|
/**
|
|
* Processes MDX content for search indexing.
|
|
* It extracts metadata, strips it of all JSX,
|
|
* and splits it into sub-sections based on criteria.
|
|
*/
|
|
function processMdxForSearch(content: string): ProcessedMdx {
|
|
const checksum = createHash('sha256').update(content).digest('base64')
|
|
|
|
const mdxTree = fromMarkdown(content, {
|
|
extensions: [mdxjs()],
|
|
mdastExtensions: [mdxFromMarkdown()],
|
|
})
|
|
|
|
const meta = extractMetaExport(mdxTree)
|
|
|
|
// Remove all MDX elements from markdown
|
|
const mdTree = filter(
|
|
mdxTree,
|
|
(node) =>
|
|
![
|
|
'mdxjsEsm',
|
|
'mdxJsxFlowElement',
|
|
'mdxJsxTextElement',
|
|
'mdxFlowExpression',
|
|
'mdxTextExpression',
|
|
].includes(node.type)
|
|
)
|
|
|
|
if (!mdTree) {
|
|
return {
|
|
checksum,
|
|
meta,
|
|
sections: [],
|
|
}
|
|
}
|
|
|
|
const sectionTrees = splitTreeBy(mdTree, (node) => node.type === 'heading')
|
|
|
|
const slugger = new GithubSlugger()
|
|
|
|
const sections = sectionTrees.map((tree) => {
|
|
const [firstNode] = tree.children
|
|
|
|
const heading = firstNode.type === 'heading' ? toString(firstNode) : undefined
|
|
const slug = heading ? slugger.slug(heading) : undefined
|
|
|
|
return {
|
|
content: toMarkdown(tree),
|
|
heading,
|
|
slug,
|
|
}
|
|
})
|
|
|
|
return {
|
|
checksum,
|
|
meta,
|
|
sections,
|
|
}
|
|
}
|
|
|
|
type WalkEntry = {
|
|
path: string
|
|
parentPath?: string
|
|
}
|
|
|
|
async function walk(dir: string, parentPath?: string): Promise<WalkEntry[]> {
|
|
const immediateFiles = await readdir(dir)
|
|
|
|
const recursiveFiles = await Promise.all(
|
|
immediateFiles.map(async (file) => {
|
|
const path = join(dir, file)
|
|
const stats = await stat(path)
|
|
if (stats.isDirectory()) {
|
|
// Keep track of document hierarchy (if this dir has corresponding doc file)
|
|
const docPath = `${basename(path)}.mdx`
|
|
|
|
return walk(
|
|
path,
|
|
immediateFiles.includes(docPath) ? join(dirname(path), docPath) : parentPath
|
|
)
|
|
} else if (stats.isFile()) {
|
|
return [
|
|
{
|
|
path: path,
|
|
parentPath,
|
|
},
|
|
]
|
|
} else {
|
|
return []
|
|
}
|
|
})
|
|
)
|
|
|
|
const flattenedFiles = recursiveFiles.reduce(
|
|
(all, folderContents) => all.concat(folderContents),
|
|
[]
|
|
)
|
|
|
|
return flattenedFiles.sort((a, b) => a.path.localeCompare(b.path))
|
|
}
|
|
|
|
async function generateEmbeddings() {
|
|
// TODO: use better CLI lib like yargs
|
|
const args = process.argv.slice(2)
|
|
const shouldRefresh = args.includes('--refresh')
|
|
|
|
if (
|
|
!process.env.NEXT_PUBLIC_SUPABASE_URL ||
|
|
!process.env.SUPABASE_SERVICE_ROLE_KEY ||
|
|
!process.env.OPENAI_KEY
|
|
) {
|
|
return console.log(
|
|
'Environment variables NEXT_PUBLIC_SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY, and OPENAI_KEY are required: skipping embeddings generation'
|
|
)
|
|
}
|
|
|
|
const supabaseClient = createClient(
|
|
process.env.NEXT_PUBLIC_SUPABASE_URL,
|
|
process.env.SUPABASE_SERVICE_ROLE_KEY
|
|
)
|
|
|
|
const markdownFiles = (await walk('pages'))
|
|
.filter(({ path }) => /\.mdx?$/.test(path))
|
|
.filter(({ path }) => !ignoredFiles.includes(path))
|
|
|
|
console.log(`Discovered ${markdownFiles.length} pages`)
|
|
|
|
if (!shouldRefresh) {
|
|
console.log('Checking which pages are new or have changed')
|
|
} else {
|
|
console.log('Refresh flag set, re-generating all pages')
|
|
}
|
|
|
|
for (const markdownFile of markdownFiles) {
|
|
const path = markdownFile.path.replace(/^pages/, '').replace(/\.mdx?$/, '')
|
|
const parentPath = markdownFile.parentPath?.replace(/^pages/, '').replace(/\.mdx?$/, '')
|
|
|
|
try {
|
|
const contents = await readFile(markdownFile.path, 'utf8')
|
|
|
|
const { checksum, meta, sections } = processMdxForSearch(contents)
|
|
|
|
// Check for existing page in DB and compare checksums
|
|
const { error: fetchPageError, data: existingPage } = await supabaseClient
|
|
.from('page')
|
|
.select('id, path, checksum, parentPage:parent_page_id(id, path)')
|
|
.filter('path', 'eq', path)
|
|
.limit(1)
|
|
.maybeSingle()
|
|
|
|
if (fetchPageError) {
|
|
throw fetchPageError
|
|
}
|
|
|
|
type Singular<T> = T extends any[] ? undefined : T
|
|
|
|
// We use checksum to determine if this page & its sections need to be regenerated
|
|
if (!shouldRefresh && existingPage?.checksum === checksum) {
|
|
const existingParentPage = existingPage?.parentPage as Singular<
|
|
typeof existingPage.parentPage
|
|
>
|
|
|
|
// If parent page changed, update it
|
|
if (existingParentPage?.path !== parentPath) {
|
|
console.log(`[${path}] Parent page has changed. Updating to '${parentPath}'...`)
|
|
const { error: fetchParentPageError, data: parentPage } = await supabaseClient
|
|
.from('page')
|
|
.select()
|
|
.filter('path', 'eq', parentPath)
|
|
.limit(1)
|
|
.maybeSingle()
|
|
|
|
if (fetchParentPageError) {
|
|
throw fetchParentPageError
|
|
}
|
|
|
|
const { error: updatePageError } = await supabaseClient
|
|
.from('page')
|
|
.update({ parent_page_id: parentPage?.id })
|
|
.filter('id', 'eq', existingPage.id)
|
|
|
|
if (updatePageError) {
|
|
throw updatePageError
|
|
}
|
|
}
|
|
continue
|
|
}
|
|
|
|
if (existingPage) {
|
|
if (!shouldRefresh) {
|
|
console.log(
|
|
`[${path}] Docs have changed, removing old page sections and their embeddings`
|
|
)
|
|
} else {
|
|
console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`)
|
|
}
|
|
|
|
const { error: deletePageSectionError } = await supabaseClient
|
|
.from('page_section')
|
|
.delete()
|
|
.filter('page_id', 'eq', existingPage.id)
|
|
|
|
if (deletePageSectionError) {
|
|
throw deletePageSectionError
|
|
}
|
|
}
|
|
|
|
const { error: fetchParentPageError, data: parentPage } = await supabaseClient
|
|
.from('page')
|
|
.select()
|
|
.filter('path', 'eq', parentPath)
|
|
.limit(1)
|
|
.maybeSingle()
|
|
|
|
if (fetchParentPageError) {
|
|
throw fetchParentPageError
|
|
}
|
|
|
|
// Create/update page record. Intentionally clear checksum until we
|
|
// have successfully generated all page sections.
|
|
const { error: upsertPageError, data: page } = await supabaseClient
|
|
.from('page')
|
|
.upsert(
|
|
{
|
|
checksum: null,
|
|
path,
|
|
meta,
|
|
parent_page_id: parentPage?.id,
|
|
},
|
|
{ onConflict: 'path' }
|
|
)
|
|
.select()
|
|
.limit(1)
|
|
.single()
|
|
|
|
if (upsertPageError) {
|
|
throw upsertPageError
|
|
}
|
|
|
|
console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`)
|
|
for (const { slug, heading, content } of sections) {
|
|
// OpenAI recommends replacing newlines with spaces for best results (specific to embeddings)
|
|
const input = content.replace(/\n/g, ' ')
|
|
|
|
try {
|
|
const configuration = new Configuration({ apiKey: process.env.OPENAI_KEY })
|
|
const openai = new OpenAIApi(configuration)
|
|
|
|
const embeddingResponse = await openai.createEmbedding({
|
|
model: 'text-embedding-ada-002',
|
|
input,
|
|
})
|
|
|
|
if (embeddingResponse.status !== 200) {
|
|
throw new Error(inspect(embeddingResponse.data, false, 2))
|
|
}
|
|
|
|
const [responseData] = embeddingResponse.data.data
|
|
|
|
const { error: insertPageSectionError, data: pageSection } = await supabaseClient
|
|
.from('page_section')
|
|
.insert({
|
|
page_id: page.id,
|
|
slug,
|
|
heading,
|
|
content,
|
|
token_count: embeddingResponse.data.usage.total_tokens,
|
|
embedding: responseData.embedding,
|
|
})
|
|
.select()
|
|
.limit(1)
|
|
.single()
|
|
|
|
if (insertPageSectionError) {
|
|
throw insertPageSectionError
|
|
}
|
|
} catch (err) {
|
|
// TODO: decide how to better handle failed embeddings
|
|
console.error(
|
|
`Failed to generate embeddings for '${path}' page section starting with '${input.slice(
|
|
0,
|
|
40
|
|
)}...'`
|
|
)
|
|
|
|
throw err
|
|
}
|
|
}
|
|
|
|
// Set page checksum so that we know this page was stored successfully
|
|
const { error: updatePageError } = await supabaseClient
|
|
.from('page')
|
|
.update({ checksum })
|
|
.filter('id', 'eq', page.id)
|
|
|
|
if (updatePageError) {
|
|
throw updatePageError
|
|
}
|
|
} catch (err) {
|
|
console.error(
|
|
`Page '${path}' or one/multiple of its page sections failed to store properly. Page has been marked with null checksum to indicate that it needs to be re-generated.`
|
|
)
|
|
console.error(err)
|
|
}
|
|
}
|
|
|
|
console.log('Embedding generation complete')
|
|
}
|
|
|
|
async function main() {
|
|
await generateEmbeddings()
|
|
}
|
|
|
|
main().catch((err) => console.error(err))
|