From bb28c45a77e9999f6485f9ef77eeafb4eb6f7ac5 Mon Sep 17 00:00:00 2001 From: Greg Richardson Date: Mon, 20 Feb 2023 14:14:25 -0700 Subject: [PATCH] chore(docs-search): embeddings:refresh script --- apps/docs/package.json | 3 ++- apps/docs/scripts/generate-embeddings.ts | 27 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/apps/docs/package.json b/apps/docs/package.json index 517280d3f19..7cce8bdda67 100644 --- a/apps/docs/package.json +++ b/apps/docs/package.json @@ -8,7 +8,8 @@ "start": "next start", "lint": "next lint", "build:sitemap": "node ./internals/generate-sitemap.mjs", - "build:embeddings": "tsx scripts/generate-embeddings.ts", + "embeddings": "tsx scripts/generate-embeddings.ts", + "embeddings:refresh": "npm run embeddings -- --refresh", "postbuild": "ts-node ./scripts/build-search.ts && node ./internals/generate-sitemap.mjs", "generate:all": "npm-run-all --parallel gen:api gen:cli gen:gotrue gen:storage gen:supabase-dart:v0 gen:supabase-dart:v1 gen:supabase-csharp:v0 gen:supabase-js:v1 gen:supabase-js:v2 gen:realtime", "gen:api": "npm-run-all gen:api:usage", diff --git a/apps/docs/scripts/generate-embeddings.ts b/apps/docs/scripts/generate-embeddings.ts index 8d144cdbefc..6bc3f5dc19a 100644 --- a/apps/docs/scripts/generate-embeddings.ts +++ b/apps/docs/scripts/generate-embeddings.ts @@ -221,6 +221,10 @@ async function walk(dir: string, parentPath?: string): Promise { } async function generateEmbeddings() { + // TODO: use better CLI lib like yargs + const args = process.argv.slice(2) + const shouldRefresh = args.includes('--refresh') + if ( !process.env.NEXT_PUBLIC_SUPABASE_URL || !process.env.SUPABASE_SERVICE_ROLE_KEY || @@ -241,7 +245,12 @@ async function generateEmbeddings() { .filter(({ path }) => !ignoredFiles.includes(path)) console.log(`Discovered ${markdownFiles.length} pages`) - console.log('Checking which pages are new or have changed') + + if (!shouldRefresh) { + console.log('Checking which pages are new or have changed') + } else { + console.log('Refresh flag set, re-generating all pages') + } for (const markdownFile of markdownFiles) { const path = markdownFile.path.replace(/^pages/, '').replace(/\.mdx?$/, '') @@ -267,14 +276,14 @@ async function generateEmbeddings() { type Singular = T extends any[] ? undefined : T // We use checksum to determine if this page & its sections need to be regenerated - if (existingPage?.checksum === checksum) { + if (!shouldRefresh && existingPage?.checksum === checksum) { const existingParentPage = existingPage?.parentPage as Singular< typeof existingPage.parentPage > // If parent page changed, update it if (existingParentPage?.path !== parentPath) { - console.log(`Parent page has changed for '${path}'. Updating to '${parentPath}'...`) + console.log(`[${path}] Parent page has changed. Updating to '${parentPath}'...`) const { error: fetchParentPageError, data: parentPage } = await supabaseClient .from('page') .select() @@ -299,9 +308,13 @@ async function generateEmbeddings() { } if (existingPage) { - console.log( - `Docs have changed for '${path}', removing old page sections and their embeddings` - ) + if (!shouldRefresh) { + console.log( + `[${path}] Docs have changed, removing old page sections and their embeddings` + ) + } else { + console.log(`[${path}] Refresh flag set, removing old page sections and their embeddings`) + } const { error: deletePageSectionError } = await supabaseClient .from('page_section') @@ -345,7 +358,7 @@ async function generateEmbeddings() { throw upsertPageError } - console.log(`Adding ${sections.length} page sections (with embeddings) for '${path}'`) + console.log(`[${path}] Adding ${sections.length} page sections (with embeddings)`) for (const { slug, heading, content } of sections) { // OpenAI recommends replacing newlines with spaces for best results (specific to embeddings) const input = content.replace(/\n/g, ' ')