refactor(studio): address second round of review feedback on trace-level scorers

2026-05-06 22:18:00 +08:00 · 2026-05-04 18:17:09 -04:00
parent e83579310e
commit 5de89e48f2
3 changed files with 34 additions and 29 deletions
--- a/apps/studio/evals/scorer-wasm.ts
+++ b/apps/studio/evals/scorer-wasm.ts
@@ -1,12 +1,10 @@
 import { EvalScorer, Trace } from 'braintrust'
 import { parse } from 'libpg-query'
-import { z } from 'zod'

 import { AssistantEvalInput, AssistantEvalOutput, Expected, getToolSpans } from './scorer'
+import { executeSqlInputSchema } from '@/lib/ai/tools/studio-tools'
 import { extractIdentifiers, isQuotedInSql, needsQuoting } from '@/lib/sql-identifier-quoting'

-const executeSqlInputSchema = z.object({ sql: z.string() })
-
 /** Extracts SQL strings from all `execute_sql` tool spans in the trace. */
 async function getSqlQueries(trace: Trace): Promise<string[]> {
  const spans = await getToolSpans(trace, 'execute_sql')
--- a/apps/studio/evals/scorer.ts
+++ b/apps/studio/evals/scorer.ts
@@ -4,6 +4,7 @@ import { EvalCase, EvalScorer, SpanData, Trace } from 'braintrust'
 import { stripIndent } from 'common-tags'
 import { z } from 'zod'

+import { loadKnowledgeInputSchema } from '@/lib/ai/tools/studio-tools'
 import { extractUrls } from '@/lib/helpers'

 const LLM_AS_A_JUDGE_MODEL = 'gpt-5.2' // NOTE: `gpt-5.2-2025-12-11` snapshot not yet working with online scorers
@@ -52,10 +53,10 @@ export type AssistantEvalCase = EvalCase<AssistantEvalInput, Expected, Assistant

 const chatMessageSchema = z.object({ role: z.string(), content: z.unknown() })
 const textContentBlockSchema = z.object({ type: z.literal('text'), text: z.string() })
-const loadKnowledgeInputSchema = z.object({ name: z.string() })
 // search_docs returns { content: [{ text: string }] } where each text is a JSON doc string
 const searchDocsOutputSchema = z.object({ content: z.array(z.object({ text: z.string() })) })

+/** Extracts plain text from a message content field (string or content-block array). */
 function extractMessageText(content: unknown): string {
  if (typeof content === 'string') return content
  if (!Array.isArray(content)) return ''
@@ -67,6 +68,7 @@ function extractMessageText(content: unknown): string {
    .join('\n')
 }

+/** Returns the text of the last assistant message in the trace thread, or null if none. */
 async function getLastAssistantText(trace: Trace): Promise<string | null> {
  const thread = await trace.getThread()
  for (let i = thread.length - 1; i >= 0; i--) {
@@ -94,6 +96,7 @@ async function getConversationContext(trace: Trace): Promise<string> {
    .join('\n\n')
 }

+/** Returns tool spans from the trace, optionally filtered to a specific tool name. */
 export async function getToolSpans(trace: Trace, toolName?: string): Promise<SpanData[]> {
  const spans = await trace.getSpans({ spanType: ['tool'] })
  if (!toolName) return spans
@@ -130,7 +133,7 @@ export const knowledgeUsageScorer: EvalScorer<
  if (!expected.requiredKnowledge || !trace) return null

  const knowledgeSpans = await getToolSpans(trace, 'load_knowledge')
-  const loadedKnowledge = knowledgeSpans.flatMap((s) => {
+  const loadedKnowledge: string[] = knowledgeSpans.flatMap((s) => {
    const r = loadKnowledgeInputSchema.safeParse(s.input)
    return r.success ? [r.data.name] : []
  })
--- a/apps/studio/lib/ai/tools/studio-tools.ts
+++ b/apps/studio/lib/ai/tools/studio-tools.ts
@@ -18,28 +18,36 @@ const KNOWLEDGE = {

 type KnowledgeName = keyof typeof KNOWLEDGE

+export const executeSqlInputSchema = z.object({
+  // Transform at parse time so the corrected SQL is what gets stored in
+  // toolCall.input — ensuring evals and logs reflect what actually runs.
+  sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
+  label: z.string().describe('A short 2-4 word label for the SQL statement.'),
+  chartConfig: z
+    .object({
+      view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
+      xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
+      yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
+    })
+    .describe('Chart configuration for rendering the results'),
+  isWriteQuery: z
+    .boolean()
+    .default(false)
+    .describe(
+      'Whether the SQL statement performs a write operation of any kind instead of a read operation'
+    ),
+})
+
+export const loadKnowledgeInputSchema = z.object({
+  name: z
+    .enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
+    .describe('The knowledge to load'),
+})
+
 export const getStudioTools = () => ({
  execute_sql: tool({
    description: 'Asks the user to execute a SQL statement and return the results',
-    inputSchema: z.object({
-      // Transform at parse time so the corrected SQL is what gets stored in
-      // toolCall.input — ensuring evals and logs reflect what actually runs.
-      sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
-      label: z.string().describe('A short 2-4 word label for the SQL statement.'),
-      chartConfig: z
-        .object({
-          view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
-          xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
-          yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
-        })
-        .describe('Chart configuration for rendering the results'),
-      isWriteQuery: z
-        .boolean()
-        .default(false)
-        .describe(
-          'Whether the SQL statement performs a write operation of any kind instead of a read operation'
-        ),
-    }),
+    inputSchema: executeSqlInputSchema,
  }),
  deploy_edge_function: tool({
    description:
@@ -61,11 +69,7 @@ export const getStudioTools = () => ({
  load_knowledge: tool({
    description:
      'Load detailed knowledge about a Supabase topic before answering questions about it.',
-    inputSchema: z.object({
-      name: z
-        .enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
-        .describe('The knowledge to load'),
-    }),
+    inputSchema: loadKnowledgeInputSchema,
    execute: ({ name }) => KNOWLEDGE[name],
  }),
 })