From 5de89e48f20e731e6fe9002574e5fdb6bf3b92bb Mon Sep 17 00:00:00 2001 From: Matt Rossman <22670878+mattrossman@users.noreply.github.com> Date: Mon, 4 May 2026 18:17:09 -0400 Subject: [PATCH] refactor(studio): address second round of review feedback on trace-level scorers --- apps/studio/evals/scorer-wasm.ts | 4 +- apps/studio/evals/scorer.ts | 7 +++- apps/studio/lib/ai/tools/studio-tools.ts | 52 +++++++++++++----------- 3 files changed, 34 insertions(+), 29 deletions(-) diff --git a/apps/studio/evals/scorer-wasm.ts b/apps/studio/evals/scorer-wasm.ts index bf32088825..7a4798c7c0 100644 --- a/apps/studio/evals/scorer-wasm.ts +++ b/apps/studio/evals/scorer-wasm.ts @@ -1,12 +1,10 @@ import { EvalScorer, Trace } from 'braintrust' import { parse } from 'libpg-query' -import { z } from 'zod' import { AssistantEvalInput, AssistantEvalOutput, Expected, getToolSpans } from './scorer' +import { executeSqlInputSchema } from '@/lib/ai/tools/studio-tools' import { extractIdentifiers, isQuotedInSql, needsQuoting } from '@/lib/sql-identifier-quoting' -const executeSqlInputSchema = z.object({ sql: z.string() }) - /** Extracts SQL strings from all `execute_sql` tool spans in the trace. */ async function getSqlQueries(trace: Trace): Promise { const spans = await getToolSpans(trace, 'execute_sql') diff --git a/apps/studio/evals/scorer.ts b/apps/studio/evals/scorer.ts index ba0b61f884..515dde253e 100644 --- a/apps/studio/evals/scorer.ts +++ b/apps/studio/evals/scorer.ts @@ -4,6 +4,7 @@ import { EvalCase, EvalScorer, SpanData, Trace } from 'braintrust' import { stripIndent } from 'common-tags' import { z } from 'zod' +import { loadKnowledgeInputSchema } from '@/lib/ai/tools/studio-tools' import { extractUrls } from '@/lib/helpers' const LLM_AS_A_JUDGE_MODEL = 'gpt-5.2' // NOTE: `gpt-5.2-2025-12-11` snapshot not yet working with online scorers @@ -52,10 +53,10 @@ export type AssistantEvalCase = EvalCase { const thread = await trace.getThread() for (let i = thread.length - 1; i >= 0; i--) { @@ -94,6 +96,7 @@ async function getConversationContext(trace: Trace): Promise { .join('\n\n') } +/** Returns tool spans from the trace, optionally filtered to a specific tool name. */ export async function getToolSpans(trace: Trace, toolName?: string): Promise { const spans = await trace.getSpans({ spanType: ['tool'] }) if (!toolName) return spans @@ -130,7 +133,7 @@ export const knowledgeUsageScorer: EvalScorer< if (!expected.requiredKnowledge || !trace) return null const knowledgeSpans = await getToolSpans(trace, 'load_knowledge') - const loadedKnowledge = knowledgeSpans.flatMap((s) => { + const loadedKnowledge: string[] = knowledgeSpans.flatMap((s) => { const r = loadKnowledgeInputSchema.safeParse(s.input) return r.success ? [r.data.name] : [] }) diff --git a/apps/studio/lib/ai/tools/studio-tools.ts b/apps/studio/lib/ai/tools/studio-tools.ts index 1279e3f12a..b5c84ba668 100644 --- a/apps/studio/lib/ai/tools/studio-tools.ts +++ b/apps/studio/lib/ai/tools/studio-tools.ts @@ -18,28 +18,36 @@ const KNOWLEDGE = { type KnowledgeName = keyof typeof KNOWLEDGE +export const executeSqlInputSchema = z.object({ + // Transform at parse time so the corrected SQL is what gets stored in + // toolCall.input — ensuring evals and logs reflect what actually runs. + sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes), + label: z.string().describe('A short 2-4 word label for the SQL statement.'), + chartConfig: z + .object({ + view: z.enum(['table', 'chart']).describe('How to render the results after execution'), + xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'), + yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'), + }) + .describe('Chart configuration for rendering the results'), + isWriteQuery: z + .boolean() + .default(false) + .describe( + 'Whether the SQL statement performs a write operation of any kind instead of a read operation' + ), +}) + +export const loadKnowledgeInputSchema = z.object({ + name: z + .enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]]) + .describe('The knowledge to load'), +}) + export const getStudioTools = () => ({ execute_sql: tool({ description: 'Asks the user to execute a SQL statement and return the results', - inputSchema: z.object({ - // Transform at parse time so the corrected SQL is what gets stored in - // toolCall.input — ensuring evals and logs reflect what actually runs. - sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes), - label: z.string().describe('A short 2-4 word label for the SQL statement.'), - chartConfig: z - .object({ - view: z.enum(['table', 'chart']).describe('How to render the results after execution'), - xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'), - yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'), - }) - .describe('Chart configuration for rendering the results'), - isWriteQuery: z - .boolean() - .default(false) - .describe( - 'Whether the SQL statement performs a write operation of any kind instead of a read operation' - ), - }), + inputSchema: executeSqlInputSchema, }), deploy_edge_function: tool({ description: @@ -61,11 +69,7 @@ export const getStudioTools = () => ({ load_knowledge: tool({ description: 'Load detailed knowledge about a Supabase topic before answering questions about it.', - inputSchema: z.object({ - name: z - .enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]]) - .describe('The knowledge to load'), - }), + inputSchema: loadKnowledgeInputSchema, execute: ({ name }) => KNOWLEDGE[name], }), })