refactor(studio): address second round of review feedback on trace-level scorers

This commit is contained in:
Matt Rossman
2026-05-04 18:17:09 -04:00
parent e83579310e
commit 5de89e48f2
3 changed files with 34 additions and 29 deletions

View File

@@ -1,12 +1,10 @@
import { EvalScorer, Trace } from 'braintrust'
import { parse } from 'libpg-query'
import { z } from 'zod'
import { AssistantEvalInput, AssistantEvalOutput, Expected, getToolSpans } from './scorer'
import { executeSqlInputSchema } from '@/lib/ai/tools/studio-tools'
import { extractIdentifiers, isQuotedInSql, needsQuoting } from '@/lib/sql-identifier-quoting'
const executeSqlInputSchema = z.object({ sql: z.string() })
/** Extracts SQL strings from all `execute_sql` tool spans in the trace. */
async function getSqlQueries(trace: Trace): Promise<string[]> {
const spans = await getToolSpans(trace, 'execute_sql')

View File

@@ -4,6 +4,7 @@ import { EvalCase, EvalScorer, SpanData, Trace } from 'braintrust'
import { stripIndent } from 'common-tags'
import { z } from 'zod'
import { loadKnowledgeInputSchema } from '@/lib/ai/tools/studio-tools'
import { extractUrls } from '@/lib/helpers'
const LLM_AS_A_JUDGE_MODEL = 'gpt-5.2' // NOTE: `gpt-5.2-2025-12-11` snapshot not yet working with online scorers
@@ -52,10 +53,10 @@ export type AssistantEvalCase = EvalCase<AssistantEvalInput, Expected, Assistant
const chatMessageSchema = z.object({ role: z.string(), content: z.unknown() })
const textContentBlockSchema = z.object({ type: z.literal('text'), text: z.string() })
const loadKnowledgeInputSchema = z.object({ name: z.string() })
// search_docs returns { content: [{ text: string }] } where each text is a JSON doc string
const searchDocsOutputSchema = z.object({ content: z.array(z.object({ text: z.string() })) })
/** Extracts plain text from a message content field (string or content-block array). */
function extractMessageText(content: unknown): string {
if (typeof content === 'string') return content
if (!Array.isArray(content)) return ''
@@ -67,6 +68,7 @@ function extractMessageText(content: unknown): string {
.join('\n')
}
/** Returns the text of the last assistant message in the trace thread, or null if none. */
async function getLastAssistantText(trace: Trace): Promise<string | null> {
const thread = await trace.getThread()
for (let i = thread.length - 1; i >= 0; i--) {
@@ -94,6 +96,7 @@ async function getConversationContext(trace: Trace): Promise<string> {
.join('\n\n')
}
/** Returns tool spans from the trace, optionally filtered to a specific tool name. */
export async function getToolSpans(trace: Trace, toolName?: string): Promise<SpanData[]> {
const spans = await trace.getSpans({ spanType: ['tool'] })
if (!toolName) return spans
@@ -130,7 +133,7 @@ export const knowledgeUsageScorer: EvalScorer<
if (!expected.requiredKnowledge || !trace) return null
const knowledgeSpans = await getToolSpans(trace, 'load_knowledge')
const loadedKnowledge = knowledgeSpans.flatMap((s) => {
const loadedKnowledge: string[] = knowledgeSpans.flatMap((s) => {
const r = loadKnowledgeInputSchema.safeParse(s.input)
return r.success ? [r.data.name] : []
})

View File

@@ -18,28 +18,36 @@ const KNOWLEDGE = {
type KnowledgeName = keyof typeof KNOWLEDGE
/**
 * Input schema for the `execute_sql` tool.
 *
 * Exported as a standalone constant so the tool definition and any other
 * module that needs to parse `execute_sql` inputs share a single definition.
 *
 * Fields:
 * - `sql`: the statement text; passed through `fixSqlBackslashEscapes` during parsing.
 * - `label`: required short label for the statement.
 * - `chartConfig`: required object; `view` is required, `xAxis`/`yAxis` are optional.
 * - `isWriteQuery`: boolean, defaults to `false` when omitted.
 *
 * NOTE(review): every parse with this schema runs the `sql` transform and
 * requires `label` and `chartConfig`. Callers that re-parse previously stored
 * tool inputs (rather than raw model output) should confirm both are acceptable.
 */
export const executeSqlInputSchema = z.object({
// Transform at parse time so the corrected SQL is what gets stored in
// toolCall.input — ensuring evals and logs reflect what actually runs.
sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
label: z.string().describe('A short 2-4 word label for the SQL statement.'),
chartConfig: z
.object({
view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
})
.describe('Chart configuration for rendering the results'),
isWriteQuery: z
.boolean()
// A missing value parses as `false`.
.default(false)
.describe(
'Whether the SQL statement performs a write operation of any kind instead of a read operation'
),
})
/**
 * Input schema for the `load_knowledge` tool: a single `name` field restricted
 * to the keys of `KNOWLEDGE`.
 *
 * Exported as a standalone constant so the tool definition and other modules
 * that parse `load_knowledge` inputs share a single definition.
 *
 * NOTE(review): because `name` is an enum rather than a free-form string,
 * `safeParse` fails for any name that is not a current `KNOWLEDGE` key —
 * confirm that is the desired behavior for consumers parsing recorded inputs.
 */
export const loadKnowledgeInputSchema = z.object({
name: z
// `Object.keys` is typed as `string[]`; the cast supplies the non-empty tuple
// of literal keys that `z.enum` expects. Assumes `KNOWLEDGE` has at least one key.
.enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
.describe('The knowledge to load'),
})
export const getStudioTools = () => ({
execute_sql: tool({
description: 'Asks the user to execute a SQL statement and return the results',
inputSchema: z.object({
// Transform at parse time so the corrected SQL is what gets stored in
// toolCall.input — ensuring evals and logs reflect what actually runs.
sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
label: z.string().describe('A short 2-4 word label for the SQL statement.'),
chartConfig: z
.object({
view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
})
.describe('Chart configuration for rendering the results'),
isWriteQuery: z
.boolean()
.default(false)
.describe(
'Whether the SQL statement performs a write operation of any kind instead of a read operation'
),
}),
inputSchema: executeSqlInputSchema,
}),
deploy_edge_function: tool({
description:
@@ -61,11 +69,7 @@ export const getStudioTools = () => ({
load_knowledge: tool({
description:
'Load detailed knowledge about a Supabase topic before answering questions about it.',
inputSchema: z.object({
name: z
.enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
.describe('The knowledge to load'),
}),
inputSchema: loadKnowledgeInputSchema,
execute: ({ name }) => KNOWLEDGE[name],
}),
})