mirror of
https://github.com/supabase/supabase.git
synced 2026-05-06 22:18:00 +08:00
refactor(studio): address second round of review feedback on trace-level scorers
This commit is contained in:
@@ -1,12 +1,10 @@
|
||||
import { EvalScorer, Trace } from 'braintrust'
|
||||
import { parse } from 'libpg-query'
|
||||
import { z } from 'zod'
|
||||
|
||||
import { AssistantEvalInput, AssistantEvalOutput, Expected, getToolSpans } from './scorer'
|
||||
import { executeSqlInputSchema } from '@/lib/ai/tools/studio-tools'
|
||||
import { extractIdentifiers, isQuotedInSql, needsQuoting } from '@/lib/sql-identifier-quoting'
|
||||
|
||||
const executeSqlInputSchema = z.object({ sql: z.string() })
|
||||
|
||||
/** Extracts SQL strings from all `execute_sql` tool spans in the trace. */
|
||||
async function getSqlQueries(trace: Trace): Promise<string[]> {
|
||||
const spans = await getToolSpans(trace, 'execute_sql')
|
||||
|
||||
@@ -4,6 +4,7 @@ import { EvalCase, EvalScorer, SpanData, Trace } from 'braintrust'
|
||||
import { stripIndent } from 'common-tags'
|
||||
import { z } from 'zod'
|
||||
|
||||
import { loadKnowledgeInputSchema } from '@/lib/ai/tools/studio-tools'
|
||||
import { extractUrls } from '@/lib/helpers'
|
||||
|
||||
const LLM_AS_A_JUDGE_MODEL = 'gpt-5.2' // NOTE: `gpt-5.2-2025-12-11` snapshot not yet working with online scorers
|
||||
@@ -52,10 +53,10 @@ export type AssistantEvalCase = EvalCase<AssistantEvalInput, Expected, Assistant
|
||||
|
||||
const chatMessageSchema = z.object({ role: z.string(), content: z.unknown() })
|
||||
const textContentBlockSchema = z.object({ type: z.literal('text'), text: z.string() })
|
||||
const loadKnowledgeInputSchema = z.object({ name: z.string() })
|
||||
// search_docs returns { content: [{ text: string }] } where each text is a JSON doc string
|
||||
const searchDocsOutputSchema = z.object({ content: z.array(z.object({ text: z.string() })) })
|
||||
|
||||
/** Extracts plain text from a message content field (string or content-block array). */
|
||||
function extractMessageText(content: unknown): string {
|
||||
if (typeof content === 'string') return content
|
||||
if (!Array.isArray(content)) return ''
|
||||
@@ -67,6 +68,7 @@ function extractMessageText(content: unknown): string {
|
||||
.join('\n')
|
||||
}
|
||||
|
||||
/** Returns the text of the last assistant message in the trace thread, or null if none. */
|
||||
async function getLastAssistantText(trace: Trace): Promise<string | null> {
|
||||
const thread = await trace.getThread()
|
||||
for (let i = thread.length - 1; i >= 0; i--) {
|
||||
@@ -94,6 +96,7 @@ async function getConversationContext(trace: Trace): Promise<string> {
|
||||
.join('\n\n')
|
||||
}
|
||||
|
||||
/** Returns tool spans from the trace, optionally filtered to a specific tool name. */
|
||||
export async function getToolSpans(trace: Trace, toolName?: string): Promise<SpanData[]> {
|
||||
const spans = await trace.getSpans({ spanType: ['tool'] })
|
||||
if (!toolName) return spans
|
||||
@@ -130,7 +133,7 @@ export const knowledgeUsageScorer: EvalScorer<
|
||||
if (!expected.requiredKnowledge || !trace) return null
|
||||
|
||||
const knowledgeSpans = await getToolSpans(trace, 'load_knowledge')
|
||||
const loadedKnowledge = knowledgeSpans.flatMap((s) => {
|
||||
const loadedKnowledge: string[] = knowledgeSpans.flatMap((s) => {
|
||||
const r = loadKnowledgeInputSchema.safeParse(s.input)
|
||||
return r.success ? [r.data.name] : []
|
||||
})
|
||||
|
||||
@@ -18,28 +18,36 @@ const KNOWLEDGE = {
|
||||
|
||||
type KnowledgeName = keyof typeof KNOWLEDGE
|
||||
|
||||
export const executeSqlInputSchema = z.object({
|
||||
// Transform at parse time so the corrected SQL is what gets stored in
|
||||
// toolCall.input — ensuring evals and logs reflect what actually runs.
|
||||
sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
|
||||
label: z.string().describe('A short 2-4 word label for the SQL statement.'),
|
||||
chartConfig: z
|
||||
.object({
|
||||
view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
|
||||
xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
|
||||
yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
|
||||
})
|
||||
.describe('Chart configuration for rendering the results'),
|
||||
isWriteQuery: z
|
||||
.boolean()
|
||||
.default(false)
|
||||
.describe(
|
||||
'Whether the SQL statement performs a write operation of any kind instead of a read operation'
|
||||
),
|
||||
})
|
||||
|
||||
export const loadKnowledgeInputSchema = z.object({
|
||||
name: z
|
||||
.enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
|
||||
.describe('The knowledge to load'),
|
||||
})
|
||||
|
||||
export const getStudioTools = () => ({
|
||||
execute_sql: tool({
|
||||
description: 'Asks the user to execute a SQL statement and return the results',
|
||||
inputSchema: z.object({
|
||||
// Transform at parse time so the corrected SQL is what gets stored in
|
||||
// toolCall.input — ensuring evals and logs reflect what actually runs.
|
||||
sql: z.string().describe('The SQL statement to execute.').transform(fixSqlBackslashEscapes),
|
||||
label: z.string().describe('A short 2-4 word label for the SQL statement.'),
|
||||
chartConfig: z
|
||||
.object({
|
||||
view: z.enum(['table', 'chart']).describe('How to render the results after execution'),
|
||||
xAxis: z.string().optional().describe('The column to use for the x-axis of the chart.'),
|
||||
yAxis: z.string().optional().describe('The column to use for the y-axis of the chart.'),
|
||||
})
|
||||
.describe('Chart configuration for rendering the results'),
|
||||
isWriteQuery: z
|
||||
.boolean()
|
||||
.default(false)
|
||||
.describe(
|
||||
'Whether the SQL statement performs a write operation of any kind instead of a read operation'
|
||||
),
|
||||
}),
|
||||
inputSchema: executeSqlInputSchema,
|
||||
}),
|
||||
deploy_edge_function: tool({
|
||||
description:
|
||||
@@ -61,11 +69,7 @@ export const getStudioTools = () => ({
|
||||
load_knowledge: tool({
|
||||
description:
|
||||
'Load detailed knowledge about a Supabase topic before answering questions about it.',
|
||||
inputSchema: z.object({
|
||||
name: z
|
||||
.enum(Object.keys(KNOWLEDGE) as [KnowledgeName, ...KnowledgeName[]])
|
||||
.describe('The knowledge to load'),
|
||||
}),
|
||||
inputSchema: loadKnowledgeInputSchema,
|
||||
execute: ({ name }) => KNOWLEDGE[name],
|
||||
}),
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user