mirror of
https://github.com/supabase/supabase.git
synced 2026-05-09 00:10:05 +08:00
Prevents the AI assistant from helping with local git/filesystem operations, and adds explicit warnings before irreversible database operations (DROP TABLE, DELETE without WHERE, etc.). Adds a `safetyScorer` and eval cases to cover these behaviours. <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added a Safety metric to evaluations so assistant responses are scored for safe handling of destructive or risky requests * Assistant guidance updated to refuse destructive local VCS/filesystem actions and require clear warnings for irreversible database operations * **Tests** * Added evaluation cases covering safe refusals, clear warnings, and correct handling of destructive or risky prompts * **Chores** * Enabled Safety metric in online evaluation manifests/handlers <!-- end of auto-generated comment: release notes by coderabbit.ai -->
66 lines
2.1 KiB
TypeScript
import assert from 'node:assert'
|
|
import { Eval } from 'braintrust'
|
|
|
|
import { dataset } from './dataset'
|
|
import { buildAssistantEvalOutput } from './output'
|
|
import {
|
|
completenessScorer,
|
|
concisenessScorer,
|
|
correctnessScorer,
|
|
docsFaithfulnessScorer,
|
|
goalCompletionScorer,
|
|
knowledgeUsageScorer,
|
|
safetyScorer,
|
|
toolUsageScorer,
|
|
urlValidityScorer,
|
|
} from './scorer'
|
|
import { sqlIdentifierQuotingScorer, sqlSyntaxScorer } from './scorer-wasm'
|
|
import { generateAssistantResponse } from '@/lib/ai/generate-assistant-response'
|
|
import { getModel } from '@/lib/ai/model'
|
|
import { DEFAULT_ASSISTANT_BASE_MODEL_ID, getAssistantModelEntry } from '@/lib/ai/model.utils'
|
|
import { getMockTools } from '@/lib/ai/tools/mock-tools'
|
|
|
|
assert(process.env.BRAINTRUST_PROJECT_ID, 'BRAINTRUST_PROJECT_ID is not set')
|
|
assert(process.env.OPENAI_API_KEY, 'OPENAI_API_KEY is not set')
|
|
|
|
Eval('Assistant', {
|
|
projectId: process.env.BRAINTRUST_PROJECT_ID,
|
|
trialCount: process.env.CI ? 3 : 1,
|
|
data: () => dataset,
|
|
task: async (input) => {
|
|
const modelEntry = getAssistantModelEntry(DEFAULT_ASSISTANT_BASE_MODEL_ID)
|
|
const modelResponse = await getModel({ provider: 'openai', modelEntry })
|
|
if (modelResponse.error) throw modelResponse.error
|
|
|
|
const result = await generateAssistantResponse({
|
|
...modelResponse.modelParams,
|
|
messages: [
|
|
{
|
|
id: '1',
|
|
role: 'user',
|
|
parts: [{ type: 'text', text: input.prompt }],
|
|
},
|
|
],
|
|
tools: await getMockTools(input.mockTables ? { list_tables: input.mockTables } : undefined),
|
|
})
|
|
|
|
// `result.toolCalls` only shows the last step, instead aggregate tools across all steps
|
|
const [finishReason, steps] = await Promise.all([result.finishReason, result.steps])
|
|
|
|
return buildAssistantEvalOutput(finishReason, steps)
|
|
},
|
|
scores: [
|
|
toolUsageScorer,
|
|
knowledgeUsageScorer,
|
|
sqlSyntaxScorer,
|
|
sqlIdentifierQuotingScorer,
|
|
goalCompletionScorer,
|
|
concisenessScorer,
|
|
completenessScorer,
|
|
docsFaithfulnessScorer,
|
|
correctnessScorer,
|
|
safetyScorer,
|
|
urlValidityScorer,
|
|
],
|
|
})
|