Files
supabase/apps/studio/evals/assistant.eval.ts
Greg Richardson 5f8906a20e fix: add destructive operation guardrails to AI assistant (#45194)
Prevents the AI assistant from helping with local git/filesystem
operations, and adds explicit warnings before irreversible database
operations (DROP TABLE, DELETE without WHERE, etc.).

Adds a `safetyScorer` and eval cases to cover these behaviours.

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **New Features**
  * Added a Safety metric to evaluations so assistant responses are scored
    for safe handling of destructive or risky requests
  * Assistant guidance updated to refuse destructive local VCS/filesystem
    actions and require clear warnings for irreversible database operations

* **Tests**
  * Added evaluation cases covering safe refusals, clear warnings, and
    correct handling of destructive or risky prompts

* **Chores**
  * Enabled Safety metric in online evaluation manifests/handlers
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
2026-05-06 09:24:21 -06:00

66 lines
2.1 KiB
TypeScript

import assert from 'node:assert'
import { Eval } from 'braintrust'
import { dataset } from './dataset'
import { buildAssistantEvalOutput } from './output'
import {
completenessScorer,
concisenessScorer,
correctnessScorer,
docsFaithfulnessScorer,
goalCompletionScorer,
knowledgeUsageScorer,
safetyScorer,
toolUsageScorer,
urlValidityScorer,
} from './scorer'
import { sqlIdentifierQuotingScorer, sqlSyntaxScorer } from './scorer-wasm'
import { generateAssistantResponse } from '@/lib/ai/generate-assistant-response'
import { getModel } from '@/lib/ai/model'
import { DEFAULT_ASSISTANT_BASE_MODEL_ID, getAssistantModelEntry } from '@/lib/ai/model.utils'
import { getMockTools } from '@/lib/ai/tools/mock-tools'
// Fail fast before starting the eval run if required credentials are absent.
// NOTE: node:assert's `asserts`-style signature also narrows these env vars
// from `string | undefined` to `string` for the Eval config below.
assert(process.env.BRAINTRUST_PROJECT_ID, 'BRAINTRUST_PROJECT_ID is not set')
assert(process.env.OPENAI_API_KEY, 'OPENAI_API_KEY is not set')
// Braintrust eval for the Studio AI assistant: runs every dataset prompt
// through the real assistant pipeline (with mocked tools) and grades the
// output against the full scorer suite.
Eval('Assistant', {
  projectId: process.env.BRAINTRUST_PROJECT_ID,
  // Run several trials in CI to smooth over LLM nondeterminism; one trial
  // locally keeps iteration fast.
  trialCount: process.env.CI ? 3 : 1,
  data: () => dataset,
  task: async (input) => {
    const entry = getAssistantModelEntry(DEFAULT_ASSISTANT_BASE_MODEL_ID)
    const model = await getModel({ provider: 'openai', modelEntry: entry })
    if (model.error) throw model.error

    // Only mock `list_tables` when the eval case provides table fixtures.
    const mockToolConfig = input.mockTables ? { list_tables: input.mockTables } : undefined

    const result = await generateAssistantResponse({
      ...model.modelParams,
      messages: [{ id: '1', role: 'user', parts: [{ type: 'text', text: input.prompt }] }],
      tools: await getMockTools(mockToolConfig),
    })

    // `result.toolCalls` only reflects the final step, so resolve the full
    // step list and aggregate tool usage across all of them.
    const [finishReason, steps] = await Promise.all([result.finishReason, result.steps])
    return buildAssistantEvalOutput(finishReason, steps)
  },
  scores: [
    toolUsageScorer,
    knowledgeUsageScorer,
    sqlSyntaxScorer,
    sqlIdentifierQuotingScorer,
    goalCompletionScorer,
    concisenessScorer,
    completenessScorer,
    docsFaithfulnessScorer,
    correctnessScorer,
    safetyScorer,
    urlValidityScorer,
  ],
})