mirror of
https://github.com/linshenkx/prompt-optimizer.git
synced 2026-05-07 22:18:23 +08:00
1489 lines
51 KiB
JavaScript
1489 lines
51 KiB
JavaScript
import fs from 'node:fs/promises';
|
||
import path from 'node:path';
|
||
import { fileURLToPath } from 'node:url';
|
||
import dotenv from 'dotenv';
|
||
import {
|
||
MemoryStorageProvider,
|
||
createPreferenceService,
|
||
createTemplateLanguageService,
|
||
createTemplateManager,
|
||
createModelManager,
|
||
createLLMService,
|
||
createEvaluationService,
|
||
buildRewritePayload,
|
||
buildRewritePromptFromEvaluation,
|
||
} from '../packages/core/dist/index.js';
|
||
|
||
// Absolute path of the directory one level above the folder that contains this script.
const ROOT_DIR = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');

// Output directory under ROOT_DIR, ending in `.../structured-compare-calibration/latest`.
// NOTE(review): presumably where run artifacts are written; the writer is outside this view.
const OUTPUT_ROOT = path.join(
  ROOT_DIR,
  'docs',
  'workspace',
  'compare-evaluation-analysis',
  'structured-compare-calibration',
  'latest',
);
|
||
|
||
// Load environment variables from the `.env.local` file located in ROOT_DIR.
dotenv.config({ path: path.join(ROOT_DIR, '.env.local') });
|
||
|
||
// ISO-8601 timestamp captured once, when this module is evaluated.
const nowIso = new Date().toISOString();

// Value written into each calibration model's `paramOverrides.timeout`
// (milliseconds, per the constant's name).
const CALIBRATION_TIMEOUT_MS = 180000;

// Maximum number of attempts (the first try included) the logged
// `sendMessage` wrapper makes before giving up.
const CALIBRATION_MAX_RETRIES = 3;
|
||
|
||
/**
 * Decide whether a failed calibration call should be retried.
 *
 * The decision is a plain substring match on the error text: socket/connect
 * timeouts, connection resets, and the HTTP status numbers 429, 502 and 503.
 *
 * @param {unknown} error - Thrown value; an `Error` contributes its `message`,
 *   anything else is stringified (falsy values become the empty string).
 * @returns {boolean} `true` when the text contains one of the retryable markers.
 */
function isRetryableCalibrationError(error) {
  const message = error instanceof Error ? error.message : String(error || '');

  const retryableMarkers = [
    'Socket timeout',
    'ERR_SOCKET_TIMEOUT',
    'ETIMEDOUT',
    'ECONNRESET',
    '429',
    '502',
    '503',
  ];

  return retryableMarkers.some((marker) => message.includes(marker));
}
|
||
|
||
/**
 * Wait for a delay before continuing.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) when the timer fires.
 */
async function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||
|
||
const LIVE_BASIC_SYSTEM_CASE = {
|
||
id: 'live-basic-system-boundary-control',
|
||
kind: 'live',
|
||
title: '真实模型: basic-system 边界控制改动',
|
||
description:
|
||
'使用真实 target/teacher 执行 4 个快照,检验 structured compare 是否能识别“更强边界约束”带来的真实收益,而不是只看表面措辞变化。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
targetModelKey: 'custom',
|
||
teacherModelKey: 'deepseek',
|
||
focus:
|
||
'优先判断改动是否真正减少了额外解释、格式边界滑移和输出结构不稳定,而不是只看表面完整度。',
|
||
previousPrompt: [
|
||
'你是一个严格的数据抽取助手。',
|
||
'阅读用户输入,输出一个 JSON 对象,包含以下字段:',
|
||
'- audience: string | null',
|
||
'- pain_points: string[]',
|
||
'- tone: string | null',
|
||
'要求:只返回 JSON。',
|
||
].join('\n'),
|
||
workspacePrompt: [
|
||
'你是一个严格的数据抽取助手。',
|
||
'你的任务是阅读用户输入,并输出一个且仅一个 JSON 对象。',
|
||
'JSON schema 必须为:',
|
||
'{"audience": string|null, "pain_points": string[], "tone": string|null}',
|
||
'规则:',
|
||
'1. 只输出 JSON 对象,不要输出 Markdown、解释、前后缀或代码块。',
|
||
'2. pain_points 只保留用户明确提到的问题,不要脑补。',
|
||
'3. 缺失信息时 audience 和 tone 用 null,pain_points 用 []。',
|
||
'4. 键名必须完全使用 audience、pain_points、tone。',
|
||
].join('\n'),
|
||
userInput:
|
||
'我在做一个给独立设计师用的合同管理工具,语气希望专业可信。现在最大的问题是版本混乱和客户确认来回很慢。请先解释你的判断依据,再给出结果。',
|
||
};
|
||
|
||
const SYNTHETIC_CASES = [
|
||
{
|
||
id: 'synthetic-medical-latent-trigger-overfit',
|
||
kind: 'synthetic',
|
||
title: '合成样本: 医疗分诊里的隐性触发过拟合',
|
||
description:
|
||
'workspace prompt 针对当前样例里的“胸口发紧 + 上楼更明显”加入了硬编码动作,看起来更果断,但 reference 侧并不支持这条触发规则。这个样本用于观察系统是否能把“样例贴合收益”与“可复用改进”区分开。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
focus:
|
||
'如果新规则只是在当前病例上更像“对题作答”,而不是更稳的分诊策略,应保留高风险或 review 结论,而不是盲目判为进步。',
|
||
targetWorkspacePrompt: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
targetPreviousPrompt: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
testCases: [
|
||
{
|
||
id: 'tc-1',
|
||
input: {
|
||
kind: 'text',
|
||
label: '分诊输入',
|
||
content:
|
||
'最近三天我一爬楼梯就胸口发紧,还会气短,休息十分钟会缓一些。请帮我判断现在该怎么处理。',
|
||
},
|
||
},
|
||
],
|
||
snapshots: [
|
||
{
|
||
id: 'a',
|
||
label: 'A',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Workspace' },
|
||
promptText: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
urgency: 'high',
|
||
next_step: '立即去急诊,并停止所有含咖啡因饮品。',
|
||
warning_signs: ['胸口发紧', '活动后加重', '气短'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace',
|
||
},
|
||
{
|
||
id: 'b',
|
||
label: 'B',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 7, label: 'Previous' },
|
||
promptText: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
urgency: 'high',
|
||
next_step: '今天尽快去线下急诊或胸痛门诊评估;若症状持续或加重,立即就医。',
|
||
warning_signs: ['胸口发紧', '活动后加重', '气短'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'v7',
|
||
},
|
||
{
|
||
id: 'c',
|
||
label: 'C',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
|
||
promptText: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
urgency: 'high',
|
||
next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。',
|
||
warning_signs: ['胸口发紧', '活动后加重', '气短'],
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-workspace',
|
||
},
|
||
{
|
||
id: 'd',
|
||
label: 'D',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 7, label: 'Teacher Previous' },
|
||
promptText: [
|
||
'你是医疗分诊建议助手。',
|
||
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
|
||
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
|
||
'不要添加解释或免责声明。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
urgency: 'high',
|
||
next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。',
|
||
warning_signs: ['胸口发紧', '活动后加重', '气短'],
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-v7',
|
||
},
|
||
],
|
||
compareHints: {
|
||
mode: 'structured',
|
||
snapshotRoles: {
|
||
a: 'target',
|
||
b: 'baseline',
|
||
c: 'reference',
|
||
d: 'referenceBaseline',
|
||
},
|
||
hasSharedTestCases: true,
|
||
hasSamePromptSnapshots: true,
|
||
hasCrossModelComparison: true,
|
||
},
|
||
expected: {
|
||
stopSignals: {
|
||
overfitRisk: ['high'],
|
||
stopRecommendation: ['review'],
|
||
},
|
||
pairSignals: {
|
||
targetBaseline: ['flat', 'improved'],
|
||
referenceBaseline: ['unsupported'],
|
||
},
|
||
conflictSignals: ['improvementNotSupportedOnReference'],
|
||
},
|
||
},
|
||
{
|
||
id: 'synthetic-ecommerce-schema-no-model-worship',
|
||
kind: 'synthetic',
|
||
title: '合成样本: 电商抽取里不能因为 teacher 更会写就忽略 schema',
|
||
description:
|
||
'workspace prompt 把既有商品抽取 contract 改成了新字段和外层 wrapper,teacher 输出也更像“高质量总结”。这个样本用于校验 compare 是否会坚持 schema/contract 优先,而不是因为 reference 更流畅就放过漂移。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
focus:
|
||
'即便 reference 看上去更完整、更自然,只要 prompt 改动造成字段名或外层结构变化,就应把 target 相对 baseline 判为回退。',
|
||
targetWorkspacePrompt: [
|
||
'你是电商商品信息抽取助手。',
|
||
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
|
||
'为了后续扩展,把三个字段统一包在 payload 里。',
|
||
'不要解释。',
|
||
].join('\n'),
|
||
targetPreviousPrompt: [
|
||
'你是电商商品信息抽取助手。',
|
||
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
|
||
'不要改字段名,不要添加外层包裹对象,不要解释。',
|
||
].join('\n'),
|
||
testCases: [
|
||
{
|
||
id: 'tc-1',
|
||
input: {
|
||
kind: 'text',
|
||
label: '商品文案',
|
||
content:
|
||
'便携手冲咖啡壶,容量 600ml,适合露营和办公室使用,主打双层不锈钢保温,注意不支持电磁炉直火加热。',
|
||
},
|
||
},
|
||
],
|
||
snapshots: [
|
||
{
|
||
id: 'a',
|
||
label: 'A',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Workspace' },
|
||
promptText: [
|
||
'你是电商商品信息抽取助手。',
|
||
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
|
||
'为了后续扩展,把三个字段统一包在 payload 里。',
|
||
'不要解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
payload: {
|
||
product_name: '便携手冲咖啡壶',
|
||
buyer_highlights: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
|
||
cautions: ['不支持电磁炉直火加热'],
|
||
},
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace',
|
||
},
|
||
{
|
||
id: 'b',
|
||
label: 'B',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 4, label: 'Previous' },
|
||
promptText: [
|
||
'你是电商商品信息抽取助手。',
|
||
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
|
||
'不要改字段名,不要添加外层包裹对象,不要解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
title: '便携手冲咖啡壶',
|
||
selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
|
||
cautions: ['不支持电磁炉直火加热'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'v4',
|
||
},
|
||
{
|
||
id: 'c',
|
||
label: 'C',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
|
||
promptText: [
|
||
'你是电商商品信息抽取助手。',
|
||
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
|
||
'为了后续扩展,把三个字段统一包在 payload 里。',
|
||
'不要解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
payload: {
|
||
product_name: '便携手冲咖啡壶',
|
||
buyer_highlights: [
|
||
'双场景使用:露营与办公室',
|
||
'600ml 大容量',
|
||
'双层不锈钢保温更稳',
|
||
],
|
||
cautions: ['不支持电磁炉直火加热'],
|
||
},
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-workspace',
|
||
},
|
||
{
|
||
id: 'd',
|
||
label: 'D',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 4, label: 'Teacher Previous' },
|
||
promptText: [
|
||
'你是电商商品信息抽取助手。',
|
||
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
|
||
'不要改字段名,不要添加外层包裹对象,不要解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
title: '便携手冲咖啡壶',
|
||
selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
|
||
cautions: ['不支持电磁炉直火加热'],
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-v4',
|
||
},
|
||
],
|
||
compareHints: {
|
||
mode: 'structured',
|
||
snapshotRoles: {
|
||
a: 'target',
|
||
b: 'baseline',
|
||
c: 'reference',
|
||
d: 'referenceBaseline',
|
||
},
|
||
hasSharedTestCases: true,
|
||
hasSamePromptSnapshots: true,
|
||
hasCrossModelComparison: true,
|
||
},
|
||
expected: {
|
||
stopSignals: {
|
||
targetVsBaseline: ['regressed'],
|
||
stopRecommendation: ['review'],
|
||
},
|
||
pairSignals: {
|
||
targetBaseline: ['regressed'],
|
||
targetReference: ['none', 'minor'],
|
||
referenceBaseline: ['unsupported'],
|
||
},
|
||
conflictSignals: ['regressionOutweighsCosmeticGains'],
|
||
},
|
||
},
|
||
{
|
||
id: 'synthetic-legal-flat-not-unclear',
|
||
kind: 'synthetic',
|
||
title: '合成样本: 法务风险摘要应该判 flat 而不是 unclear',
|
||
description:
|
||
'workspace prompt 只把表达风格改得更口语化,但目标输出与 previous 在风险结论和行动建议上没有实质变化。这个样本用于观察 judge 是否能稳定给出 flat,而不是因为措辞不同就退回 unclear。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
focus:
|
||
'当两个版本在核心结论、风险点和动作建议上等价时,应更倾向于 flat,而不是把风格差异误判成信息不足。',
|
||
targetWorkspacePrompt: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'用更简洁、偏业务同学可读的中文表达。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
targetPreviousPrompt: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'保持客观、精炼。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
testCases: [
|
||
{
|
||
id: 'tc-1',
|
||
input: {
|
||
kind: 'text',
|
||
label: '合同片段',
|
||
content:
|
||
'合作协议约定平台可单方修改结算周期,并在未通知的情况下暂停服务;违约责任仅约束供应商,不约束平台。',
|
||
},
|
||
},
|
||
],
|
||
snapshots: [
|
||
{
|
||
id: 'a',
|
||
label: 'A',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Workspace' },
|
||
promptText: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'用更简洁、偏业务同学可读的中文表达。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
risk_level: 'high',
|
||
core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任明显失衡'],
|
||
recommended_action: '要求补充通知义务、限制单方变更范围,并补齐平台违约责任。',
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace',
|
||
},
|
||
{
|
||
id: 'b',
|
||
label: 'B',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 6, label: 'Previous' },
|
||
promptText: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'保持客观、精炼。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
risk_level: 'high',
|
||
core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任分配失衡'],
|
||
recommended_action: '建议增加通知义务、限制单方修改权限,并要求平台承担对等违约责任。',
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'v6',
|
||
},
|
||
{
|
||
id: 'c',
|
||
label: 'C',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
|
||
promptText: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'用更简洁、偏业务同学可读的中文表达。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
risk_level: 'high',
|
||
core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任缺乏对等性'],
|
||
recommended_action: '要求把通知义务、变更边界和平台违约责任补齐后再推进。',
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-workspace',
|
||
},
|
||
{
|
||
id: 'd',
|
||
label: 'D',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 6, label: 'Teacher Previous' },
|
||
promptText: [
|
||
'你是法务风险摘要助手。',
|
||
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
|
||
'保持客观、精炼。',
|
||
'不要添加解释。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
risk_level: 'high',
|
||
core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任缺乏对等性'],
|
||
recommended_action: '建议补充通知义务、限制单方变更,并增加平台违约责任。',
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-v6',
|
||
},
|
||
],
|
||
compareHints: {
|
||
mode: 'structured',
|
||
snapshotRoles: {
|
||
a: 'target',
|
||
b: 'baseline',
|
||
c: 'reference',
|
||
d: 'referenceBaseline',
|
||
},
|
||
hasSharedTestCases: true,
|
||
hasSamePromptSnapshots: true,
|
||
hasCrossModelComparison: true,
|
||
},
|
||
expected: {
|
||
stopSignals: {
|
||
targetVsBaseline: ['flat'],
|
||
},
|
||
pairSignals: {
|
||
targetBaseline: ['flat'],
|
||
referenceBaseline: ['supported', 'mixed'],
|
||
},
|
||
},
|
||
},
|
||
{
|
||
id: 'synthetic-teaching-overfit-regression',
|
||
kind: 'synthetic',
|
||
title: '合成样本: 教学讲解里的样例口诀导致回退',
|
||
description:
|
||
'workspace prompt 为当前题目硬塞了特定口诀和固定讲法,导致输出只贴当前样例,不再给出通用原理。这个样本用于校验系统能否识别“看似更像老师在说话,实际更窄更脆弱”的回退。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
focus:
|
||
'如果工作区版本为了当前题目显得更顺口,却牺牲了可迁移的通用解释结构,应把它判为 regressed,并暴露较高过拟合风险。',
|
||
targetWorkspacePrompt: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
targetPreviousPrompt: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
testCases: [
|
||
{
|
||
id: 'tc-1',
|
||
input: {
|
||
kind: 'text',
|
||
label: '题目输入',
|
||
content: '请解释为什么 3/4 ÷ 1/2 = 3/2,给五年级学生听。',
|
||
},
|
||
},
|
||
],
|
||
snapshots: [
|
||
{
|
||
id: 'a',
|
||
label: 'A',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Workspace' },
|
||
promptText: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
explanation:
|
||
'因为半个单位能装两次,所以 3/4 里面能装 1/2 两次的一半,也就是 3/2。记住这句就行。',
|
||
key_rule: '看到 3/4 ÷ 1/2 时,直接理解成答案翻倍。',
|
||
common_mistake: '不要把 3/4 和 1/2 直接相除成 3/8。',
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace',
|
||
},
|
||
{
|
||
id: 'b',
|
||
label: 'B',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 8, label: 'Previous' },
|
||
promptText: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
explanation:
|
||
'除以 1/2 的意思是看 3/4 里面有几个“半个”。因为把一个数除以 1/2,等于看它能分成几个半个,所以可以改写成 3/4 × 2 = 3/2。',
|
||
key_rule: '除以一个分数,就是乘以这个分数的倒数。',
|
||
common_mistake: '不要把“除以 1/2”误算成“再除以 2”。',
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'v8',
|
||
},
|
||
{
|
||
id: 'c',
|
||
label: 'C',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
|
||
promptText: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
explanation:
|
||
'可以把 3/4 ÷ 1/2 理解成“3/4 里面有几个半个”,但仍然要告诉学生一般规则:除以 1/2 等于乘 2,所以结果是 3/2。',
|
||
key_rule: '除以一个分数,本质上是乘以它的倒数。',
|
||
common_mistake: '不要只背这个题的口诀,换别的分数就会出错。',
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-workspace',
|
||
},
|
||
{
|
||
id: 'd',
|
||
label: 'D',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 8, label: 'Teacher Previous' },
|
||
promptText: [
|
||
'你是数学讲解助手。',
|
||
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
|
||
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
|
||
'不要添加题外扩展。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
explanation:
|
||
'3/4 ÷ 1/2 的意思是“3/4 里面有几个半个”。因为除以 1/2 就是乘 2,所以可以算成 3/4 × 2 = 3/2。',
|
||
key_rule: '除以一个分数,就是乘以它的倒数。',
|
||
common_mistake: '不要把“除以 1/2”误写成“乘以 1/2”。',
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-v8',
|
||
},
|
||
],
|
||
compareHints: {
|
||
mode: 'structured',
|
||
snapshotRoles: {
|
||
a: 'target',
|
||
b: 'baseline',
|
||
c: 'reference',
|
||
d: 'referenceBaseline',
|
||
},
|
||
hasSharedTestCases: true,
|
||
hasSamePromptSnapshots: true,
|
||
hasCrossModelComparison: true,
|
||
},
|
||
expected: {
|
||
stopSignals: {
|
||
targetVsBaseline: ['regressed'],
|
||
overfitRisk: ['high'],
|
||
stopRecommendation: ['review'],
|
||
},
|
||
pairSignals: {
|
||
targetBaseline: ['regressed'],
|
||
referenceBaseline: ['unsupported'],
|
||
},
|
||
conflictSignals: ['regressionOutweighsCosmeticGains'],
|
||
},
|
||
},
|
||
{
|
||
id: 'synthetic-hiring-replica-semantic-instability',
|
||
kind: 'synthetic',
|
||
title: '合成样本: 招聘筛选里 replica 语义不稳定',
|
||
description:
|
||
'workspace prompt 在单次输出里看起来比 previous 更结构化,但同 prompt 的 replica 却给出了不同的录用结论。这个样本用于校验系统是否能识别“单次胜出但语义不稳定”的情况。',
|
||
mode: {
|
||
functionMode: 'basic',
|
||
subMode: 'system',
|
||
},
|
||
evaluatorModelKey: 'deepseek',
|
||
rewriteModelKey: 'deepseek',
|
||
focus:
|
||
'如果工作区版本在重复执行时连录用建议都发生漂移,就算首个结果更像样,也应优先暴露稳定性问题。',
|
||
targetWorkspacePrompt: [
|
||
'你是简历筛选总结助手。',
|
||
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
|
||
'recommendation 只能是 hire、hold、reject 之一。',
|
||
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
|
||
].join('\n'),
|
||
targetPreviousPrompt: [
|
||
'你是简历筛选总结助手。',
|
||
'输出 recommendation, strengths, risks。',
|
||
'结论尽量简洁。',
|
||
].join('\n'),
|
||
testCases: [
|
||
{
|
||
id: 'tc-1',
|
||
input: {
|
||
kind: 'text',
|
||
label: '候选人信息',
|
||
content:
|
||
'候选人 6 年 B2B SaaS 产品经验,做过企业权限系统和审计日志,英语一般,过去一年没有直接带人经验。岗位要求:熟悉权限/审计场景,能与海外客户沟通,优先有团队协作经验。',
|
||
},
|
||
},
|
||
],
|
||
snapshots: [
|
||
{
|
||
id: 'a',
|
||
label: 'A',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Workspace' },
|
||
promptText: [
|
||
'你是简历筛选总结助手。',
|
||
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
|
||
'recommendation 只能是 hire、hold、reject 之一。',
|
||
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
recommendation: 'hold',
|
||
strengths: ['有 6 年 B2B SaaS 产品经验', '做过权限系统和审计日志,和岗位高度相关'],
|
||
risks: ['英语一般,海外客户沟通能力待确认', '近期没有直接带人经验'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace',
|
||
},
|
||
{
|
||
id: 'b',
|
||
label: 'B',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 5, label: 'Previous' },
|
||
promptText: [
|
||
'你是简历筛选总结助手。',
|
||
'输出 recommendation, strengths, risks。',
|
||
'结论尽量简洁。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
recommendation: 'hold',
|
||
strengths: ['经验较匹配'],
|
||
risks: ['英语一般'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'v5',
|
||
},
|
||
{
|
||
id: 'c',
|
||
label: 'C',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
|
||
promptText: [
|
||
'你是简历筛选总结助手。',
|
||
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
|
||
'recommendation 只能是 hire、hold、reject 之一。',
|
||
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
recommendation: 'hold',
|
||
strengths: ['权限系统和审计日志经验与岗位核心场景强相关', 'B2B SaaS 背景成熟'],
|
||
risks: ['英语一般,跨海外客户沟通需进一步验证', '缺少近期直接管理经验'],
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-workspace',
|
||
},
|
||
{
|
||
id: 'd',
|
||
label: 'D',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'version', version: 5, label: 'Teacher Previous' },
|
||
promptText: [
|
||
'你是简历筛选总结助手。',
|
||
'输出 recommendation, strengths, risks。',
|
||
'结论尽量简洁。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
recommendation: 'hold',
|
||
strengths: ['岗位相关经验较多'],
|
||
risks: ['英语一般,管理经历偏弱'],
|
||
}),
|
||
modelKey: 'deepseek',
|
||
versionLabel: 'teacher-v5',
|
||
},
|
||
{
|
||
id: 'e',
|
||
label: 'E',
|
||
testCaseId: 'tc-1',
|
||
promptRef: { kind: 'workspace', label: 'Replica' },
|
||
promptText: [
|
||
'你是简历筛选总结助手。',
|
||
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
|
||
'recommendation 只能是 hire、hold、reject 之一。',
|
||
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
|
||
].join('\n'),
|
||
output: JSON.stringify({
|
||
recommendation: 'hire',
|
||
strengths: ['权限系统与审计日志经验高度匹配岗位核心需求', 'B2B SaaS 背景可直接上手复杂业务'],
|
||
risks: ['英语一般,但可通过团队支持弥补', '近一年缺少直接带人经验'],
|
||
}),
|
||
modelKey: 'custom',
|
||
versionLabel: 'workspace-replica',
|
||
},
|
||
],
|
||
compareHints: {
|
||
mode: 'structured',
|
||
snapshotRoles: {
|
||
a: 'target',
|
||
b: 'baseline',
|
||
c: 'reference',
|
||
d: 'referenceBaseline',
|
||
e: 'replica',
|
||
},
|
||
hasSharedTestCases: true,
|
||
hasSamePromptSnapshots: true,
|
||
hasCrossModelComparison: true,
|
||
},
|
||
expected: {
|
||
stopSignals: {
|
||
stopRecommendation: ['review'],
|
||
},
|
||
pairSignals: {
|
||
targetBaseline: ['improved', 'flat'],
|
||
targetReplica: ['unstable'],
|
||
},
|
||
conflictSignals: ['improvementUnstableAcrossReplicas'],
|
||
},
|
||
},
|
||
];
|
||
|
||
/**
 * Serialize a value as 2-space-indented JSON followed by a trailing newline.
 *
 * @param {unknown} value - Value to serialize.
 * @returns {string} JSON text ending in `\n`.
 */
function toPrettyJson(value) {
  // JSON.stringify returns undefined for undefined/functions/symbols; emit
  // `null` so the result is always valid JSON rather than the text "undefined".
  const serialized = JSON.stringify(value, null, 2) ?? 'null';
  return `${serialized}\n`;
}
|
||
|
||
/**
 * Render a value as a Markdown fenced code block tagged `json`.
 *
 * @param {unknown} value - Value to serialize with 2-space indentation.
 * @returns {string} Fenced block ending in a newline.
 */
function jsonFence(value) {
  // Fall back to `null` when JSON.stringify yields undefined (undefined/function/symbol).
  const body = JSON.stringify(value, null, 2) ?? 'null';

  // JSON strings may themselves contain backtick runs (e.g. stored model output);
  // use a fence strictly longer than the longest run so the block cannot be
  // closed early. The minimum stays at three backticks.
  let longestRun = 0;
  for (const [run] of body.matchAll(/`+/g)) {
    longestRun = Math.max(longestRun, run.length);
  }
  const fence = '`'.repeat(Math.max(3, longestRun + 1));

  return `${fence}json\n${body}\n${fence}\n`;
}
|
||
|
||
/**
 * Wrap text in a Markdown fenced code block.
 *
 * @param {unknown} value - Text to wrap; `null`/`undefined` render as an empty body,
 *   other values are stringified.
 * @param {string} [language=''] - Optional info string placed after the opening fence.
 * @returns {string} Fenced block ending in a newline.
 */
function textFence(value, language = '') {
  const body = String(value ?? '');

  // The wrapped text frequently contains its own ``` fences (model responses);
  // choose a fence strictly longer than any backtick run inside the body so the
  // outer block is not terminated prematurely. Never shorter than three.
  let longestRun = 0;
  for (const [run] of body.matchAll(/`+/g)) {
    longestRun = Math.max(longestRun, run.length);
  }
  const fence = '`'.repeat(Math.max(3, longestRun + 1));

  return `${fence}${language}\n${body}\n${fence}\n`;
}
|
||
|
||
/**
 * Create a directory, including any missing parents.
 *
 * Uses `fs.mkdir` with `recursive: true`, so calling it on an existing
 * directory is not an error.
 *
 * @param {string} target - Directory path to create.
 * @returns {Promise<void>}
 */
async function ensureDir(target) {
  await fs.mkdir(target, { recursive: true });
}
|
||
|
||
/**
 * Write UTF-8 text to a file, creating the parent directory first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} content - Text to write; replaces any existing file content.
 * @returns {Promise<void>}
 */
async function writeText(filePath, content) {
  const parentDir = path.dirname(filePath);
  await ensureDir(parentDir);
  await fs.writeFile(filePath, content, 'utf8');
}
|
||
|
||
/**
 * Render a list of chat messages as Markdown sections.
 *
 * Each message becomes a `### Message N` heading (1-based), a `- role:` bullet
 * and the message content inside a fenced block; sections are joined by a
 * single newline.
 *
 * @param {Array<{role: string, content: string}>} messages - Messages to render.
 * @returns {string} Markdown text (empty string for an empty list).
 */
function renderMessagesMarkdown(messages) {
  const sections = messages.map((message, index) => {
    const heading = `### Message ${index + 1}`;
    const roleLine = `- role: ${message.role}`;
    return `${heading}\n${roleLine}\n\n${textFence(message.content)}`;
  });

  return sections.join('\n');
}
|
||
|
||
/**
 * Render one logged LLM call as a Markdown section.
 *
 * @param {{phase: string, modelKey: string, messages: Array, response?: string, error?: string}} call
 *   Logged call entry. When `error` is truthy an "Error" block is rendered,
 *   otherwise a "Response" block (empty when there is no response).
 * @param {number} index - Zero-based position; rendered as `Call {index + 1}`.
 * @returns {string} Markdown text ending in a newline.
 */
function renderCallMarkdown(call, index) {
  let outcomeBlock;
  if (call.error) {
    outcomeBlock = `### Error\n${textFence(call.error)}`;
  } else {
    outcomeBlock = `### Response\n${textFence(call.response || '')}`;
  }

  const lines = [
    `## Call ${index + 1}`,
    `- phase: ${call.phase}`,
    `- modelKey: ${call.modelKey}`,
    '',
    '### Messages',
    renderMessagesMarkdown(call.messages),
    '',
    outcomeBlock,
    '',
  ];

  return lines.join('\n');
}
|
||
|
||
// Marker strings that precede an embedded JSON payload inside a prompt's user
// message. Each list is tried in order by the payload extraction helper.

// Marker preceding the evidence payload of a pair-judge prompt.
const PAIR_JUDGE_PAYLOAD_MARKERS = [
  'Pair Judge Evidence Payload (JSON):',
];

// Marker preceding the payload of the synthesis prompt.
const SYNTHESIS_PAYLOAD_MARKERS = [
  'Synthesis Payload (JSON):',
];

// Marker preceding the payload of a rewrite prompt.
const REWRITE_PAYLOAD_MARKERS = [
  'Rewrite Payload (JSON):',
];
|
||
|
||
/**
 * Extract a JSON value that follows one of the given marker strings.
 *
 * For each marker, in order, everything after the marker's first occurrence
 * (trimmed) is parsed as JSON. The first marker whose remainder parses wins.
 * A marker that is absent, has nothing after it, or is followed by text that
 * is not valid JSON is skipped so that later markers still get a chance.
 *
 * @param {string} content - Text to search; non-string input is treated as empty.
 * @param {string[]} markers - Candidate marker strings, in priority order.
 * @returns {*} The parsed JSON value, or `null` when no marker yields valid JSON.
 */
function extractJsonAfterMarker(content, markers) {
  const text = typeof content === 'string' ? content : '';

  for (const marker of markers) {
    const markerIndex = text.indexOf(marker);
    if (markerIndex === -1) continue;

    const candidate = text.slice(markerIndex + marker.length).trim();
    if (!candidate) continue;

    try {
      return JSON.parse(candidate);
    } catch (_error) {
      // Best-effort extraction: the text after this marker is not valid JSON.
      // Fall through to the next marker instead of aborting the whole search.
      continue;
    }
  }

  return null;
}
|
||
|
||
/**
 * Recover the JSON payloads that were embedded in logged prompt messages.
 *
 * Calls are classified by their `phase` string:
 * - every call whose phase starts with `pair-judge:` contributes one entry
 *   (entries whose payload could not be extracted are dropped);
 * - the first call whose phase equals `structured-compare-synthesis`;
 * - the first call whose phase starts with `rewrite:`.
 * The payload is read from the first message with role `user`.
 *
 * @param {Array<{phase?: string, messages?: Array<{role: string, content: string}>}>} llmCalls
 *   Logged LLM calls.
 * @returns {{pairJudgePayloads: Array<{phase: string, payload: *}>, synthesisPayload: *, rewritePayloadFromMessage: *}}
 *   Extracted payloads; the two single payloads are `null` when the call is
 *   missing or no payload could be parsed.
 */
function collectPromptPayloadArtifacts(llmCalls) {
  const phaseOf = (call) => String(call.phase || '');

  // Content of the first user-role message, or '' when there is none.
  const userContentOf = (call) =>
    call.messages?.find((message) => message.role === 'user')?.content || '';

  // Payload embedded in a call's user message; null when the call is absent.
  const payloadOf = (call, markers) =>
    call ? extractJsonAfterMarker(userContentOf(call), markers) : null;

  const pairJudgePayloads = llmCalls
    .filter((call) => phaseOf(call).startsWith('pair-judge:'))
    .map((call) => ({
      phase: call.phase,
      payload: payloadOf(call, PAIR_JUDGE_PAYLOAD_MARKERS),
    }))
    .filter((item) => item.payload);

  const synthesisCall = llmCalls.find(
    (call) => phaseOf(call) === 'structured-compare-synthesis',
  );
  const synthesisPayload = payloadOf(synthesisCall, SYNTHESIS_PAYLOAD_MARKERS);

  const rewriteCall = llmCalls.find((call) => phaseOf(call).startsWith('rewrite:'));
  const rewritePayloadFromMessage = payloadOf(rewriteCall, REWRITE_PAYLOAD_MARKERS);

  return {
    pairJudgePayloads,
    synthesisPayload,
    rewritePayloadFromMessage,
  };
}
|
||
|
||
/**
 * Condense an evaluation response into the fields recorded for a case.
 *
 * @param {{expected?: object}} caseConfig - Case definition; only `expected` is read.
 * @param {object} response - Evaluation response. Reads `summary`,
 *   `score.overall`, `improvements` and, from `metadata`, `compareMode`,
 *   `compareStopSignals`, `compareInsights.conflictSignals` and
 *   `compareJudgements`.
 * @returns {object} Summary object; absent values fall back to `null` or `[]`
 *   (`summary` is passed through unchanged).
 */
function summarizeCaseResult(caseConfig, response) {
  const metadata = response.metadata || {};

  // Keep only the four fields of each judgement that are reported.
  const judgements = metadata.compareJudgements || [];
  const pairJudgements = judgements.map(({ pairType, pairSignal, verdict, confidence }) => ({
    pairType,
    pairSignal,
    verdict,
    confidence,
  }));

  return {
    compareMode: metadata.compareMode || null,
    summary: response.summary,
    // `??` keeps a legitimate overall score of 0.
    score: response.score?.overall ?? null,
    improvements: response.improvements || [],
    stopSignals: metadata.compareStopSignals || null,
    conflictSignals: metadata.compareInsights?.conflictSignals || [],
    pairJudgements,
    expected: caseConfig.expected || null,
  };
}
|
||
|
||
/**
 * Compare an evaluation response against a case's expected signals.
 *
 * Three optional groups of expectations are checked, in this order:
 * - `stopSignals`: for each key, the response's stop signal must be truthy and
 *   one of the allowed values;
 * - `pairSignals`: for each pair type, at least one judgement of that type
 *   must carry an allowed `pairSignal`;
 * - `conflictSignals`: each listed signal must be present in the response.
 *
 * @param {object|null|undefined} expected - Expectation spec; falsy means "no assertions".
 * @param {object} response - Evaluation response; reads `metadata.compareStopSignals`,
 *   `metadata.compareJudgements` and `metadata.compareInsights.conflictSignals`.
 * @returns {Array<{type: string, key: string, expected: string[], actual: *, matched: boolean}>}
 *   One result row per checked expectation (empty when `expected` is falsy).
 */
function evaluateExpectations(expected, response) {
  if (!expected) {
    return [];
  }

  const metadata = response.metadata || {};
  const stopSignals = metadata.compareStopSignals || {};
  const pairJudgements = metadata.compareJudgements || [];
  const conflictSignals = metadata.compareInsights?.conflictSignals || [];
  const results = [];

  for (const [key, allowed] of Object.entries(expected.stopSignals || {})) {
    const actual = stopSignals[key];
    results.push({
      type: 'stopSignal',
      key,
      expected: allowed,
      actual: actual ?? null,
      // A missing/empty signal never matches, whatever the allowed list holds.
      matched: Boolean(actual) && allowed.includes(actual),
    });
  }

  for (const [pairType, allowed] of Object.entries(expected.pairSignals || {})) {
    // Several judgements may share a pair type; collect all of their signals.
    const actual = pairJudgements
      .filter((judgement) => judgement.pairType === pairType)
      .map((judgement) => judgement.pairSignal);
    results.push({
      type: 'pairSignal',
      key: pairType,
      expected: allowed,
      actual,
      matched: actual.some((signal) => allowed.includes(signal)),
    });
  }

  for (const signal of expected.conflictSignals || []) {
    results.push({
      type: 'conflictSignal',
      key: signal,
      expected: [signal],
      actual: conflictSignals,
      matched: conflictSignals.includes(signal),
    });
  }

  return results;
}
|
||
|
||
/**
 * Render expectation results as a Markdown table.
 *
 * @param {Array<{type: string, key: string, expected: *, actual: *, matched: boolean}>} expectationResults
 *   Result rows to render.
 * @returns {string} A five-column table (type, key, expected, actual, matched)
 *   ending in a newline, or a one-line notice when there are no rows.
 */
function renderExpectationMarkdown(expectationResults) {
  if (!expectationResults.length) {
    return '无预设断言,本样本用于探索式观察。\n';
  }

  // Arrays are shown as "a / b"; everything else is stringified as-is.
  const formatCell = (value) => (Array.isArray(value) ? value.join(' / ') : String(value));

  const header = '| 类型 | 键 | 期望 | 实际 | 是否命中 |\n| --- | --- | --- | --- | --- |\n';
  const rows = expectationResults
    .map((item) => {
      const hit = item.matched ? 'yes' : 'no';
      return `| ${item.type} | ${item.key} | ${formatCell(item.expected)} | ${formatCell(item.actual)} | ${hit} |`;
    })
    .join('\n');

  return `${header}${rows}\n`;
}
|
||
|
||
/**
 * Render the scenario overview of a calibration case as Markdown.
 *
 * @param {{title: string, id: string, kind: string, description: string, focus?: string}} caseConfig
 *   Case definition.
 * @returns {string} Markdown with a title heading, id/kind bullets, the
 *   description and a "Focus" section (a placeholder is used when `focus` is
 *   empty); ends in a newline.
 */
function renderScenarioMarkdown(caseConfig) {
  const { title, id, kind, description, focus } = caseConfig;

  const lines = [
    `# ${title}`,
    '',
    `- caseId: ${id}`,
    `- kind: ${kind}`,
    '',
    description,
    '',
    '## Focus',
    '',
    focus || '无',
    '',
  ];

  return lines.join('\n');
}
|
||
|
||
/**
 * Wrap an LLM service so that every `sendMessage` call is recorded and
 * transient failures are retried.
 *
 * Each recorded entry holds the detected phase, the model key, a copy of the
 * messages (role + content), the failed attempts, and either `response` or
 * `error`, plus `retryCount` (attempts made before the final one).
 *
 * @param {{sendMessage: Function, sendMessageStream: Function}} baseLLMService
 *   Underlying service that performs the real calls.
 * @returns {{sendMessage: Function, sendMessageStream: Function, withPhase: Function, getCalls: Function, clearCalls: Function}}
 *   Logging wrapper. `sendMessageStream` is a plain pass-through and is not recorded.
 */
function createLoggedLLMService(baseLLMService) {
  const calls = [];
  // Phase label applied to calls that are not recognised as evaluation prompts.
  // NOTE(review): this is a single shared variable, so overlapping `withPhase`
  // invocations would see each other's label — confirm callers run sequentially.
  let currentPhase = 'idle';

  // Classify a call from its prompt text: the first message is treated as the
  // system prompt and the second as the user prompt.
  const detectEvaluationPhase = (messages) => {
    const systemContent = messages?.[0]?.content || '';
    const userContent = messages?.[1]?.content || '';

    const isPairJudge =
      systemContent.includes('Structured_Compare_Pair_Judge') ||
      systemContent.includes('结构化对比成对判断专家');
    if (isPairJudge) {
      // The pair key appears either as a "Pair Key:" line (full-width U+FF1A or
      // ASCII colon) or as a `"pairKey": "..."` JSON property.
      const pairMatch =
        userContent.match(/Pair Key[\uFF1A:]\s*([^\n]+)/) ||
        userContent.match(/"pairKey"\s*:\s*"([^"]+)"/);
      return `pair-judge:${pairMatch?.[1]?.trim() || 'unknown'}`;
    }

    const isSynthesis =
      systemContent.includes('structured compare synthesizer') ||
      systemContent.includes('结构化对比综合专家');
    if (isSynthesis) {
      return 'structured-compare-synthesis';
    }

    return currentPhase;
  };

  const logged = {
    /**
     * Send messages through the base service, recording the call.
     * Retries up to CALIBRATION_MAX_RETRIES attempts while the error is
     * retryable, waiting `attempt * 3000` ms between attempts; the last error
     * is rethrown unchanged.
     */
    async sendMessage(messages, modelKey) {
      const entry = {
        phase: detectEvaluationPhase(messages),
        modelKey,
        attempts: [],
        messages: messages.map(({ role, content }) => ({ role, content })),
      };

      for (let attempt = 1; attempt <= CALIBRATION_MAX_RETRIES; attempt += 1) {
        try {
          const response = await baseLLMService.sendMessage(messages, modelKey);
          entry.response = response;
          entry.retryCount = attempt - 1;
          calls.push(entry);
          return response;
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error);
          entry.attempts.push({ attempt, error: errorMessage });

          const isLastAttempt = attempt >= CALIBRATION_MAX_RETRIES;
          if (isLastAttempt || !isRetryableCalibrationError(error)) {
            entry.error = errorMessage;
            entry.retryCount = attempt - 1;
            calls.push(entry);
            throw error;
          }

          // Linear back-off before the next attempt.
          await sleep(attempt * 3000);
        }
      }
    },

    /** Pass-through to the base service; streaming calls are not recorded. */
    async sendMessageStream(messages, modelKey, callbacks) {
      return baseLLMService.sendMessageStream(messages, modelKey, callbacks);
    },

    /**
     * Run `fn` with `phase` as the fallback phase label, restoring the
     * previous label afterwards even when `fn` throws.
     */
    async withPhase(phase, fn) {
      const previous = currentPhase;
      currentPhase = phase;
      try {
        return await fn();
      } finally {
        currentPhase = previous;
      }
    },

    /** Return a shallow copy of the recorded calls. */
    getCalls() {
      return calls.slice();
    },

    /** Drop all recorded calls. */
    clearCalls() {
      calls.length = 0;
    },
  };

  return logged;
}
|
||
|
||
/**
 * Build the service graph used by the calibration run.
 *
 * Creates the preference, template-language, template and model managers on
 * top of a `MemoryStorageProvider`, sets the template language to `zh-CN`,
 * verifies that the `custom` and `deepseek` models are enabled, overrides
 * their name, temperature (0.2) and timeout, and finally wires a logging LLM
 * service into the evaluation service.
 *
 * @returns {Promise<{modelManager: object, templateManager: object, llmService: object, evaluationService: object}>}
 * @throws {Error} When the `custom` or `deepseek` model is missing or disabled.
 */
async function createServices() {
  const storage = new MemoryStorageProvider();
  const preferenceService = createPreferenceService(storage);

  const languageService = createTemplateLanguageService(preferenceService);
  await languageService.initialize();
  await languageService.setLanguage('zh-CN');

  const templateManager = createTemplateManager(storage, languageService);
  const modelManager = createModelManager(storage);
  await modelManager.ensureInitialized();

  const customModel = await modelManager.getModel('custom');
  if (!customModel?.enabled) {
    throw new Error('custom 模型未启用,请检查 .env.local 中的 VITE_CUSTOM_API_* 配置。');
  }

  const deepseekModel = await modelManager.getModel('deepseek');
  if (!deepseekModel?.enabled) {
    throw new Error('deepseek 模型未启用,请检查 .env.local 中的 VITE_DEEPSEEK_API_KEY。');
  }

  // Existing overrides are preserved; only temperature and timeout are forced.
  const calibrationOverrides = (model) => ({
    ...(model.paramOverrides || {}),
    temperature: 0.2,
    timeout: CALIBRATION_TIMEOUT_MS,
  });

  await modelManager.updateModel('deepseek', {
    name: 'DeepSeek Chat (Calibration)',
    paramOverrides: calibrationOverrides(deepseekModel),
  });

  await modelManager.updateModel('custom', {
    name: 'SiliconFlow Qwen3-32B (Calibration)',
    paramOverrides: calibrationOverrides(customModel),
  });

  const baseLLMService = createLLMService(modelManager);
  const llmService = createLoggedLLMService(baseLLMService);
  const evaluationService = createEvaluationService(llmService, modelManager, templateManager);

  return {
    modelManager,
    templateManager,
    llmService,
    evaluationService,
  };
}
|
||
|
||
/**
 * Runs a live calibration case: executes the workspace and previous prompts on
 * the target and teacher models (snapshots `a`-`d`), then submits a structured
 * compare request built from those snapshots.
 *
 * @param {object} caseConfig Live case definition. Reads `id`, `userInput`,
 *   `targetModelKey`, `teacherModelKey`, `evaluatorModelKey`, `workspacePrompt`,
 *   `previousPrompt`, `focus` and `mode`.
 * @param {object} services Uses `llmService.withPhase`, `llmService.sendMessage`
 *   and `evaluationService.evaluate`.
 * @returns {Promise<{request: object, response: *, executions: object[]}>}
 *   The compare request, the evaluator's response, and the four executed
 *   snapshots in execution order.
 */
async function runLiveCase(caseConfig, services) {
  const executions = [];

  // Executes one prompt/model combination and records it as a snapshot.
  const runExecution = async ({ snapshotId, label, modelKey, promptText, promptRef, versionLabel }) => {
    const messages = [
      { role: 'system', content: promptText },
      { role: 'user', content: caseConfig.userInput },
    ];

    const output = await services.llmService.withPhase(`execute:${caseConfig.id}:${snapshotId}`, () =>
      services.llmService.sendMessage(messages, modelKey),
    );

    const snapshot = {
      id: snapshotId,
      label,
      testCaseId: 'tc-1',
      promptRef,
      promptText,
      output: output.trim(),
      modelKey,
      versionLabel,
    };
    executions.push(snapshot);
    return snapshot;
  };

  // Target model first (workspace, previous), then the teacher model with the same prompts.
  const snapshotPlan = [
    {
      snapshotId: 'a',
      label: 'A',
      modelKey: caseConfig.targetModelKey,
      promptText: caseConfig.workspacePrompt,
      promptRef: { kind: 'workspace', label: 'Target Workspace' },
      versionLabel: 'workspace',
    },
    {
      snapshotId: 'b',
      label: 'B',
      modelKey: caseConfig.targetModelKey,
      promptText: caseConfig.previousPrompt,
      promptRef: { kind: 'version', version: 1, label: 'Target Previous' },
      versionLabel: 'previous',
    },
    {
      snapshotId: 'c',
      label: 'C',
      modelKey: caseConfig.teacherModelKey,
      promptText: caseConfig.workspacePrompt,
      promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
      versionLabel: 'teacher-workspace',
    },
    {
      snapshotId: 'd',
      label: 'D',
      modelKey: caseConfig.teacherModelKey,
      promptText: caseConfig.previousPrompt,
      promptRef: { kind: 'version', version: 1, label: 'Teacher Previous' },
      versionLabel: 'teacher-previous',
    },
  ];

  // Deliberately sequential: withPhase keeps the active phase in one shared
  // variable, so overlapping executions would be logged under the wrong phase.
  const snapshots = [];
  for (const spec of snapshotPlan) {
    snapshots.push(await runExecution(spec));
  }

  const request = {
    type: 'compare',
    evaluationModelKey: caseConfig.evaluatorModelKey,
    mode: caseConfig.mode,
    focus: {
      content: caseConfig.focus,
      source: 'system',
      priority: 'highest',
    },
    target: {
      workspacePrompt: caseConfig.workspacePrompt,
    },
    testCases: [
      {
        id: 'tc-1',
        input: {
          kind: 'text',
          label: '用户输入',
          content: caseConfig.userInput,
        },
      },
    ],
    snapshots,
    compareHints: {
      mode: 'structured',
      snapshotRoles: {
        a: 'target',
        b: 'baseline',
        c: 'reference',
        d: 'referenceBaseline',
      },
      hasSharedTestCases: true,
      hasSamePromptSnapshots: true,
      hasCrossModelComparison: true,
    },
  };

  const response = await services.llmService.withPhase(`evaluate:${caseConfig.id}`, () =>
    services.evaluationService.evaluate(request),
  );

  return {
    request,
    response,
    executions,
  };
}
|
||
|
||
/**
 * Runs a synthetic calibration case: no model executions are performed; the
 * case's pre-authored test cases, snapshots and compare hints are forwarded
 * to the evaluation service as a compare request.
 *
 * @param {object} caseConfig Synthetic case definition. Reads `id`,
 *   `evaluatorModelKey`, `mode`, `focus`, `targetWorkspacePrompt`,
 *   `targetPreviousPrompt`, `testCases`, `snapshots` and `compareHints`.
 * @param {object} services Uses `llmService.withPhase` and
 *   `evaluationService.evaluate`.
 * @returns {Promise<{request: object, response: *}>} The request that was sent
 *   and the evaluator's response.
 */
async function runSyntheticCase(caseConfig, services) {
  const {
    id,
    evaluatorModelKey,
    mode,
    focus,
    targetWorkspacePrompt,
    targetPreviousPrompt,
    testCases,
    snapshots,
    compareHints,
  } = caseConfig;

  const request = {
    type: 'compare',
    evaluationModelKey: evaluatorModelKey,
    mode,
    focus: {
      content: focus,
      source: 'system',
      priority: 'highest',
    },
    target: {
      workspacePrompt: targetWorkspacePrompt,
      referencePrompt: targetPreviousPrompt,
    },
    testCases,
    snapshots,
    compareHints,
  };

  // Evaluate inside a named phase so the LLM calls it triggers are attributed to this case.
  const response = await services.llmService.withPhase(`evaluate:${id}`, () =>
    services.evaluationService.evaluate(request),
  );

  return {
    request,
    response,
  };
}
|
||
|
||
/**
 * Produces the rewrite for a finished case and writes every per-case artifact
 * into `<OUTPUT_ROOT>/<caseConfig.id>/`.
 *
 * Steps: build the rewrite payload and rewrite prompt from the compare result,
 * send the rewrite prompt to `caseConfig.rewriteModelKey`, collect the logged
 * LLM calls, evaluate expectations, then write the files one after another.
 *
 * @param {object} caseConfig Case definition. Live cases (`kind === 'live'`)
 *   supply `workspacePrompt` / `previousPrompt`; other cases supply
 *   `targetWorkspacePrompt` / `targetPreviousPrompt`.
 * @param {object} result Case run result. Reads `request`, `response` and the
 *   optional `executions` (written to `executions.json` only when present).
 * @param {object} services Uses `llmService.withPhase`, `llmService.sendMessage`
 *   and `llmService.getCalls`.
 * @returns {Promise<{caseSummary: *, expectationResults: *}>} The values
 *   returned by `summarizeCaseResult` and `evaluateExpectations`.
 */
async function writeCaseArtifacts(caseConfig, result, services) {
  const caseDir = path.join(OUTPUT_ROOT, caseConfig.id);
  await ensureDir(caseDir);

  // Live and synthetic cases store their prompts under different property names.
  const isLive = caseConfig.kind === 'live';
  const rewriteContext = {
    result: result.response,
    type: 'compare',
    mode: caseConfig.mode,
    language: 'zh',
    workspacePrompt: isLive ? caseConfig.workspacePrompt : caseConfig.targetWorkspacePrompt,
    referencePrompt: isLive ? caseConfig.previousPrompt : caseConfig.targetPreviousPrompt,
  };
  // Each builder gets its own copy so neither can observe mutations made by the other.
  const rewritePayload = buildRewritePayload({ ...rewriteContext });
  const rewriteInput = buildRewritePromptFromEvaluation({ ...rewriteContext });

  const rewriteOutput = await services.llmService.withPhase(`rewrite:${caseConfig.id}`, () =>
    services.llmService.sendMessage([{ role: 'user', content: rewriteInput }], caseConfig.rewriteModelKey),
  );
  const rewriteText = rewriteOutput.trim();

  // Read the call log only after the rewrite call so it is part of the log too.
  const llmCalls = services.llmService.getCalls();
  const promptPayloadArtifacts = collectPromptPayloadArtifacts(llmCalls);
  const expectationResults = evaluateExpectations(caseConfig.expected, result.response);
  const caseSummary = summarizeCaseResult(caseConfig, result.response);

  const summaryMarkdown = [
    `# ${caseConfig.title}`,
    '',
    `- caseId: ${caseConfig.id}`,
    `- kind: ${caseConfig.kind}`,
    `- generatedAt: ${nowIso}`,
    '',
    '## Description',
    '',
    caseConfig.description,
    '',
    '## Compare Result',
    '',
    jsonFence(caseSummary),
    '## Expectation Check',
    '',
    renderExpectationMarkdown(expectationResults),
    '',
    '## Rewrite Output',
    '',
    textFence(rewriteText),
  ].join('\n');

  // [file name, content] pairs in the order they are written.
  const artifacts = [
    ['scenario.md', renderScenarioMarkdown(caseConfig)],
    ['request.json', toPrettyJson(result.request)],
    ['request.md', jsonFence(result.request)],
    ['response.json', toPrettyJson(result.response)],
    ['response.md', jsonFence(result.response)],
    ['pair-judge-payloads.json', toPrettyJson(promptPayloadArtifacts.pairJudgePayloads)],
    ['synthesis-payload.json', toPrettyJson(promptPayloadArtifacts.synthesisPayload)],
    ['rewrite-payload.json', toPrettyJson(rewritePayload)],
    ['rewrite-input.txt', `${rewriteInput}\n`],
    ['rewrite-output.txt', `${rewriteText}\n`],
    ['llm-calls.json', toPrettyJson(llmCalls)],
    [
      'llm-calls.md',
      ['# LLM Calls', '', ...llmCalls.map((call, index) => renderCallMarkdown(call, index))].join('\n'),
    ],
    [
      'summary.json',
      toPrettyJson({
        generatedAt: nowIso,
        case: {
          id: caseConfig.id,
          title: caseConfig.title,
          kind: caseConfig.kind,
        },
        summary: caseSummary,
        expectationResults,
      }),
    ],
  ];
  if (result.executions) {
    artifacts.push(['executions.json', toPrettyJson(result.executions)]);
  }
  artifacts.push(['summary.md', summaryMarkdown]);

  for (const [fileName, content] of artifacts) {
    await writeText(path.join(caseDir, fileName), content);
  }

  return {
    caseSummary,
    expectationResults,
  };
}
|
||
|
||
/**
 * Writes the run-level `summary.json` and `summary.md` into `OUTPUT_ROOT`,
 * with one row per case.
 *
 * @param {object[]} results One entry per case, each carrying `caseConfig`
 *   (`id`, `title`, `kind`), `caseSummary` (`score`, optional `stopSignals`)
 *   and `expectationResults` (entries with a `matched` flag).
 * @returns {Promise<void>}
 */
async function writeOverallSummary(results) {
  const rows = results.map(({ caseConfig, caseSummary, expectationResults }) => {
    const total = expectationResults.length;
    // Cases without expectations are exploratory: both counters are reported as null.
    const hasExpectations = total > 0;
    const stopSignals = caseSummary.stopSignals;

    return {
      caseId: caseConfig.id,
      title: caseConfig.title,
      kind: caseConfig.kind,
      score: caseSummary.score,
      stopRecommendation: stopSignals?.stopRecommendation || null,
      targetVsBaseline: stopSignals?.targetVsBaseline || null,
      targetVsReferenceGap: stopSignals?.targetVsReferenceGap || null,
      expectationMatched: hasExpectations
        ? expectationResults.filter((entry) => entry.matched).length
        : null,
      expectationTotal: hasExpectations ? total : null,
    };
  });

  const tableRows = rows.map((row) => {
    const expectationText =
      row.expectationMatched === null ? 'exploratory' : `${row.expectationMatched}/${row.expectationTotal}`;
    return `| ${row.caseId} | ${row.kind} | ${row.score} | ${row.targetVsBaseline} | ${row.targetVsReferenceGap} | ${row.stopRecommendation} | ${expectationText} |`;
  });

  const markdown = [
    '# Structured Compare Calibration Summary',
    '',
    `- generatedAt: ${nowIso}`,
    `- outputRoot: ${OUTPUT_ROOT}`,
    '',
    '| Case | Kind | Score | targetVsBaseline | targetVsReferenceGap | stopRecommendation | Expectation Match |',
    '| --- | --- | --- | --- | --- | --- | --- |',
    ...tableRows,
    '',
    '## Notes',
    '',
    '- synthetic cases 用来检验 judge / synthesis 的提示词边界。',
    '- live case 用来观察真实 target/teacher 执行结果在 structured compare 下是否能收敛成合理结论。',
    '- 每个 case 子目录内都保存了 compare request、compare result、rewrite input / output,以及完整 LLM 调用日志。',
    '',
  ].join('\n');

  await writeText(path.join(OUTPUT_ROOT, 'summary.json'), toPrettyJson({
    generatedAt: nowIso,
    rows,
  }));
  await writeText(path.join(OUTPUT_ROOT, 'summary.md'), markdown);
}
|
||
|
||
/**
 * Runs the whole calibration: the live case first, then every synthetic case,
 * writing per-case artifacts after each case and the overall summary at the end.
 *
 * Services are created before the previous output directory is removed, so a
 * configuration failure (for example a model that is not enabled) leaves the
 * last successful `latest` artifacts in place instead of wiping them.
 *
 * @returns {Promise<void>}
 * @throws Propagates any error from service creation, case execution or
 *   artifact writing; cases are not retried or skipped.
 */
async function main() {
  // Validate configuration first; only a runnable setup may clear old output.
  const services = await createServices();

  await fs.rm(OUTPUT_ROOT, { recursive: true, force: true });
  await ensureDir(OUTPUT_ROOT);

  const cases = [LIVE_BASIC_SYSTEM_CASE, ...SYNTHETIC_CASES];
  const results = [];

  for (const caseConfig of cases) {
    // Start each case with an empty call log so its artifacts only contain its own calls.
    services.llmService.clearCalls();

    const runCase = caseConfig.kind === 'live' ? runLiveCase : runSyntheticCase;
    const result = await runCase(caseConfig, services);
    const artifacts = await writeCaseArtifacts(caseConfig, result, services);

    results.push({
      caseConfig,
      ...artifacts,
    });
  }

  await writeOverallSummary(results);
}
|
||
|
||
// Script entry point: log the failure and set a non-zero exit code
// (exitCode rather than process.exit, so the process is not force-terminated).
main().catch((error) => {
  console.error('[structured-compare-calibration] failed:', error);
  process.exitCode = 1;
});
|