Files
prompt-optimizer/scripts/run-structured-compare-calibration.mjs

1489 lines
51 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import dotenv from 'dotenv';
import {
MemoryStorageProvider,
createPreferenceService,
createTemplateLanguageService,
createTemplateManager,
createModelManager,
createLLMService,
createEvaluationService,
buildRewritePayload,
buildRewritePromptFromEvaluation,
} from '../packages/core/dist/index.js';
// Repository root: the parent of the directory that contains this script.
const ROOT_DIR = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..');
// Directory under which every calibration case gets its own artifact folder.
const OUTPUT_ROOT = path.join(
ROOT_DIR,
'docs',
'workspace',
'compare-evaluation-analysis',
'structured-compare-calibration',
'latest',
);
// Load environment variables from `.env.local` at the repository root.
dotenv.config({ path: path.join(ROOT_DIR, '.env.local') });
// Timestamp captured once at module load; reused as `generatedAt` in the written summaries.
const nowIso = new Date().toISOString();
// Timeout in milliseconds written into each calibration model's `paramOverrides.timeout`.
const CALIBRATION_TIMEOUT_MS = 180000;
// Upper bound on attempts per logged `sendMessage` call (the first attempt counts).
const CALIBRATION_MAX_RETRIES = 3;
/**
 * Decide whether a failed LLM call looks transient and is worth retrying.
 *
 * The check is a plain substring match on the error text, so any message that
 * merely contains one of the fragments (for example the digits "429") counts.
 *
 * @param {unknown} error - Thrown value; `Error` instances contribute their `message`,
 *   anything else is stringified (falsy values become the empty string).
 * @returns {boolean} `true` when the text contains a known timeout, connection-reset,
 *   or 429/502/503 fragment.
 */
function isRetryableCalibrationError(error) {
  const retryableFragments = [
    'Socket timeout',
    'ERR_SOCKET_TIMEOUT',
    'ETIMEDOUT',
    'ECONNRESET',
    '429',
    '502',
    '503',
  ];

  let text;
  if (error instanceof Error) {
    text = error.message;
  } else {
    text = String(error || '');
  }

  return retryableFragments.some((fragment) => text.includes(fragment));
}
/**
 * Pause the calling async function.
 *
 * @param {number} ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns {Promise<void>} Settles once the timer fires.
 */
async function sleep(ms) {
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
// Live calibration case: unlike the synthetic cases, it carries no pre-recorded
// snapshots. It supplies two prompts (previous / workspace), one user input, and the
// model keys used for the target, teacher, evaluator and rewrite calls.
const LIVE_BASIC_SYSTEM_CASE = {
id: 'live-basic-system-boundary-control',
kind: 'live',
title: '真实模型: basic-system 边界控制改动',
description:
'使用真实 target/teacher 执行 4 个快照,检验 structured compare 是否能识别“更强边界约束”带来的真实收益,而不是只看表面措辞变化。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
targetModelKey: 'custom',
teacherModelKey: 'deepseek',
focus:
'优先判断改动是否真正减少了额外解释、格式边界滑移和输出结构不稳定,而不是只看表面完整度。',
// Baseline prompt: a loose JSON-extraction instruction.
previousPrompt: [
'你是一个严格的数据抽取助手。',
'阅读用户输入,输出一个 JSON 对象,包含以下字段:',
'- audience: string | null',
'- pain_points: string[]',
'- tone: string | null',
'要求:只返回 JSON。',
].join('\n'),
// Candidate prompt: same task with an explicit schema and numbered boundary rules.
workspacePrompt: [
'你是一个严格的数据抽取助手。',
'你的任务是阅读用户输入,并输出一个且仅一个 JSON 对象。',
'JSON schema 必须为:',
'{"audience": string|null, "pain_points": string[], "tone": string|null}',
'规则:',
'1. 只输出 JSON 对象,不要输出 Markdown、解释、前后缀或代码块。',
'2. pain_points 只保留用户明确提到的问题,不要脑补。',
// NOTE(review): "nullpain_points" in the next line looks like a separator went missing
// between "null" and "pain_points" — confirm against the intended prompt text.
'3. 缺失信息时 audience 和 tone 用 nullpain_points 用 []。',
'4. 键名必须完全使用 audience、pain_points、tone。',
].join('\n'),
// The input ends by asking for an explanation before the result, which pulls
// against the prompts' JSON-only requirement.
userInput:
'我在做一个给独立设计师用的合同管理工具,语气希望专业可信。现在最大的问题是版本混乱和客户确认来回很慢。请先解释你的判断依据,再给出结果。',
};
const SYNTHETIC_CASES = [
// Synthetic case (medical triage): the workspace prompt adds a hard-coded rule keyed on
// phrases from the current sample, while the previous prompt explicitly forbids
// sample-word triggers. The teacher-model outputs (C and D) are identical to each other.
{
id: 'synthetic-medical-latent-trigger-overfit',
kind: 'synthetic',
title: '合成样本: 医疗分诊里的隐性触发过拟合',
description:
'workspace prompt 针对当前样例里的“胸口发紧 + 上楼更明显”加入了硬编码动作,看起来更果断,但 reference 侧并不支持这条触发规则。这个样本用于观察系统是否能把“样例贴合收益”与“可复用改进”区分开。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
focus:
'如果新规则只是在当前病例上更像“对题作答”,而不是更稳的分诊策略,应保留高风险或 review 结论,而不是盲目判为进步。',
// Prompt under evaluation; the same text is repeated in snapshots A and C.
targetWorkspacePrompt: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'特别规则只要用户同时提到“胸口发紧”和“上楼更明显”next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
'不要添加解释或免责声明。',
].join('\n'),
// Baseline prompt; the same text is repeated in snapshots B and D.
targetPreviousPrompt: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
'不要添加解释或免责声明。',
].join('\n'),
testCases: [
{
id: 'tc-1',
input: {
kind: 'text',
label: '分诊输入',
content:
'最近三天我一爬楼梯就胸口发紧,还会气短,休息十分钟会缓一些。请帮我判断现在该怎么处理。',
},
},
],
// Pre-recorded outputs; the role of each snapshot id is assigned in compareHints.snapshotRoles.
snapshots: [
{
id: 'a',
label: 'A',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Workspace' },
promptText: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'特别规则只要用户同时提到“胸口发紧”和“上楼更明显”next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
'不要添加解释或免责声明。',
].join('\n'),
output: JSON.stringify({
urgency: 'high',
next_step: '立即去急诊,并停止所有含咖啡因饮品。',
warning_signs: ['胸口发紧', '活动后加重', '气短'],
}),
modelKey: 'custom',
versionLabel: 'workspace',
},
{
id: 'b',
label: 'B',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 7, label: 'Previous' },
promptText: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
'不要添加解释或免责声明。',
].join('\n'),
output: JSON.stringify({
urgency: 'high',
next_step: '今天尽快去线下急诊或胸痛门诊评估;若症状持续或加重,立即就医。',
warning_signs: ['胸口发紧', '活动后加重', '气短'],
}),
modelKey: 'custom',
versionLabel: 'v7',
},
{
id: 'c',
label: 'C',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
promptText: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'特别规则只要用户同时提到“胸口发紧”和“上楼更明显”next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。',
'不要添加解释或免责声明。',
].join('\n'),
output: JSON.stringify({
urgency: 'high',
next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。',
warning_signs: ['胸口发紧', '活动后加重', '气短'],
}),
modelKey: 'deepseek',
versionLabel: 'teacher-workspace',
},
{
id: 'd',
label: 'D',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 7, label: 'Teacher Previous' },
promptText: [
'你是医疗分诊建议助手。',
'输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。',
'基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。',
'不要添加解释或免责声明。',
].join('\n'),
output: JSON.stringify({
urgency: 'high',
next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。',
warning_signs: ['胸口发紧', '活动后加重', '气短'],
}),
modelKey: 'deepseek',
versionLabel: 'teacher-v7',
},
],
compareHints: {
mode: 'structured',
snapshotRoles: {
a: 'target',
b: 'baseline',
c: 'reference',
d: 'referenceBaseline',
},
hasSharedTestCases: true,
hasSamePromptSnapshots: true,
hasCrossModelComparison: true,
},
// Each list holds the accepted values for that key; evaluateExpectations checks the
// actual result against them.
expected: {
stopSignals: {
overfitRisk: ['high'],
stopRecommendation: ['review'],
},
pairSignals: {
targetBaseline: ['flat', 'improved'],
referenceBaseline: ['unsupported'],
},
conflictSignals: ['improvementNotSupportedOnReference'],
},
},
// Synthetic case (e-commerce extraction): the workspace prompt renames fields and wraps
// them in a `payload` object, whereas the previous prompt forbids both changes. The case
// expects the target to be judged as regressed against the baseline.
{
id: 'synthetic-ecommerce-schema-no-model-worship',
kind: 'synthetic',
title: '合成样本: 电商抽取里不能因为 teacher 更会写就忽略 schema',
description:
'workspace prompt 把既有商品抽取 contract 改成了新字段和外层 wrapperteacher 输出也更像“高质量总结”。这个样本用于校验 compare 是否会坚持 schema/contract 优先,而不是因为 reference 更流畅就放过漂移。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
focus:
'即便 reference 看上去更完整、更自然,只要 prompt 改动造成字段名或外层结构变化,就应把 target 相对 baseline 判为回退。',
// Prompt under evaluation; the same text is repeated in snapshots A and C.
targetWorkspacePrompt: [
'你是电商商品信息抽取助手。',
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
'为了后续扩展,把三个字段统一包在 payload 里。',
'不要解释。',
].join('\n'),
// Baseline prompt; the same text is repeated in snapshots B and D.
targetPreviousPrompt: [
'你是电商商品信息抽取助手。',
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
'不要改字段名,不要添加外层包裹对象,不要解释。',
].join('\n'),
testCases: [
{
id: 'tc-1',
input: {
kind: 'text',
label: '商品文案',
content:
'便携手冲咖啡壶,容量 600ml适合露营和办公室使用主打双层不锈钢保温注意不支持电磁炉直火加热。',
},
},
],
// Pre-recorded outputs; the role of each snapshot id is assigned in compareHints.snapshotRoles.
snapshots: [
{
id: 'a',
label: 'A',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Workspace' },
promptText: [
'你是电商商品信息抽取助手。',
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
'为了后续扩展,把三个字段统一包在 payload 里。',
'不要解释。',
].join('\n'),
output: JSON.stringify({
payload: {
product_name: '便携手冲咖啡壶',
buyer_highlights: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
cautions: ['不支持电磁炉直火加热'],
},
}),
modelKey: 'custom',
versionLabel: 'workspace',
},
{
id: 'b',
label: 'B',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 4, label: 'Previous' },
promptText: [
'你是电商商品信息抽取助手。',
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
'不要改字段名,不要添加外层包裹对象,不要解释。',
].join('\n'),
output: JSON.stringify({
title: '便携手冲咖啡壶',
selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
cautions: ['不支持电磁炉直火加热'],
}),
modelKey: 'custom',
versionLabel: 'v4',
},
{
id: 'c',
label: 'C',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
promptText: [
'你是电商商品信息抽取助手。',
'输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。',
'为了后续扩展,把三个字段统一包在 payload 里。',
'不要解释。',
].join('\n'),
output: JSON.stringify({
payload: {
product_name: '便携手冲咖啡壶',
buyer_highlights: [
'双场景使用:露营与办公室',
'600ml 大容量',
'双层不锈钢保温更稳',
],
cautions: ['不支持电磁炉直火加热'],
},
}),
modelKey: 'deepseek',
versionLabel: 'teacher-workspace',
},
{
id: 'd',
label: 'D',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 4, label: 'Teacher Previous' },
promptText: [
'你是电商商品信息抽取助手。',
'只输出 JSON 对象,字段必须为 title, selling_points, cautions。',
'不要改字段名,不要添加外层包裹对象,不要解释。',
].join('\n'),
output: JSON.stringify({
title: '便携手冲咖啡壶',
selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'],
cautions: ['不支持电磁炉直火加热'],
}),
modelKey: 'deepseek',
versionLabel: 'teacher-v4',
},
],
compareHints: {
mode: 'structured',
snapshotRoles: {
a: 'target',
b: 'baseline',
c: 'reference',
d: 'referenceBaseline',
},
hasSharedTestCases: true,
hasSamePromptSnapshots: true,
hasCrossModelComparison: true,
},
// Each list holds the accepted values for that key; evaluateExpectations checks the
// actual result against them.
expected: {
stopSignals: {
targetVsBaseline: ['regressed'],
stopRecommendation: ['review'],
},
pairSignals: {
targetBaseline: ['regressed'],
targetReference: ['none', 'minor'],
referenceBaseline: ['unsupported'],
},
conflictSignals: ['regressionOutweighsCosmeticGains'],
},
},
// Synthetic case (legal risk summary): the two prompts differ only in the tone
// instruction, and all four recorded outputs report the same risk level with reworded
// risks and actions. The case expects a `flat` verdict; it defines no conflictSignals.
{
id: 'synthetic-legal-flat-not-unclear',
kind: 'synthetic',
title: '合成样本: 法务风险摘要应该判 flat 而不是 unclear',
description:
'workspace prompt 只把表达风格改得更口语化,但目标输出与 previous 在风险结论和行动建议上没有实质变化。这个样本用于观察 judge 是否能稳定给出 flat而不是因为措辞不同就退回 unclear。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
focus:
'当两个版本在核心结论、风险点和动作建议上等价时,应更倾向于 flat而不是把风格差异误判成信息不足。',
// Prompt under evaluation; the same text is repeated in snapshots A and C.
targetWorkspacePrompt: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'用更简洁、偏业务同学可读的中文表达。',
'不要添加解释。',
].join('\n'),
// Baseline prompt; the same text is repeated in snapshots B and D.
targetPreviousPrompt: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'保持客观、精炼。',
'不要添加解释。',
].join('\n'),
testCases: [
{
id: 'tc-1',
input: {
kind: 'text',
label: '合同片段',
content:
'合作协议约定平台可单方修改结算周期,并在未通知的情况下暂停服务;违约责任仅约束供应商,不约束平台。',
},
},
],
// Pre-recorded outputs; the role of each snapshot id is assigned in compareHints.snapshotRoles.
snapshots: [
{
id: 'a',
label: 'A',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Workspace' },
promptText: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'用更简洁、偏业务同学可读的中文表达。',
'不要添加解释。',
].join('\n'),
output: JSON.stringify({
risk_level: 'high',
core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任明显失衡'],
recommended_action: '要求补充通知义务、限制单方变更范围,并补齐平台违约责任。',
}),
modelKey: 'custom',
versionLabel: 'workspace',
},
{
id: 'b',
label: 'B',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 6, label: 'Previous' },
promptText: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'保持客观、精炼。',
'不要添加解释。',
].join('\n'),
output: JSON.stringify({
risk_level: 'high',
core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任分配失衡'],
recommended_action: '建议增加通知义务、限制单方修改权限,并要求平台承担对等违约责任。',
}),
modelKey: 'custom',
versionLabel: 'v6',
},
{
id: 'c',
label: 'C',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
promptText: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'用更简洁、偏业务同学可读的中文表达。',
'不要添加解释。',
].join('\n'),
output: JSON.stringify({
risk_level: 'high',
core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任缺乏对等性'],
recommended_action: '要求把通知义务、变更边界和平台违约责任补齐后再推进。',
}),
modelKey: 'deepseek',
versionLabel: 'teacher-workspace',
},
{
id: 'd',
label: 'D',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 6, label: 'Teacher Previous' },
promptText: [
'你是法务风险摘要助手。',
'输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。',
'保持客观、精炼。',
'不要添加解释。',
].join('\n'),
output: JSON.stringify({
risk_level: 'high',
core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任缺乏对等性'],
recommended_action: '建议补充通知义务、限制单方变更,并增加平台违约责任。',
}),
modelKey: 'deepseek',
versionLabel: 'teacher-v6',
},
],
compareHints: {
mode: 'structured',
snapshotRoles: {
a: 'target',
b: 'baseline',
c: 'reference',
d: 'referenceBaseline',
},
hasSharedTestCases: true,
hasSamePromptSnapshots: true,
hasCrossModelComparison: true,
},
// Each list holds the accepted values for that key; evaluateExpectations checks the
// actual result against them.
expected: {
stopSignals: {
targetVsBaseline: ['flat'],
},
pairSignals: {
targetBaseline: ['flat'],
referenceBaseline: ['supported', 'mixed'],
},
},
},
// Synthetic case (math tutoring): the workspace prompt hard-codes a mnemonic for the
// exact problem "3/4 ÷ 1/2" and tells the model to skip the general rule, while the
// previous prompt asks for the general rule first. The case expects `regressed` plus a
// high overfit risk.
{
id: 'synthetic-teaching-overfit-regression',
kind: 'synthetic',
title: '合成样本: 教学讲解里的样例口诀导致回退',
description:
'workspace prompt 为当前题目硬塞了特定口诀和固定讲法,导致输出只贴当前样例,不再给出通用原理。这个样本用于校验系统能否识别“看似更像老师在说话,实际更窄更脆弱”的回退。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
focus:
'如果工作区版本为了当前题目显得更顺口,却牺牲了可迁移的通用解释结构,应把它判为 regressed并暴露较高过拟合风险。',
// Prompt under evaluation; the same text is repeated in snapshots A and C.
targetWorkspacePrompt: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'特别规则当题目出现“3/4 ÷ 1/2”时直接用“半个单位能装两次所以答案翻倍”来讲不要再解释一般规则。',
'不要添加题外扩展。',
].join('\n'),
// Baseline prompt; the same text is repeated in snapshots B and D.
targetPreviousPrompt: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
'不要添加题外扩展。',
].join('\n'),
testCases: [
{
id: 'tc-1',
input: {
kind: 'text',
label: '题目输入',
content: '请解释为什么 3/4 ÷ 1/2 = 3/2给五年级学生听。',
},
},
],
// Pre-recorded outputs; the role of each snapshot id is assigned in compareHints.snapshotRoles.
snapshots: [
{
id: 'a',
label: 'A',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Workspace' },
promptText: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'特别规则当题目出现“3/4 ÷ 1/2”时直接用“半个单位能装两次所以答案翻倍”来讲不要再解释一般规则。',
'不要添加题外扩展。',
].join('\n'),
output: JSON.stringify({
explanation:
'因为半个单位能装两次,所以 3/4 里面能装 1/2 两次的一半,也就是 3/2。记住这句就行。',
key_rule: '看到 3/4 ÷ 1/2 时,直接理解成答案翻倍。',
common_mistake: '不要把 3/4 和 1/2 直接相除成 3/8。',
}),
modelKey: 'custom',
versionLabel: 'workspace',
},
{
id: 'b',
label: 'B',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 8, label: 'Previous' },
promptText: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
'不要添加题外扩展。',
].join('\n'),
output: JSON.stringify({
explanation:
'除以 1/2 的意思是看 3/4 里面有几个“半个”。因为把一个数除以 1/2等于看它能分成几个半个所以可以改写成 3/4 × 2 = 3/2。',
key_rule: '除以一个分数,就是乘以这个分数的倒数。',
common_mistake: '不要把“除以 1/2”误算成“再除以 2”。',
}),
modelKey: 'custom',
versionLabel: 'v8',
},
{
id: 'c',
label: 'C',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
promptText: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'特别规则当题目出现“3/4 ÷ 1/2”时直接用“半个单位能装两次所以答案翻倍”来讲不要再解释一般规则。',
'不要添加题外扩展。',
].join('\n'),
output: JSON.stringify({
explanation:
'可以把 3/4 ÷ 1/2 理解成“3/4 里面有几个半个”,但仍然要告诉学生一般规则:除以 1/2 等于乘 2所以结果是 3/2。',
key_rule: '除以一个分数,本质上是乘以它的倒数。',
common_mistake: '不要只背这个题的口诀,换别的分数就会出错。',
}),
modelKey: 'deepseek',
versionLabel: 'teacher-workspace',
},
{
id: 'd',
label: 'D',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 8, label: 'Teacher Previous' },
promptText: [
'你是数学讲解助手。',
'输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。',
'先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。',
'不要添加题外扩展。',
].join('\n'),
output: JSON.stringify({
explanation:
'3/4 ÷ 1/2 的意思是“3/4 里面有几个半个”。因为除以 1/2 就是乘 2所以可以算成 3/4 × 2 = 3/2。',
key_rule: '除以一个分数,就是乘以它的倒数。',
common_mistake: '不要把“除以 1/2”误写成“乘以 1/2”。',
}),
modelKey: 'deepseek',
versionLabel: 'teacher-v8',
},
],
compareHints: {
mode: 'structured',
snapshotRoles: {
a: 'target',
b: 'baseline',
c: 'reference',
d: 'referenceBaseline',
},
hasSharedTestCases: true,
hasSamePromptSnapshots: true,
hasCrossModelComparison: true,
},
// Each list holds the accepted values for that key; evaluateExpectations checks the
// actual result against them.
expected: {
stopSignals: {
targetVsBaseline: ['regressed'],
overfitRisk: ['high'],
stopRecommendation: ['review'],
},
pairSignals: {
targetBaseline: ['regressed'],
referenceBaseline: ['unsupported'],
},
conflictSignals: ['regressionOutweighsCosmeticGains'],
},
},
// Synthetic case (resume screening): the only case here with a fifth snapshot. Snapshot E
// reuses the workspace prompt and the same model key as A but records `hire` where A
// records `hold`, so the case expects an `unstable` target/replica signal.
{
id: 'synthetic-hiring-replica-semantic-instability',
kind: 'synthetic',
title: '合成样本: 招聘筛选里 replica 语义不稳定',
description:
'workspace prompt 在单次输出里看起来比 previous 更结构化,但同 prompt 的 replica 却给出了不同的录用结论。这个样本用于校验系统是否能识别“单次胜出但语义不稳定”的情况。',
mode: {
functionMode: 'basic',
subMode: 'system',
},
evaluatorModelKey: 'deepseek',
rewriteModelKey: 'deepseek',
focus:
'如果工作区版本在重复执行时连录用建议都发生漂移,就算首个结果更像样,也应优先暴露稳定性问题。',
// Prompt under evaluation; the same text is repeated in snapshots A, C and E.
targetWorkspacePrompt: [
'你是简历筛选总结助手。',
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
'recommendation 只能是 hire、hold、reject 之一。',
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
].join('\n'),
// Baseline prompt; the same text is repeated in snapshots B and D.
targetPreviousPrompt: [
'你是简历筛选总结助手。',
'输出 recommendation, strengths, risks。',
'结论尽量简洁。',
].join('\n'),
testCases: [
{
id: 'tc-1',
input: {
kind: 'text',
label: '候选人信息',
content:
'候选人 6 年 B2B SaaS 产品经验,做过企业权限系统和审计日志,英语一般,过去一年没有直接带人经验。岗位要求:熟悉权限/审计场景,能与海外客户沟通,优先有团队协作经验。',
},
},
],
// Pre-recorded outputs; the role of each snapshot id is assigned in compareHints.snapshotRoles.
snapshots: [
{
id: 'a',
label: 'A',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Workspace' },
promptText: [
'你是简历筛选总结助手。',
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
'recommendation 只能是 hire、hold、reject 之一。',
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
].join('\n'),
output: JSON.stringify({
recommendation: 'hold',
strengths: ['有 6 年 B2B SaaS 产品经验', '做过权限系统和审计日志,和岗位高度相关'],
risks: ['英语一般,海外客户沟通能力待确认', '近期没有直接带人经验'],
}),
modelKey: 'custom',
versionLabel: 'workspace',
},
{
id: 'b',
label: 'B',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 5, label: 'Previous' },
promptText: [
'你是简历筛选总结助手。',
'输出 recommendation, strengths, risks。',
'结论尽量简洁。',
].join('\n'),
output: JSON.stringify({
recommendation: 'hold',
strengths: ['经验较匹配'],
risks: ['英语一般'],
}),
modelKey: 'custom',
versionLabel: 'v5',
},
{
id: 'c',
label: 'C',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
promptText: [
'你是简历筛选总结助手。',
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
'recommendation 只能是 hire、hold、reject 之一。',
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
].join('\n'),
output: JSON.stringify({
recommendation: 'hold',
strengths: ['权限系统和审计日志经验与岗位核心场景强相关', 'B2B SaaS 背景成熟'],
risks: ['英语一般,跨海外客户沟通需进一步验证', '缺少近期直接管理经验'],
}),
modelKey: 'deepseek',
versionLabel: 'teacher-workspace',
},
{
id: 'd',
label: 'D',
testCaseId: 'tc-1',
promptRef: { kind: 'version', version: 5, label: 'Teacher Previous' },
promptText: [
'你是简历筛选总结助手。',
'输出 recommendation, strengths, risks。',
'结论尽量简洁。',
].join('\n'),
output: JSON.stringify({
recommendation: 'hold',
strengths: ['岗位相关经验较多'],
risks: ['英语一般,管理经历偏弱'],
}),
modelKey: 'deepseek',
versionLabel: 'teacher-v5',
},
// Replica run: same prompt text and model key as snapshot A, different recommendation.
{
id: 'e',
label: 'E',
testCaseId: 'tc-1',
promptRef: { kind: 'workspace', label: 'Replica' },
promptText: [
'你是简历筛选总结助手。',
'只输出 JSON 对象,字段为 recommendation, strengths, risks。',
'recommendation 只能是 hire、hold、reject 之一。',
'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。',
].join('\n'),
output: JSON.stringify({
recommendation: 'hire',
strengths: ['权限系统与审计日志经验高度匹配岗位核心需求', 'B2B SaaS 背景可直接上手复杂业务'],
risks: ['英语一般,但可通过团队支持弥补', '近一年缺少直接带人经验'],
}),
modelKey: 'custom',
versionLabel: 'workspace-replica',
},
],
compareHints: {
mode: 'structured',
snapshotRoles: {
a: 'target',
b: 'baseline',
c: 'reference',
d: 'referenceBaseline',
e: 'replica',
},
hasSharedTestCases: true,
hasSamePromptSnapshots: true,
hasCrossModelComparison: true,
},
// Each list holds the accepted values for that key; evaluateExpectations checks the
// actual result against them.
expected: {
stopSignals: {
stopRecommendation: ['review'],
},
pairSignals: {
targetBaseline: ['improved', 'flat'],
targetReplica: ['unstable'],
},
conflictSignals: ['improvementUnstableAcrossReplicas'],
},
},
];
/**
 * Serialize a value as two-space-indented JSON followed by a single newline.
 *
 * @param {unknown} value - Anything `JSON.stringify` accepts.
 * @returns {string} The pretty-printed JSON text ending in "\n".
 */
function toPrettyJson(value) {
  const serialized = JSON.stringify(value, null, 2);
  return `${serialized}\n`;
}
/**
 * Wrap a value's pretty-printed JSON in a Markdown ```json fenced block.
 *
 * @param {unknown} value - Anything `JSON.stringify` accepts.
 * @returns {string} The fenced block, terminated by a newline.
 */
function jsonFence(value) {
  const fence = '```';
  const body = JSON.stringify(value, null, 2);
  return `${fence}json\n${body}\n${fence}\n`;
}
/**
 * Wrap arbitrary text in a Markdown fenced code block.
 *
 * The fence is three backticks unless the text itself contains a run of three or
 * more backticks (LLM responses routinely embed their own ``` blocks). In that case
 * the fence grows to one backtick longer than the longest run, so the embedded
 * fence cannot close the wrapper early.
 *
 * @param {unknown} value - Text to wrap; non-strings are converted with `String`.
 * @param {string} [language=''] - Optional info string placed after the opening fence.
 * @returns {string} The fenced block, terminated by a newline.
 */
function textFence(value, language = '') {
  const text = String(value);

  // Scan with a loop rather than Math.max(...runs) so very large inputs cannot
  // exceed the engine's argument-count limit.
  let longestRun = 0;
  for (const run of text.match(/`+/g) ?? []) {
    longestRun = Math.max(longestRun, run.length);
  }

  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  return `${fence}${language}\n${text}\n${fence}\n`;
}
/**
 * Create a directory and any missing parents; an already existing directory is fine.
 *
 * @param {string} target - Directory path to create.
 * @returns {Promise<void>}
 */
async function ensureDir(target) {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(target, mkdirOptions);
}
/**
 * Write UTF-8 text to a file, creating the parent directory first.
 *
 * @param {string} filePath - Destination file path.
 * @param {string} content - Text to write.
 * @returns {Promise<void>}
 */
async function writeText(filePath, content) {
  const parentDir = path.dirname(filePath);
  await ensureDir(parentDir);

  const encoding = 'utf8';
  await fs.writeFile(filePath, content, encoding);
}
/**
 * Render chat messages as numbered Markdown sections.
 *
 * Each section has a "### Message N" heading, a role bullet, and the message
 * content inside a fenced block. Sections are joined by a single newline.
 *
 * @param {Array<{role: string, content: string}>} messages - Messages in send order.
 * @returns {string} The concatenated Markdown.
 */
function renderMessagesMarkdown(messages) {
  const renderSection = ({ role, content }, position) => {
    const heading = `### Message ${position + 1}`;
    const roleLine = `- role: ${role}`;
    return `${heading}\n${roleLine}\n\n${textFence(content)}`;
  };

  const sections = messages.map(renderSection);
  return sections.join('\n');
}
/**
 * Render one logged LLM call as a Markdown section.
 *
 * The section lists the phase and model key, the messages that were sent, and
 * then either an "Error" block (when `call.error` is truthy) or a "Response" block.
 *
 * @param {object} call - Log entry with `phase`, `modelKey`, `messages`, and `error` or `response`.
 * @param {number} index - Zero-based position of the call; shown one-based.
 * @returns {string} Markdown for the call.
 */
function renderCallMarkdown(call, index) {
  let outcomeSection;
  if (call.error) {
    outcomeSection = `### Error\n${textFence(call.error)}`;
  } else {
    outcomeSection = `### Response\n${textFence(call.response || '')}`;
  }

  const lines = [];
  lines.push(`## Call ${index + 1}`);
  lines.push(`- phase: ${call.phase}`);
  lines.push(`- modelKey: ${call.modelKey}`);
  lines.push('');
  lines.push('### Messages');
  lines.push(renderMessagesMarkdown(call.messages));
  lines.push('');
  lines.push(outcomeSection);
  lines.push('');
  return lines.join('\n');
}
// Literal labels searched for in a prompt's user message; the JSON payload is the
// text that follows the label. Each list may hold several alternatives.
const PAIR_JUDGE_PAYLOAD_MARKERS = ['Pair Judge Evidence Payload (JSON):'];
const SYNTHESIS_PAYLOAD_MARKERS = ['Synthesis Payload (JSON):'];
const REWRITE_PAYLOAD_MARKERS = ['Rewrite Payload (JSON):'];
/**
 * Find the first marker whose trailing text parses as JSON and return the parsed value.
 *
 * Markers are tried in order. A marker is skipped when it is absent, when nothing
 * but whitespace follows it, or when the text after it is not valid JSON, so a
 * later alternative marker still gets its chance.
 *
 * @param {unknown} content - Prompt text to search; non-string input yields `null`.
 * @param {string[]} markers - Candidate marker strings, in priority order.
 * @returns {unknown|null} The parsed JSON value, or `null` when no marker yields valid JSON.
 */
function extractJsonAfterMarker(content, markers) {
  const text = typeof content === 'string' ? content : '';

  for (const marker of markers) {
    const markerIndex = text.indexOf(marker);
    if (markerIndex === -1) continue;

    // Everything after the marker must be the JSON document.
    const candidate = text.slice(markerIndex + marker.length).trim();
    if (!candidate) continue;

    try {
      return JSON.parse(candidate);
    } catch {
      // Not valid JSON after this marker; fall through to the next alternative.
    }
  }

  return null;
}
/**
 * Recover the JSON payloads that were embedded in logged prompts.
 *
 * Looks at the first `user` message of: every call whose phase starts with
 * "pair-judge:", the first call whose phase is exactly "structured-compare-synthesis",
 * and the first call whose phase starts with "rewrite:".
 *
 * @param {Array<object>} llmCalls - Logged calls with `phase` and `messages`.
 * @returns {{pairJudgePayloads: Array<{phase: string, payload: unknown}>,
 *   synthesisPayload: unknown, rewritePayloadFromMessage: unknown}}
 *   Pair-judge entries without a parsable payload are dropped; the other two
 *   fields are `null` when the call or its payload is missing.
 */
function collectPromptPayloadArtifacts(llmCalls) {
  const phaseOf = (call) => String(call.phase || '');
  const firstUserContent = (call) =>
    call.messages?.find((message) => message.role === 'user')?.content || '';

  const pairJudgePayloads = llmCalls
    .filter((call) => phaseOf(call).startsWith('pair-judge:'))
    .flatMap((call) => {
      const entry = {
        phase: call.phase,
        payload: extractJsonAfterMarker(firstUserContent(call), PAIR_JUDGE_PAYLOAD_MARKERS),
      };
      return entry.payload ? [entry] : [];
    });

  const synthesisCall = llmCalls.find(
    (call) => phaseOf(call) === 'structured-compare-synthesis',
  );
  let synthesisPayload = null;
  if (synthesisCall) {
    synthesisPayload = extractJsonAfterMarker(
      firstUserContent(synthesisCall),
      SYNTHESIS_PAYLOAD_MARKERS,
    );
  }

  const rewriteCall = llmCalls.find((call) => phaseOf(call).startsWith('rewrite:'));
  let rewritePayloadFromMessage = null;
  if (rewriteCall) {
    rewritePayloadFromMessage = extractJsonAfterMarker(
      firstUserContent(rewriteCall),
      REWRITE_PAYLOAD_MARKERS,
    );
  }

  return { pairJudgePayloads, synthesisPayload, rewritePayloadFromMessage };
}
/**
 * Condense an evaluation response into the fields written to the case summary.
 *
 * @param {object} caseConfig - Case definition; only `expected` is read.
 * @param {object} response - Evaluation response; reads `summary`, `score.overall`,
 *   `improvements`, and the compare fields of `metadata`.
 * @returns {object} Summary record. Missing values fall back to `null` (scalars and
 *   objects) or `[]` (lists); `summary` is passed through unchanged.
 */
function summarizeCaseResult(caseConfig, response) {
  const {
    compareMode,
    compareStopSignals,
    compareInsights,
    compareJudgements,
  } = response.metadata || {};

  // Keep only the four judgement fields the summary reports.
  const toJudgementSummary = ({ pairType, pairSignal, verdict, confidence }) => ({
    pairType,
    pairSignal,
    verdict,
    confidence,
  });
  const pairJudgements = compareJudgements?.map((judgement) => toJudgementSummary(judgement)) || [];

  return {
    compareMode: compareMode || null,
    summary: response.summary,
    score: response.score?.overall ?? null,
    improvements: response.improvements || [],
    stopSignals: compareStopSignals || null,
    conflictSignals: compareInsights?.conflictSignals || [],
    pairJudgements,
    expected: caseConfig.expected || null,
  };
}
/**
 * Compare an evaluation response against a case's expected signals.
 *
 * Produces one result row per expected stop-signal key, per expected pair type, and
 * per expected conflict signal, in that order.
 *
 * @param {object|null|undefined} expected - Optional `stopSignals`, `pairSignals`
 *   (both map a key to its list of accepted values) and `conflictSignals` (list).
 * @param {object} response - Evaluation response; only `metadata` is read.
 * @returns {Array<{type: string, key: string, expected: string[], actual: unknown,
 *   matched: boolean}>} Empty when `expected` is falsy.
 */
function evaluateExpectations(expected, response) {
  if (!expected) {
    return [];
  }

  const { compareStopSignals, compareJudgements, compareInsights } = response.metadata || {};
  const actualStopSignals = compareStopSignals || {};
  const judgements = compareJudgements || [];
  const actualConflicts = compareInsights?.conflictSignals || [];

  // A stop signal matches only when it is present and among the accepted values.
  const stopChecks = Object.entries(expected.stopSignals || {}).map(([key, allowed]) => {
    const actual = actualStopSignals[key];
    return {
      type: 'stopSignal',
      key,
      expected: allowed,
      actual: actual ?? null,
      matched: Boolean(actual) && allowed.includes(actual),
    };
  });

  // A pair type matches when any judgement of that type has an accepted signal.
  const pairChecks = Object.entries(expected.pairSignals || {}).map(([pairType, allowed]) => {
    const actual = [];
    for (const judgement of judgements) {
      if (judgement.pairType === pairType) {
        actual.push(judgement.pairSignal);
      }
    }
    return {
      type: 'pairSignal',
      key: pairType,
      expected: allowed,
      actual,
      matched: actual.some((signal) => allowed.includes(signal)),
    };
  });

  const conflictChecks = [...(expected.conflictSignals || [])].map((signal) => ({
    type: 'conflictSignal',
    key: signal,
    expected: [signal],
    actual: actualConflicts,
    matched: actualConflicts.includes(signal),
  }));

  return [...stopChecks, ...pairChecks, ...conflictChecks];
}
/**
 * Render expectation-check rows as a five-column Markdown table.
 *
 * @param {Array<{type: string, key: string, expected: unknown, actual: unknown,
 *   matched: boolean}>} expectationResults - Rows to render.
 * @returns {string} The table ending in a newline, or a one-line notice when there
 *   are no rows. Array cells are joined with " / "; `matched` is shown as yes/no.
 */
function renderExpectationMarkdown(expectationResults) {
  if (!expectationResults.length) {
    return '无预设断言,本样本用于探索式观察。\n';
  }

  const formatCell = (cell) => (Array.isArray(cell) ? cell.join(' / ') : String(cell));

  const tableLines = [
    '| 类型 | 键 | 期望 | 实际 | 是否命中 |',
    '| --- | --- | --- | --- | --- |',
  ];
  const rowLines = expectationResults.map(({ type, key, expected, actual, matched }) => {
    const cells = [type, key, formatCell(expected), formatCell(actual), matched ? 'yes' : 'no'];
    return `| ${cells[0]} | ${cells[1]} | ${cells[2]} | ${cells[3]} | ${cells[4]} |`;
  });

  return `${tableLines.join('\n')}\n${rowLines.join('\n')}\n`;
}
/**
 * Render the scenario document for a calibration case.
 *
 * @param {object} caseConfig - Case definition; reads `title`, `id`, `kind`,
 *   `description`, and `focus`.
 * @returns {string} Markdown with the title, id and kind bullets, the description,
 *   and a "Focus" section that shows a placeholder when `focus` is empty.
 */
function renderScenarioMarkdown(caseConfig) {
  const { title, id, kind, description, focus } = caseConfig;

  const headerLines = [`# ${title}`, '', `- caseId: ${id}`, `- kind: ${kind}`, ''];
  const descriptionLines = [description, ''];
  const focusLines = ['## Focus', '', focus || '无', ''];

  return [...headerLines, ...descriptionLines, ...focusLines].join('\n');
}
/**
 * Wrap an LLM service so every `sendMessage` call is recorded and retried.
 *
 * - `sendMessage` makes up to CALIBRATION_MAX_RETRIES attempts. It retries only errors
 *   accepted by `isRetryableCalibrationError`, waiting `attempt * 3000` ms between
 *   attempts, and records one log entry per call (on success or on final failure).
 * - `sendMessageStream` is forwarded to the base service without logging.
 * - `withPhase` labels the calls made while `fn` runs, unless the messages
 *   themselves identify a pair-judge or synthesis prompt.
 *
 * @param {object} baseLLMService - Service exposing `sendMessage` and `sendMessageStream`.
 * @returns {{sendMessage: Function, sendMessageStream: Function, withPhase: Function,
 *   getCalls: Function, clearCalls: Function}} The logging wrapper.
 */
function createLoggedLLMService(baseLLMService) {
  const recordedCalls = [];
  let activePhase = 'idle';

  // Alternatives for pulling the pair key out of the user message, tried in order.
  const pairKeyPatterns = [
    /Pair Key[:]\s*([^\n]+)/,
    /Pair Key\s*([^\n]+)/,
    /"pairKey"\s*:\s*"([^"]+)"/,
  ];

  // Classify a call by its system prompt; fall back to the phase set via withPhase.
  function resolvePhase(messages) {
    const systemText = messages?.[0]?.content || '';
    const userText = messages?.[1]?.content || '';

    const isPairJudge =
      systemText.includes('Structured_Compare_Pair_Judge') ||
      systemText.includes('结构化对比成对判断专家');
    if (isPairJudge) {
      let keyMatch = null;
      for (const pattern of pairKeyPatterns) {
        keyMatch = userText.match(pattern);
        if (keyMatch) break;
      }
      return `pair-judge:${keyMatch?.[1]?.trim() || 'unknown'}`;
    }

    const isSynthesis =
      systemText.includes('structured compare synthesizer') ||
      systemText.includes('结构化对比综合专家');
    if (isSynthesis) {
      return 'structured-compare-synthesis';
    }

    return activePhase;
  }

  return {
    async sendMessage(messages, modelKey) {
      const entry = {
        phase: resolvePhase(messages),
        modelKey,
        attempts: [],
        messages: messages.map(({ role, content }) => ({ role, content })),
      };

      let attempt = 0;
      while (attempt < CALIBRATION_MAX_RETRIES) {
        attempt += 1;
        try {
          const response = await baseLLMService.sendMessage(messages, modelKey);
          entry.response = response;
          entry.retryCount = attempt - 1;
          recordedCalls.push(entry);
          return response;
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error);
          entry.attempts.push({ attempt, error: errorMessage });

          const outOfAttempts = attempt >= CALIBRATION_MAX_RETRIES;
          if (outOfAttempts || !isRetryableCalibrationError(error)) {
            entry.error = errorMessage;
            entry.retryCount = attempt - 1;
            recordedCalls.push(entry);
            throw error;
          }

          // Linear backoff before the next attempt.
          await sleep(attempt * 3000);
        }
      }
    },

    async sendMessageStream(messages, modelKey, callbacks) {
      return baseLLMService.sendMessageStream(messages, modelKey, callbacks);
    },

    async withPhase(phase, fn) {
      const outerPhase = activePhase;
      activePhase = phase;
      try {
        return await fn();
      } finally {
        // Restore even when fn throws so later calls are not mislabelled.
        activePhase = outerPhase;
      }
    },

    getCalls() {
      return [...recordedCalls];
    },

    clearCalls() {
      recordedCalls.splice(0, recordedCalls.length);
    },
  };
}
/**
 * Build the in-memory service stack used by the calibration run.
 *
 * Sets the template language to zh-CN, requires both the `custom` and `deepseek`
 * models to be enabled, renames them and overrides their temperature and timeout,
 * then wires a logging LLM service into the evaluation service.
 *
 * @returns {Promise<{modelManager: object, templateManager: object, llmService: object,
 *   evaluationService: object}>} The assembled services.
 * @throws {Error} When the `custom` or `deepseek` model is missing or not enabled.
 */
async function createServices() {
  const storage = new MemoryStorageProvider();
  const languageService = createTemplateLanguageService(createPreferenceService(storage));
  await languageService.initialize();
  await languageService.setLanguage('zh-CN');

  const templateManager = createTemplateManager(storage, languageService);
  const modelManager = createModelManager(storage);
  await modelManager.ensureInitialized();

  // Fetch a model and fail with the given message unless it is enabled.
  const requireEnabledModel = async (modelKey, disabledMessage) => {
    const model = await modelManager.getModel(modelKey);
    if (!model?.enabled) {
      throw new Error(disabledMessage);
    }
    return model;
  };

  // Keep existing overrides and add the calibration temperature and timeout.
  const applyCalibrationProfile = (modelKey, name, model) =>
    modelManager.updateModel(modelKey, {
      name,
      paramOverrides: {
        ...(model.paramOverrides || {}),
        temperature: 0.2,
        timeout: CALIBRATION_TIMEOUT_MS,
      },
    });

  const customModel = await requireEnabledModel(
    'custom',
    'custom 模型未启用,请检查 .env.local 中的 VITE_CUSTOM_API_* 配置。',
  );
  const deepseekModel = await requireEnabledModel(
    'deepseek',
    'deepseek 模型未启用,请检查 .env.local 中的 VITE_DEEPSEEK_API_KEY。',
  );

  await applyCalibrationProfile('deepseek', 'DeepSeek Chat (Calibration)', deepseekModel);
  await applyCalibrationProfile('custom', 'SiliconFlow Qwen3-32B (Calibration)', customModel);

  const llmService = createLoggedLLMService(createLLMService(modelManager));
  const evaluationService = createEvaluationService(llmService, modelManager, templateManager);

  return { modelManager, templateManager, llmService, evaluationService };
}
/**
 * Execute a live case: produce four snapshots with real model calls, then evaluate them.
 *
 * Snapshots are generated one after another in the order A (target + workspace prompt),
 * B (target + previous prompt), C (teacher + workspace prompt), D (teacher + previous
 * prompt). Each call runs inside an `execute:<caseId>:<snapshotId>` phase and the
 * evaluation inside `evaluate:<caseId>`.
 *
 * @param {object} caseConfig - Live case definition (prompts, user input, model keys, mode, focus).
 * @param {object} services - Needs `llmService.withPhase`, `llmService.sendMessage`,
 *   and `evaluationService.evaluate`.
 * @returns {Promise<{request: object, response: unknown, executions: object[]}>}
 *   The compare request, the evaluation response, and the recorded snapshots.
 */
async function runLiveCase(caseConfig, services) {
  const { llmService, evaluationService } = services;

  const executionPlan = [
    {
      snapshotId: 'a',
      label: 'A',
      modelKey: caseConfig.targetModelKey,
      promptText: caseConfig.workspacePrompt,
      promptRef: { kind: 'workspace', label: 'Target Workspace' },
      versionLabel: 'workspace',
    },
    {
      snapshotId: 'b',
      label: 'B',
      modelKey: caseConfig.targetModelKey,
      promptText: caseConfig.previousPrompt,
      promptRef: { kind: 'version', version: 1, label: 'Target Previous' },
      versionLabel: 'previous',
    },
    {
      snapshotId: 'c',
      label: 'C',
      modelKey: caseConfig.teacherModelKey,
      promptText: caseConfig.workspacePrompt,
      promptRef: { kind: 'workspace', label: 'Teacher Workspace' },
      versionLabel: 'teacher-workspace',
    },
    {
      snapshotId: 'd',
      label: 'D',
      modelKey: caseConfig.teacherModelKey,
      promptText: caseConfig.previousPrompt,
      promptRef: { kind: 'version', version: 1, label: 'Teacher Previous' },
      versionLabel: 'teacher-previous',
    },
  ];

  const executions = [];
  const snapshots = [];
  // Sequential on purpose: each execution finishes before the next one starts.
  for (const step of executionPlan) {
    const { snapshotId, label, modelKey, promptText, promptRef, versionLabel } = step;
    const messages = [
      { role: 'system', content: promptText },
      { role: 'user', content: caseConfig.userInput },
    ];
    const rawOutput = await llmService.withPhase(`execute:${caseConfig.id}:${snapshotId}`, () =>
      llmService.sendMessage(messages, modelKey),
    );
    const snapshot = {
      id: snapshotId,
      label,
      testCaseId: 'tc-1',
      promptRef,
      promptText,
      output: rawOutput.trim(),
      modelKey,
      versionLabel,
    };
    executions.push(snapshot);
    snapshots.push(snapshot);
  }

  const request = {
    type: 'compare',
    evaluationModelKey: caseConfig.evaluatorModelKey,
    mode: caseConfig.mode,
    focus: { content: caseConfig.focus, source: 'system', priority: 'highest' },
    target: { workspacePrompt: caseConfig.workspacePrompt },
    testCases: [
      {
        id: 'tc-1',
        input: { kind: 'text', label: '用户输入', content: caseConfig.userInput },
      },
    ],
    snapshots,
    compareHints: {
      mode: 'structured',
      snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline' },
      hasSharedTestCases: true,
      hasSamePromptSnapshots: true,
      hasCrossModelComparison: true,
    },
  };

  const response = await llmService.withPhase(`evaluate:${caseConfig.id}`, () =>
    evaluationService.evaluate(request),
  );

  return { request, response, executions };
}
/**
 * Evaluate a synthetic case from its pre-recorded snapshots (no model executions).
 *
 * @param {object} caseConfig - Synthetic case definition (prompts, test cases,
 *   snapshots, compare hints, mode, focus).
 * @param {object} services - Needs `llmService.withPhase` and `evaluationService.evaluate`.
 * @returns {Promise<{request: object, response: unknown}>} The compare request and
 *   the evaluation response, produced inside the `evaluate:<caseId>` phase.
 */
async function runSyntheticCase(caseConfig, services) {
  const {
    id,
    evaluatorModelKey,
    mode,
    focus,
    targetWorkspacePrompt,
    targetPreviousPrompt,
    testCases,
    snapshots,
    compareHints,
  } = caseConfig;

  const request = {
    type: 'compare',
    evaluationModelKey: evaluatorModelKey,
    mode,
    focus: { content: focus, source: 'system', priority: 'highest' },
    target: {
      workspacePrompt: targetWorkspacePrompt,
      referencePrompt: targetPreviousPrompt,
    },
    testCases,
    snapshots,
    compareHints,
  };

  const evaluate = () => services.evaluationService.evaluate(request);
  const response = await services.llmService.withPhase(`evaluate:${id}`, evaluate);

  return { request, response };
}
/**
 * Generates the rewrite for one case and writes all of its artifacts to
 * `<OUTPUT_ROOT>/<caseId>/`.
 *
 * Steps, in order:
 *   1. build the rewrite payload and rewrite prompt from the compare response;
 *   2. send the rewrite prompt to `caseConfig.rewriteModelKey` inside a
 *      `rewrite:<caseId>` phase;
 *   3. snapshot the recorded LLM calls, check expectations and summarize;
 *   4. write the request/response/payload/call-log/summary files one by one.
 *
 * @param {object} caseConfig - Case definition (live or synthetic).
 * @param {{request: object, response: object, executions?: *}} result - Output
 *   of the case runner; `executions.json` is only written when `executions`
 *   is truthy.
 * @param {object} services - Must expose `llmService.withPhase`,
 *   `llmService.sendMessage` and `llmService.getCalls`.
 * @returns {Promise<{caseSummary: *, expectationResults: *}>}
 */
async function writeCaseArtifacts(caseConfig, result, services) {
  const caseDir = path.join(OUTPUT_ROOT, caseConfig.id);
  await ensureDir(caseDir);

  // Writes a single artifact file inside this case's directory.
  const emit = (fileName, content) => writeText(path.join(caseDir, fileName), content);

  // Live cases and synthetic cases keep their prompts under different keys.
  const isLive = caseConfig.kind === 'live';
  const rewriteOptions = {
    result: result.response,
    type: 'compare',
    mode: caseConfig.mode,
    language: 'zh',
    workspacePrompt: isLive ? caseConfig.workspacePrompt : caseConfig.targetWorkspacePrompt,
    referencePrompt: isLive ? caseConfig.previousPrompt : caseConfig.targetPreviousPrompt,
  };
  // Each builder receives its own copy of the options object.
  const rewritePayload = buildRewritePayload({ ...rewriteOptions });
  const rewriteInput = buildRewritePromptFromEvaluation({ ...rewriteOptions });

  const rewriteOutput = await services.llmService.withPhase(`rewrite:${caseConfig.id}`, () =>
    services.llmService.sendMessage(
      [{ role: 'user', content: rewriteInput }],
      caseConfig.rewriteModelKey,
    ),
  );

  // Taken after the rewrite call, so the rewrite request is part of the log.
  const llmCalls = services.llmService.getCalls();
  const promptPayloadArtifacts = collectPromptPayloadArtifacts(llmCalls);
  const expectationResults = evaluateExpectations(caseConfig.expected, result.response);
  const caseSummary = summarizeCaseResult(caseConfig, result.response);

  await emit('scenario.md', renderScenarioMarkdown(caseConfig));
  await emit('request.json', toPrettyJson(result.request));
  await emit('request.md', jsonFence(result.request));
  await emit('response.json', toPrettyJson(result.response));
  await emit('response.md', jsonFence(result.response));
  await emit('pair-judge-payloads.json', toPrettyJson(promptPayloadArtifacts.pairJudgePayloads));
  await emit('synthesis-payload.json', toPrettyJson(promptPayloadArtifacts.synthesisPayload));
  await emit('rewrite-payload.json', toPrettyJson(rewritePayload));
  await emit('rewrite-input.txt', `${rewriteInput}\n`);

  const trimmedRewrite = rewriteOutput.trim();
  await emit('rewrite-output.txt', `${trimmedRewrite}\n`);

  await emit('llm-calls.json', toPrettyJson(llmCalls));
  const callSections = llmCalls.map((call, index) => renderCallMarkdown(call, index));
  await emit('llm-calls.md', ['# LLM Calls', '', ...callSections].join('\n'));

  const summaryRecord = {
    generatedAt: nowIso,
    case: {
      id: caseConfig.id,
      title: caseConfig.title,
      kind: caseConfig.kind,
    },
    summary: caseSummary,
    expectationResults,
  };
  await emit('summary.json', toPrettyJson(summaryRecord));

  if (result.executions) {
    await emit('executions.json', toPrettyJson(result.executions));
  }

  const summaryLines = [
    `# ${caseConfig.title}`,
    '',
    `- caseId: ${caseConfig.id}`,
    `- kind: ${caseConfig.kind}`,
    `- generatedAt: ${nowIso}`,
    '',
    '## Description',
    '',
    caseConfig.description,
    '',
    '## Compare Result',
    '',
    // NOTE(review): unlike the other sections, no blank line follows this fence;
    // presumably jsonFence() already ends with a newline - confirm.
    jsonFence(caseSummary),
    '## Expectation Check',
    '',
    renderExpectationMarkdown(expectationResults),
    '',
    '## Rewrite Output',
    '',
    textFence(trimmedRewrite),
  ];
  await emit('summary.md', summaryLines.join('\n'));

  return { caseSummary, expectationResults };
}
/**
 * Writes the cross-case overview (`summary.json` and `summary.md`) into
 * OUTPUT_ROOT.
 *
 * Each result becomes one row holding the case identity, its score, the three
 * stop signals and how many expectations matched. Cases with an empty
 * expectation list get `null` counts and are shown as `exploratory` in the
 * markdown table.
 *
 * @param {Array<{caseConfig: object, caseSummary: object, expectationResults: Array<{matched: *}>}>} results
 *   One entry per executed case.
 * @returns {Promise<void>}
 */
async function writeOverallSummary(results) {
  const rows = results.map(({ caseConfig, caseSummary, expectationResults }) => {
    const expectationCount = expectationResults.length;
    const matchedCount = expectationCount
      ? expectationResults.filter((entry) => entry.matched).length
      : null;
    const { stopSignals } = caseSummary;

    return {
      caseId: caseConfig.id,
      title: caseConfig.title,
      kind: caseConfig.kind,
      score: caseSummary.score,
      stopRecommendation: stopSignals?.stopRecommendation || null,
      targetVsBaseline: stopSignals?.targetVsBaseline || null,
      targetVsReferenceGap: stopSignals?.targetVsReferenceGap || null,
      expectationMatched: matchedCount,
      expectationTotal: expectationCount || null,
    };
  });

  // Renders one markdown table line. Cells go through a template literal so
  // that null/undefined show up as the literal words rather than empty cells.
  const toTableLine = (row) => {
    let expectationText = 'exploratory';
    if (row.expectationMatched !== null) {
      expectationText = `${row.expectationMatched}/${row.expectationTotal}`;
    }
    const cells = [
      row.caseId,
      row.kind,
      row.score,
      row.targetVsBaseline,
      row.targetVsReferenceGap,
      row.stopRecommendation,
      expectationText,
    ];
    return `| ${cells.map((cell) => `${cell}`).join(' | ')} |`;
  };

  const headerLines = [
    '# Structured Compare Calibration Summary',
    '',
    `- generatedAt: ${nowIso}`,
    `- outputRoot: ${OUTPUT_ROOT}`,
    '',
    '| Case | Kind | Score | targetVsBaseline | targetVsReferenceGap | stopRecommendation | Expectation Match |',
    '| --- | --- | --- | --- | --- | --- | --- |',
  ];
  const noteLines = [
    '',
    '## Notes',
    '',
    '- synthetic cases 用来检验 judge / synthesis 的提示词边界。',
    '- live case 用来观察真实 target/teacher 执行结果在 structured compare 下是否能收敛成合理结论。',
    '- 每个 case 子目录内都保存了 compare request、compare result、rewrite input / output以及完整 LLM 调用日志。',
    '',
  ];
  const markdown = [...headerLines, ...rows.map(toTableLine), ...noteLines].join('\n');

  const summaryRecord = { generatedAt: nowIso, rows };
  await writeText(path.join(OUTPUT_ROOT, 'summary.json'), toPrettyJson(summaryRecord));
  await writeText(path.join(OUTPUT_ROOT, 'summary.md'), markdown);
}
/**
 * Script driver: wipes and recreates OUTPUT_ROOT, then runs every calibration
 * case strictly one after another (the live case first, then the synthetic
 * ones), writes each case's artifacts and finally the overall summary.
 *
 * Any rejection propagates to the caller; nothing is caught here.
 *
 * @returns {Promise<void>}
 */
async function main() {
  // Start from an empty output tree; `force` makes a missing directory a no-op.
  await fs.rm(OUTPUT_ROOT, { recursive: true, force: true });
  await ensureDir(OUTPUT_ROOT);

  const services = await createServices();
  const results = [];

  for (const caseConfig of [LIVE_BASIC_SYSTEM_CASE, ...SYNTHETIC_CASES]) {
    // Reset the recorded LLM calls; writeCaseArtifacts dumps getCalls() per case.
    services.llmService.clearCalls();

    const runCase = caseConfig.kind === 'live' ? runLiveCase : runSyntheticCase;
    const result = await runCase(caseConfig, services);
    const artifacts = await writeCaseArtifacts(caseConfig, result, services);

    results.push({ caseConfig, ...artifacts });
  }

  await writeOverallSummary(results);
}
// Script entry point. On failure, log the error and set a non-zero exit code
// instead of calling process.exit(), so the process still winds down normally.
main().catch((failure) => {
  console.error('[structured-compare-calibration] failed:', failure);
  process.exitCode = 1;
});