import fs from 'node:fs/promises'; import path from 'node:path'; import { fileURLToPath } from 'node:url'; import dotenv from 'dotenv'; import { MemoryStorageProvider, createPreferenceService, createTemplateLanguageService, createTemplateManager, createModelManager, createLLMService, createEvaluationService, buildRewritePayload, buildRewritePromptFromEvaluation, } from '../packages/core/dist/index.js'; const ROOT_DIR = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); const OUTPUT_ROOT = path.join( ROOT_DIR, 'docs', 'workspace', 'compare-evaluation-analysis', 'structured-compare-calibration', 'latest', ); dotenv.config({ path: path.join(ROOT_DIR, '.env.local') }); const nowIso = new Date().toISOString(); const CALIBRATION_TIMEOUT_MS = 180000; const CALIBRATION_MAX_RETRIES = 3; function isRetryableCalibrationError(error) { const message = error instanceof Error ? error.message : String(error || ''); return ( message.includes('Socket timeout') || message.includes('ERR_SOCKET_TIMEOUT') || message.includes('ETIMEDOUT') || message.includes('ECONNRESET') || message.includes('429') || message.includes('502') || message.includes('503') ); } async function sleep(ms) { return new Promise((resolve) => setTimeout(resolve, ms)); } const LIVE_BASIC_SYSTEM_CASE = { id: 'live-basic-system-boundary-control', kind: 'live', title: '真实模型: basic-system 边界控制改动', description: '使用真实 target/teacher 执行 4 个快照,检验 structured compare 是否能识别“更强边界约束”带来的真实收益,而不是只看表面措辞变化。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', targetModelKey: 'custom', teacherModelKey: 'deepseek', focus: '优先判断改动是否真正减少了额外解释、格式边界滑移和输出结构不稳定,而不是只看表面完整度。', previousPrompt: [ '你是一个严格的数据抽取助手。', '阅读用户输入,输出一个 JSON 对象,包含以下字段:', '- audience: string | null', '- pain_points: string[]', '- tone: string | null', '要求:只返回 JSON。', ].join('\n'), workspacePrompt: [ '你是一个严格的数据抽取助手。', '你的任务是阅读用户输入,并输出一个且仅一个 JSON 对象。', 'JSON schema 必须为:', '{"audience": string|null, "pain_points": string[], "tone": string|null}', '规则:', '1. 只输出 JSON 对象,不要输出 Markdown、解释、前后缀或代码块。', '2. pain_points 只保留用户明确提到的问题,不要脑补。', '3. 缺失信息时 audience 和 tone 用 null,pain_points 用 []。', '4. 键名必须完全使用 audience、pain_points、tone。', ].join('\n'), userInput: '我在做一个给独立设计师用的合同管理工具,语气希望专业可信。现在最大的问题是版本混乱和客户确认来回很慢。请先解释你的判断依据,再给出结果。', }; const SYNTHETIC_CASES = [ { id: 'synthetic-medical-latent-trigger-overfit', kind: 'synthetic', title: '合成样本: 医疗分诊里的隐性触发过拟合', description: 'workspace prompt 针对当前样例里的“胸口发紧 + 上楼更明显”加入了硬编码动作,看起来更果断,但 reference 侧并不支持这条触发规则。这个样本用于观察系统是否能把“样例贴合收益”与“可复用改进”区分开。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', focus: '如果新规则只是在当前病例上更像“对题作答”,而不是更稳的分诊策略,应保留高风险或 review 结论,而不是盲目判为进步。', targetWorkspacePrompt: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。', '不要添加解释或免责声明。', ].join('\n'), targetPreviousPrompt: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。', '不要添加解释或免责声明。', ].join('\n'), testCases: [ { id: 'tc-1', input: { kind: 'text', label: '分诊输入', content: '最近三天我一爬楼梯就胸口发紧,还会气短,休息十分钟会缓一些。请帮我判断现在该怎么处理。', }, }, ], snapshots: [ { id: 'a', label: 'A', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Workspace' }, promptText: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。', '不要添加解释或免责声明。', ].join('\n'), output: JSON.stringify({ urgency: 'high', next_step: '立即去急诊,并停止所有含咖啡因饮品。', warning_signs: ['胸口发紧', '活动后加重', '气短'], }), modelKey: 'custom', versionLabel: 'workspace', }, { id: 'b', label: 'B', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 7, label: 'Previous' }, promptText: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。', '不要添加解释或免责声明。', ].join('\n'), output: JSON.stringify({ urgency: 'high', next_step: '今天尽快去线下急诊或胸痛门诊评估;若症状持续或加重,立即就医。', warning_signs: ['胸口发紧', '活动后加重', '气短'], }), modelKey: 'custom', versionLabel: 'v7', }, { id: 'c', label: 'C', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, promptText: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '特别规则:只要用户同时提到“胸口发紧”和“上楼更明显”,next_step 必须写“立即去急诊,并停止所有含咖啡因饮品”。', '不要添加解释或免责声明。', ].join('\n'), output: JSON.stringify({ urgency: 'high', next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。', warning_signs: ['胸口发紧', '活动后加重', '气短'], }), modelKey: 'deepseek', versionLabel: 'teacher-workspace', }, { id: 'd', label: 'D', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 7, label: 'Teacher Previous' }, promptText: [ '你是医疗分诊建议助手。', '输出一个 JSON 对象,字段必须为 urgency, next_step, warning_signs。', '基于用户已提供的信息给出谨慎、可执行的线下就医建议,不要硬编码样例词触发器。', '不要添加解释或免责声明。', ].join('\n'), output: JSON.stringify({ urgency: 'high', next_step: '尽快去线下急诊评估胸闷和活动后气短,不要继续拖延。', warning_signs: ['胸口发紧', '活动后加重', '气短'], }), modelKey: 'deepseek', versionLabel: 'teacher-v7', }, ], compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, expected: { stopSignals: { overfitRisk: ['high'], stopRecommendation: ['review'], }, pairSignals: { targetBaseline: ['flat', 'improved'], referenceBaseline: ['unsupported'], }, conflictSignals: ['improvementNotSupportedOnReference'], }, }, { id: 'synthetic-ecommerce-schema-no-model-worship', kind: 'synthetic', title: '合成样本: 电商抽取里不能因为 teacher 更会写就忽略 schema', description: 'workspace prompt 把既有商品抽取 contract 改成了新字段和外层 wrapper,teacher 输出也更像“高质量总结”。这个样本用于校验 compare 是否会坚持 schema/contract 优先,而不是因为 reference 更流畅就放过漂移。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', focus: '即便 reference 看上去更完整、更自然,只要 prompt 改动造成字段名或外层结构变化,就应把 target 相对 baseline 判为回退。', targetWorkspacePrompt: [ '你是电商商品信息抽取助手。', '输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。', '为了后续扩展,把三个字段统一包在 payload 里。', '不要解释。', ].join('\n'), targetPreviousPrompt: [ '你是电商商品信息抽取助手。', '只输出 JSON 对象,字段必须为 title, selling_points, cautions。', '不要改字段名,不要添加外层包裹对象,不要解释。', ].join('\n'), testCases: [ { id: 'tc-1', input: { kind: 'text', label: '商品文案', content: '便携手冲咖啡壶,容量 600ml,适合露营和办公室使用,主打双层不锈钢保温,注意不支持电磁炉直火加热。', }, }, ], snapshots: [ { id: 'a', label: 'A', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Workspace' }, promptText: [ '你是电商商品信息抽取助手。', '输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。', '为了后续扩展,把三个字段统一包在 payload 里。', '不要解释。', ].join('\n'), output: JSON.stringify({ payload: { product_name: '便携手冲咖啡壶', buyer_highlights: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'], cautions: ['不支持电磁炉直火加热'], }, }), modelKey: 'custom', versionLabel: 'workspace', }, { id: 'b', label: 'B', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 4, label: 'Previous' }, promptText: [ '你是电商商品信息抽取助手。', '只输出 JSON 对象,字段必须为 title, selling_points, cautions。', '不要改字段名,不要添加外层包裹对象,不要解释。', ].join('\n'), output: JSON.stringify({ title: '便携手冲咖啡壶', selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'], cautions: ['不支持电磁炉直火加热'], }), modelKey: 'custom', versionLabel: 'v4', }, { id: 'c', label: 'C', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, promptText: [ '你是电商商品信息抽取助手。', '输出一个 JSON 对象,字段改为 product_name, buyer_highlights, cautions。', '为了后续扩展,把三个字段统一包在 payload 里。', '不要解释。', ].join('\n'), output: JSON.stringify({ payload: { product_name: '便携手冲咖啡壶', buyer_highlights: [ '双场景使用:露营与办公室', '600ml 大容量', '双层不锈钢保温更稳', ], cautions: ['不支持电磁炉直火加热'], }, }), modelKey: 'deepseek', versionLabel: 'teacher-workspace', }, { id: 'd', label: 'D', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 4, label: 'Teacher Previous' }, promptText: [ '你是电商商品信息抽取助手。', '只输出 JSON 对象,字段必须为 title, selling_points, cautions。', '不要改字段名,不要添加外层包裹对象,不要解释。', ].join('\n'), output: JSON.stringify({ title: '便携手冲咖啡壶', selling_points: ['600ml 容量', '适合露营和办公室', '双层不锈钢保温'], cautions: ['不支持电磁炉直火加热'], }), modelKey: 'deepseek', versionLabel: 'teacher-v4', }, ], compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, expected: { stopSignals: { targetVsBaseline: ['regressed'], stopRecommendation: ['review'], }, pairSignals: { targetBaseline: ['regressed'], targetReference: ['none', 'minor'], referenceBaseline: ['unsupported'], }, conflictSignals: ['regressionOutweighsCosmeticGains'], }, }, { id: 'synthetic-legal-flat-not-unclear', kind: 'synthetic', title: '合成样本: 法务风险摘要应该判 flat 而不是 unclear', description: 'workspace prompt 只把表达风格改得更口语化,但目标输出与 previous 在风险结论和行动建议上没有实质变化。这个样本用于观察 judge 是否能稳定给出 flat,而不是因为措辞不同就退回 unclear。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', focus: '当两个版本在核心结论、风险点和动作建议上等价时,应更倾向于 flat,而不是把风格差异误判成信息不足。', targetWorkspacePrompt: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '用更简洁、偏业务同学可读的中文表达。', '不要添加解释。', ].join('\n'), targetPreviousPrompt: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '保持客观、精炼。', '不要添加解释。', ].join('\n'), testCases: [ { id: 'tc-1', input: { kind: 'text', label: '合同片段', content: '合作协议约定平台可单方修改结算周期,并在未通知的情况下暂停服务;违约责任仅约束供应商,不约束平台。', }, }, ], snapshots: [ { id: 'a', label: 'A', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Workspace' }, promptText: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '用更简洁、偏业务同学可读的中文表达。', '不要添加解释。', ].join('\n'), output: JSON.stringify({ risk_level: 'high', core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任明显失衡'], recommended_action: '要求补充通知义务、限制单方变更范围,并补齐平台违约责任。', }), modelKey: 'custom', versionLabel: 'workspace', }, { id: 'b', label: 'B', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 6, label: 'Previous' }, promptText: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '保持客观、精炼。', '不要添加解释。', ].join('\n'), output: JSON.stringify({ risk_level: 'high', core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任分配失衡'], recommended_action: '建议增加通知义务、限制单方修改权限,并要求平台承担对等违约责任。', }), modelKey: 'custom', versionLabel: 'v6', }, { id: 'c', label: 'C', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, promptText: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '用更简洁、偏业务同学可读的中文表达。', '不要添加解释。', ].join('\n'), output: JSON.stringify({ risk_level: 'high', core_risks: ['平台可单方改结算周期', '平台可未通知暂停服务', '违约责任缺乏对等性'], recommended_action: '要求把通知义务、变更边界和平台违约责任补齐后再推进。', }), modelKey: 'deepseek', versionLabel: 'teacher-workspace', }, { id: 'd', label: 'D', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 6, label: 'Teacher Previous' }, promptText: [ '你是法务风险摘要助手。', '输出一个 JSON 对象,字段为 risk_level, core_risks, recommended_action。', '保持客观、精炼。', '不要添加解释。', ].join('\n'), output: JSON.stringify({ risk_level: 'high', core_risks: ['平台可单方调整结算周期', '平台可在未通知情况下暂停服务', '违约责任缺乏对等性'], recommended_action: '建议补充通知义务、限制单方变更,并增加平台违约责任。', }), modelKey: 'deepseek', versionLabel: 'teacher-v6', }, ], compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, expected: { stopSignals: { targetVsBaseline: ['flat'], }, pairSignals: { targetBaseline: ['flat'], referenceBaseline: ['supported', 'mixed'], }, }, }, { id: 'synthetic-teaching-overfit-regression', kind: 'synthetic', title: '合成样本: 教学讲解里的样例口诀导致回退', description: 'workspace prompt 为当前题目硬塞了特定口诀和固定讲法,导致输出只贴当前样例,不再给出通用原理。这个样本用于校验系统能否识别“看似更像老师在说话,实际更窄更脆弱”的回退。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', focus: '如果工作区版本为了当前题目显得更顺口,却牺牲了可迁移的通用解释结构,应把它判为 regressed,并暴露较高过拟合风险。', targetWorkspacePrompt: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。', '不要添加题外扩展。', ].join('\n'), targetPreviousPrompt: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。', '不要添加题外扩展。', ].join('\n'), testCases: [ { id: 'tc-1', input: { kind: 'text', label: '题目输入', content: '请解释为什么 3/4 ÷ 1/2 = 3/2,给五年级学生听。', }, }, ], snapshots: [ { id: 'a', label: 'A', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Workspace' }, promptText: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。', '不要添加题外扩展。', ].join('\n'), output: JSON.stringify({ explanation: '因为半个单位能装两次,所以 3/4 里面能装 1/2 两次的一半,也就是 3/2。记住这句就行。', key_rule: '看到 3/4 ÷ 1/2 时,直接理解成答案翻倍。', common_mistake: '不要把 3/4 和 1/2 直接相除成 3/8。', }), modelKey: 'custom', versionLabel: 'workspace', }, { id: 'b', label: 'B', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 8, label: 'Previous' }, promptText: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。', '不要添加题外扩展。', ].join('\n'), output: JSON.stringify({ explanation: '除以 1/2 的意思是看 3/4 里面有几个“半个”。因为把一个数除以 1/2,等于看它能分成几个半个,所以可以改写成 3/4 × 2 = 3/2。', key_rule: '除以一个分数,就是乘以这个分数的倒数。', common_mistake: '不要把“除以 1/2”误算成“再除以 2”。', }), modelKey: 'custom', versionLabel: 'v8', }, { id: 'c', label: 'C', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, promptText: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '特别规则:当题目出现“3/4 ÷ 1/2”时,直接用“半个单位能装两次,所以答案翻倍”来讲,不要再解释一般规则。', '不要添加题外扩展。', ].join('\n'), output: JSON.stringify({ explanation: '可以把 3/4 ÷ 1/2 理解成“3/4 里面有几个半个”,但仍然要告诉学生一般规则:除以 1/2 等于乘 2,所以结果是 3/2。', key_rule: '除以一个分数,本质上是乘以它的倒数。', common_mistake: '不要只背这个题的口诀,换别的分数就会出错。', }), modelKey: 'deepseek', versionLabel: 'teacher-workspace', }, { id: 'd', label: 'D', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 8, label: 'Teacher Previous' }, promptText: [ '你是数学讲解助手。', '输出一个 JSON 对象,字段为 explanation, key_rule, common_mistake。', '先解释为什么“除以分数等于乘以它的倒数”,再回到题目演示。', '不要添加题外扩展。', ].join('\n'), output: JSON.stringify({ explanation: '3/4 ÷ 1/2 的意思是“3/4 里面有几个半个”。因为除以 1/2 就是乘 2,所以可以算成 3/4 × 2 = 3/2。', key_rule: '除以一个分数,就是乘以它的倒数。', common_mistake: '不要把“除以 1/2”误写成“乘以 1/2”。', }), modelKey: 'deepseek', versionLabel: 'teacher-v8', }, ], compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, expected: { stopSignals: { targetVsBaseline: ['regressed'], overfitRisk: ['high'], stopRecommendation: ['review'], }, pairSignals: { targetBaseline: ['regressed'], referenceBaseline: ['unsupported'], }, conflictSignals: ['regressionOutweighsCosmeticGains'], }, }, { id: 'synthetic-hiring-replica-semantic-instability', kind: 'synthetic', title: '合成样本: 招聘筛选里 replica 语义不稳定', description: 'workspace prompt 在单次输出里看起来比 previous 更结构化,但同 prompt 的 replica 却给出了不同的录用结论。这个样本用于校验系统是否能识别“单次胜出但语义不稳定”的情况。', mode: { functionMode: 'basic', subMode: 'system', }, evaluatorModelKey: 'deepseek', rewriteModelKey: 'deepseek', focus: '如果工作区版本在重复执行时连录用建议都发生漂移,就算首个结果更像样,也应优先暴露稳定性问题。', targetWorkspacePrompt: [ '你是简历筛选总结助手。', '只输出 JSON 对象,字段为 recommendation, strengths, risks。', 'recommendation 只能是 hire、hold、reject 之一。', 'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。', ].join('\n'), targetPreviousPrompt: [ '你是简历筛选总结助手。', '输出 recommendation, strengths, risks。', '结论尽量简洁。', ].join('\n'), testCases: [ { id: 'tc-1', input: { kind: 'text', label: '候选人信息', content: '候选人 6 年 B2B SaaS 产品经验,做过企业权限系统和审计日志,英语一般,过去一年没有直接带人经验。岗位要求:熟悉权限/审计场景,能与海外客户沟通,优先有团队协作经验。', }, }, ], snapshots: [ { id: 'a', label: 'A', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Workspace' }, promptText: [ '你是简历筛选总结助手。', '只输出 JSON 对象,字段为 recommendation, strengths, risks。', 'recommendation 只能是 hire、hold、reject 之一。', 'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。', ].join('\n'), output: JSON.stringify({ recommendation: 'hold', strengths: ['有 6 年 B2B SaaS 产品经验', '做过权限系统和审计日志,和岗位高度相关'], risks: ['英语一般,海外客户沟通能力待确认', '近期没有直接带人经验'], }), modelKey: 'custom', versionLabel: 'workspace', }, { id: 'b', label: 'B', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 5, label: 'Previous' }, promptText: [ '你是简历筛选总结助手。', '输出 recommendation, strengths, risks。', '结论尽量简洁。', ].join('\n'), output: JSON.stringify({ recommendation: 'hold', strengths: ['经验较匹配'], risks: ['英语一般'], }), modelKey: 'custom', versionLabel: 'v5', }, { id: 'c', label: 'C', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, promptText: [ '你是简历筛选总结助手。', '只输出 JSON 对象,字段为 recommendation, strengths, risks。', 'recommendation 只能是 hire、hold、reject 之一。', 'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。', ].join('\n'), output: JSON.stringify({ recommendation: 'hold', strengths: ['权限系统和审计日志经验与岗位核心场景强相关', 'B2B SaaS 背景成熟'], risks: ['英语一般,跨海外客户沟通需进一步验证', '缺少近期直接管理经验'], }), modelKey: 'deepseek', versionLabel: 'teacher-workspace', }, { id: 'd', label: 'D', testCaseId: 'tc-1', promptRef: { kind: 'version', version: 5, label: 'Teacher Previous' }, promptText: [ '你是简历筛选总结助手。', '输出 recommendation, strengths, risks。', '结论尽量简洁。', ].join('\n'), output: JSON.stringify({ recommendation: 'hold', strengths: ['岗位相关经验较多'], risks: ['英语一般,管理经历偏弱'], }), modelKey: 'deepseek', versionLabel: 'teacher-v5', }, { id: 'e', label: 'E', testCaseId: 'tc-1', promptRef: { kind: 'workspace', label: 'Replica' }, promptText: [ '你是简历筛选总结助手。', '只输出 JSON 对象,字段为 recommendation, strengths, risks。', 'recommendation 只能是 hire、hold、reject 之一。', 'strengths 和 risks 都要紧扣岗位要求,避免泛泛而谈。', ].join('\n'), output: JSON.stringify({ recommendation: 'hire', strengths: ['权限系统与审计日志经验高度匹配岗位核心需求', 'B2B SaaS 背景可直接上手复杂业务'], risks: ['英语一般,但可通过团队支持弥补', '近一年缺少直接带人经验'], }), modelKey: 'custom', versionLabel: 'workspace-replica', }, ], compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', e: 'replica', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, expected: { stopSignals: { stopRecommendation: ['review'], }, pairSignals: { targetBaseline: ['improved', 'flat'], targetReplica: ['unstable'], }, conflictSignals: ['improvementUnstableAcrossReplicas'], }, }, ]; function toPrettyJson(value) { return `${JSON.stringify(value, null, 2)}\n`; } function jsonFence(value) { return `\`\`\`json\n${JSON.stringify(value, null, 2)}\n\`\`\`\n`; } function textFence(value, language = '') { return `\`\`\`${language}\n${value}\n\`\`\`\n`; } async function ensureDir(target) { await fs.mkdir(target, { recursive: true }); } async function writeText(filePath, content) { await ensureDir(path.dirname(filePath)); await fs.writeFile(filePath, content, 'utf8'); } function renderMessagesMarkdown(messages) { return messages .map( (message, index) => `### Message ${index + 1}\n- role: ${message.role}\n\n${textFence(message.content)}`, ) .join('\n'); } function renderCallMarkdown(call, index) { const responseBlock = call.error ? `### Error\n${textFence(call.error)}` : `### Response\n${textFence(call.response || '')}`; return [ `## Call ${index + 1}`, `- phase: ${call.phase}`, `- modelKey: ${call.modelKey}`, '', '### Messages', renderMessagesMarkdown(call.messages), '', responseBlock, '', ].join('\n'); } const PAIR_JUDGE_PAYLOAD_MARKERS = [ 'Pair Judge Evidence Payload (JSON):', ]; const SYNTHESIS_PAYLOAD_MARKERS = [ 'Synthesis Payload (JSON):', ]; const REWRITE_PAYLOAD_MARKERS = [ 'Rewrite Payload (JSON):', ]; function extractJsonAfterMarker(content, markers) { const text = content || ''; for (const marker of markers) { const index = text.indexOf(marker); if (index === -1) continue; const candidate = text.slice(index + marker.length).trim(); if (!candidate) continue; try { return JSON.parse(candidate); } catch (_error) { return null; } } return null; } function collectPromptPayloadArtifacts(llmCalls) { const pairJudgePayloads = llmCalls .filter((call) => String(call.phase || '').startsWith('pair-judge:')) .map((call) => { const userMessage = call.messages?.find((message) => message.role === 'user')?.content || ''; return { phase: call.phase, payload: extractJsonAfterMarker(userMessage, PAIR_JUDGE_PAYLOAD_MARKERS), }; }) .filter((item) => item.payload); const synthesisCall = llmCalls.find( (call) => String(call.phase || '') === 'structured-compare-synthesis' ); const synthesisPayload = synthesisCall ? extractJsonAfterMarker( synthesisCall.messages?.find((message) => message.role === 'user')?.content || '', SYNTHESIS_PAYLOAD_MARKERS, ) : null; const rewriteCall = llmCalls.find((call) => String(call.phase || '').startsWith('rewrite:')); const rewritePayloadFromMessage = rewriteCall ? extractJsonAfterMarker( rewriteCall.messages?.find((message) => message.role === 'user')?.content || '', REWRITE_PAYLOAD_MARKERS, ) : null; return { pairJudgePayloads, synthesisPayload, rewritePayloadFromMessage, }; } function summarizeCaseResult(caseConfig, response) { const metadata = response.metadata || {}; return { compareMode: metadata.compareMode || null, summary: response.summary, score: response.score?.overall ?? null, improvements: response.improvements || [], stopSignals: metadata.compareStopSignals || null, conflictSignals: metadata.compareInsights?.conflictSignals || [], pairJudgements: metadata.compareJudgements?.map((judgement) => ({ pairType: judgement.pairType, pairSignal: judgement.pairSignal, verdict: judgement.verdict, confidence: judgement.confidence, })) || [], expected: caseConfig.expected || null, }; } function evaluateExpectations(expected, response) { if (!expected) { return []; } const metadata = response.metadata || {}; const results = []; const stopSignals = metadata.compareStopSignals || {}; const pairJudgements = metadata.compareJudgements || []; const conflictSignals = metadata.compareInsights?.conflictSignals || []; if (expected.stopSignals) { for (const [key, allowed] of Object.entries(expected.stopSignals)) { const actual = stopSignals[key]; results.push({ type: 'stopSignal', key, expected: allowed, actual: actual ?? null, matched: actual ? allowed.includes(actual) : false, }); } } if (expected.pairSignals) { for (const [pairType, allowed] of Object.entries(expected.pairSignals)) { const actual = pairJudgements .filter((item) => item.pairType === pairType) .map((item) => item.pairSignal); results.push({ type: 'pairSignal', key: pairType, expected: allowed, actual, matched: actual.some((value) => allowed.includes(value)), }); } } if (expected.conflictSignals) { for (const signal of expected.conflictSignals) { results.push({ type: 'conflictSignal', key: signal, expected: [signal], actual: conflictSignals, matched: conflictSignals.includes(signal), }); } } return results; } function renderExpectationMarkdown(expectationResults) { if (!expectationResults.length) { return '无预设断言,本样本用于探索式观察。\n'; } const header = '| 类型 | 键 | 期望 | 实际 | 是否命中 |\n| --- | --- | --- | --- | --- |\n'; const rows = expectationResults .map((item) => { const expected = Array.isArray(item.expected) ? item.expected.join(' / ') : String(item.expected); const actual = Array.isArray(item.actual) ? item.actual.join(' / ') : String(item.actual); return `| ${item.type} | ${item.key} | ${expected} | ${actual} | ${item.matched ? 'yes' : 'no'} |`; }) .join('\n'); return `${header}${rows}\n`; } function renderScenarioMarkdown(caseConfig) { return [ `# ${caseConfig.title}`, '', `- caseId: ${caseConfig.id}`, `- kind: ${caseConfig.kind}`, '', caseConfig.description, '', '## Focus', '', caseConfig.focus || '无', '', ].join('\n'); } function createLoggedLLMService(baseLLMService) { const calls = []; let currentPhase = 'idle'; const detectEvaluationPhase = (messages) => { const systemContent = messages?.[0]?.content || ''; const userContent = messages?.[1]?.content || ''; if (systemContent.includes('Structured_Compare_Pair_Judge') || systemContent.includes('结构化对比成对判断专家')) { const pairMatch = userContent.match(/Pair Key[::]\s*([^\n]+)/) || userContent.match(/Pair Key:\s*([^\n]+)/) || userContent.match(/"pairKey"\s*:\s*"([^"]+)"/); return `pair-judge:${pairMatch?.[1]?.trim() || 'unknown'}`; } if ( systemContent.includes('structured compare synthesizer') || systemContent.includes('结构化对比综合专家') ) { return 'structured-compare-synthesis'; } return currentPhase; }; const logged = { async sendMessage(messages, modelKey) { const phase = detectEvaluationPhase(messages); const entry = { phase, modelKey, attempts: [], messages: messages.map((item) => ({ role: item.role, content: item.content, })), }; for (let attempt = 1; attempt <= CALIBRATION_MAX_RETRIES; attempt += 1) { try { const response = await baseLLMService.sendMessage(messages, modelKey); entry.response = response; entry.retryCount = attempt - 1; calls.push(entry); return response; } catch (error) { const errorMessage = error instanceof Error ? error.message : String(error); entry.attempts.push({ attempt, error: errorMessage, }); if (attempt >= CALIBRATION_MAX_RETRIES || !isRetryableCalibrationError(error)) { entry.error = errorMessage; entry.retryCount = attempt - 1; calls.push(entry); throw error; } await sleep(attempt * 3000); } } }, async sendMessageStream(messages, modelKey, callbacks) { return baseLLMService.sendMessageStream(messages, modelKey, callbacks); }, async withPhase(phase, fn) { const previous = currentPhase; currentPhase = phase; try { return await fn(); } finally { currentPhase = previous; } }, getCalls() { return calls.slice(); }, clearCalls() { calls.length = 0; }, }; return logged; } async function createServices() { const storage = new MemoryStorageProvider(); const preferenceService = createPreferenceService(storage); const languageService = createTemplateLanguageService(preferenceService); await languageService.initialize(); await languageService.setLanguage('zh-CN'); const templateManager = createTemplateManager(storage, languageService); const modelManager = createModelManager(storage); await modelManager.ensureInitialized(); const customModel = await modelManager.getModel('custom'); if (!customModel?.enabled) { throw new Error('custom 模型未启用,请检查 .env.local 中的 VITE_CUSTOM_API_* 配置。'); } const deepseekModel = await modelManager.getModel('deepseek'); if (!deepseekModel?.enabled) { throw new Error('deepseek 模型未启用,请检查 .env.local 中的 VITE_DEEPSEEK_API_KEY。'); } await modelManager.updateModel('deepseek', { name: 'DeepSeek Chat (Calibration)', paramOverrides: { ...(deepseekModel.paramOverrides || {}), temperature: 0.2, timeout: CALIBRATION_TIMEOUT_MS, }, }); await modelManager.updateModel('custom', { name: 'SiliconFlow Qwen3-32B (Calibration)', paramOverrides: { ...(customModel.paramOverrides || {}), temperature: 0.2, timeout: CALIBRATION_TIMEOUT_MS, }, }); const baseLLMService = createLLMService(modelManager); const llmService = createLoggedLLMService(baseLLMService); const evaluationService = createEvaluationService(llmService, modelManager, templateManager); return { modelManager, templateManager, llmService, evaluationService, }; } async function runLiveCase(caseConfig, services) { const executions = []; const runExecution = async ({ snapshotId, label, modelKey, promptText, promptRef, versionLabel }) => { const messages = [ { role: 'system', content: promptText }, { role: 'user', content: caseConfig.userInput }, ]; const output = await services.llmService.withPhase(`execute:${caseConfig.id}:${snapshotId}`, () => services.llmService.sendMessage(messages, modelKey), ); const snapshot = { id: snapshotId, label, testCaseId: 'tc-1', promptRef, promptText, output: output.trim(), modelKey, versionLabel, }; executions.push(snapshot); return snapshot; }; const snapshots = [ await runExecution({ snapshotId: 'a', label: 'A', modelKey: caseConfig.targetModelKey, promptText: caseConfig.workspacePrompt, promptRef: { kind: 'workspace', label: 'Target Workspace' }, versionLabel: 'workspace', }), await runExecution({ snapshotId: 'b', label: 'B', modelKey: caseConfig.targetModelKey, promptText: caseConfig.previousPrompt, promptRef: { kind: 'version', version: 1, label: 'Target Previous' }, versionLabel: 'previous', }), await runExecution({ snapshotId: 'c', label: 'C', modelKey: caseConfig.teacherModelKey, promptText: caseConfig.workspacePrompt, promptRef: { kind: 'workspace', label: 'Teacher Workspace' }, versionLabel: 'teacher-workspace', }), await runExecution({ snapshotId: 'd', label: 'D', modelKey: caseConfig.teacherModelKey, promptText: caseConfig.previousPrompt, promptRef: { kind: 'version', version: 1, label: 'Teacher Previous' }, versionLabel: 'teacher-previous', }), ]; const request = { type: 'compare', evaluationModelKey: caseConfig.evaluatorModelKey, mode: caseConfig.mode, focus: { content: caseConfig.focus, source: 'system', priority: 'highest', }, target: { workspacePrompt: caseConfig.workspacePrompt, }, testCases: [ { id: 'tc-1', input: { kind: 'text', label: '用户输入', content: caseConfig.userInput, }, }, ], snapshots, compareHints: { mode: 'structured', snapshotRoles: { a: 'target', b: 'baseline', c: 'reference', d: 'referenceBaseline', }, hasSharedTestCases: true, hasSamePromptSnapshots: true, hasCrossModelComparison: true, }, }; const response = await services.llmService.withPhase(`evaluate:${caseConfig.id}`, () => services.evaluationService.evaluate(request), ); return { request, response, executions, }; } async function runSyntheticCase(caseConfig, services) { const request = { type: 'compare', evaluationModelKey: caseConfig.evaluatorModelKey, mode: caseConfig.mode, focus: { content: caseConfig.focus, source: 'system', priority: 'highest', }, target: { workspacePrompt: caseConfig.targetWorkspacePrompt, referencePrompt: caseConfig.targetPreviousPrompt, }, testCases: caseConfig.testCases, snapshots: caseConfig.snapshots, compareHints: caseConfig.compareHints, }; const response = await services.llmService.withPhase(`evaluate:${caseConfig.id}`, () => services.evaluationService.evaluate(request), ); return { request, response, }; } async function writeCaseArtifacts(caseConfig, result, services) { const caseDir = path.join(OUTPUT_ROOT, caseConfig.id); await ensureDir(caseDir); const rewritePayload = buildRewritePayload({ result: result.response, type: 'compare', mode: caseConfig.mode, language: 'zh', workspacePrompt: caseConfig.kind === 'live' ? caseConfig.workspacePrompt : caseConfig.targetWorkspacePrompt, referencePrompt: caseConfig.kind === 'live' ? caseConfig.previousPrompt : caseConfig.targetPreviousPrompt, }); const rewriteInput = buildRewritePromptFromEvaluation({ result: result.response, type: 'compare', mode: caseConfig.mode, language: 'zh', workspacePrompt: caseConfig.kind === 'live' ? caseConfig.workspacePrompt : caseConfig.targetWorkspacePrompt, referencePrompt: caseConfig.kind === 'live' ? caseConfig.previousPrompt : caseConfig.targetPreviousPrompt, }); const rewriteOutput = await services.llmService.withPhase(`rewrite:${caseConfig.id}`, () => services.llmService.sendMessage([{ role: 'user', content: rewriteInput }], caseConfig.rewriteModelKey), ); const llmCalls = services.llmService.getCalls(); const promptPayloadArtifacts = collectPromptPayloadArtifacts(llmCalls); const expectationResults = evaluateExpectations(caseConfig.expected, result.response); const caseSummary = summarizeCaseResult(caseConfig, result.response); await writeText(path.join(caseDir, 'scenario.md'), renderScenarioMarkdown(caseConfig)); await writeText(path.join(caseDir, 'request.json'), toPrettyJson(result.request)); await writeText(path.join(caseDir, 'request.md'), jsonFence(result.request)); await writeText(path.join(caseDir, 'response.json'), toPrettyJson(result.response)); await writeText(path.join(caseDir, 'response.md'), jsonFence(result.response)); await writeText(path.join(caseDir, 'pair-judge-payloads.json'), toPrettyJson(promptPayloadArtifacts.pairJudgePayloads)); await writeText( path.join(caseDir, 'synthesis-payload.json'), toPrettyJson(promptPayloadArtifacts.synthesisPayload), ); await writeText(path.join(caseDir, 'rewrite-payload.json'), toPrettyJson(rewritePayload)); await writeText(path.join(caseDir, 'rewrite-input.txt'), `${rewriteInput}\n`); await writeText(path.join(caseDir, 'rewrite-output.txt'), `${rewriteOutput.trim()}\n`); await writeText(path.join(caseDir, 'llm-calls.json'), toPrettyJson(llmCalls)); await writeText( path.join(caseDir, 'llm-calls.md'), ['# LLM Calls', '', ...llmCalls.map((call, index) => renderCallMarkdown(call, index))].join('\n'), ); await writeText(path.join(caseDir, 'summary.json'), toPrettyJson({ generatedAt: nowIso, case: { id: caseConfig.id, title: caseConfig.title, kind: caseConfig.kind, }, summary: caseSummary, expectationResults, })); if (result.executions) { await writeText(path.join(caseDir, 'executions.json'), toPrettyJson(result.executions)); } const summaryMarkdown = [ `# ${caseConfig.title}`, '', `- caseId: ${caseConfig.id}`, `- kind: ${caseConfig.kind}`, `- generatedAt: ${nowIso}`, '', '## Description', '', caseConfig.description, '', '## Compare Result', '', jsonFence(caseSummary), '## Expectation Check', '', renderExpectationMarkdown(expectationResults), '', '## Rewrite Output', '', textFence(rewriteOutput.trim()), ].join('\n'); await writeText(path.join(caseDir, 'summary.md'), summaryMarkdown); return { caseSummary, expectationResults, }; } async function writeOverallSummary(results) { const rows = results.map((item) => { const matched = item.expectationResults.length ? item.expectationResults.filter((entry) => entry.matched).length : null; const total = item.expectationResults.length || null; return { caseId: item.caseConfig.id, title: item.caseConfig.title, kind: item.caseConfig.kind, score: item.caseSummary.score, stopRecommendation: item.caseSummary.stopSignals?.stopRecommendation || null, targetVsBaseline: item.caseSummary.stopSignals?.targetVsBaseline || null, targetVsReferenceGap: item.caseSummary.stopSignals?.targetVsReferenceGap || null, expectationMatched: matched, expectationTotal: total, }; }); const markdown = [ '# Structured Compare Calibration Summary', '', `- generatedAt: ${nowIso}`, `- outputRoot: ${OUTPUT_ROOT}`, '', '| Case | Kind | Score | targetVsBaseline | targetVsReferenceGap | stopRecommendation | Expectation Match |', '| --- | --- | --- | --- | --- | --- | --- |', ...rows.map((row) => { const expectationText = row.expectationMatched === null ? 'exploratory' : `${row.expectationMatched}/${row.expectationTotal}`; return `| ${row.caseId} | ${row.kind} | ${row.score} | ${row.targetVsBaseline} | ${row.targetVsReferenceGap} | ${row.stopRecommendation} | ${expectationText} |`; }), '', '## Notes', '', '- synthetic cases 用来检验 judge / synthesis 的提示词边界。', '- live case 用来观察真实 target/teacher 执行结果在 structured compare 下是否能收敛成合理结论。', '- 每个 case 子目录内都保存了 compare request、compare result、rewrite input / output,以及完整 LLM 调用日志。', '', ].join('\n'); await writeText(path.join(OUTPUT_ROOT, 'summary.json'), toPrettyJson({ generatedAt: nowIso, rows, })); await writeText(path.join(OUTPUT_ROOT, 'summary.md'), markdown); } async function main() { await fs.rm(OUTPUT_ROOT, { recursive: true, force: true }); await ensureDir(OUTPUT_ROOT); const services = await createServices(); const cases = [LIVE_BASIC_SYSTEM_CASE, ...SYNTHETIC_CASES]; const results = []; for (const caseConfig of cases) { services.llmService.clearCalls(); const result = caseConfig.kind === 'live' ? await runLiveCase(caseConfig, services) : await runSyntheticCase(caseConfig, services); const artifacts = await writeCaseArtifacts(caseConfig, result, services); results.push({ caseConfig, ...artifacts, }); } await writeOverallSummary(results); } main().catch((error) => { console.error('[structured-compare-calibration] failed:', error); process.exitCode = 1; });