mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-21 22:42:47 +08:00
* refactor: extract agent core package Introduce packages/agent-core as the OpenClaw-owned home for reusable agent loop, harness, session, prompt, and runtime dependency contracts. * refactor: extract shared llm runtime Move provider model registries, stream wrappers, OAuth helpers, and LLM utilities into src/llm with plugin-sdk barrels instead of depending on the old embedded runtime layout. * refactor: remove pi runtime internals Rename remaining Pi-shaped agent surfaces to OpenClaw agent runtime names, delete obsolete Pi docs and package graph checks, and add the third-party notice for incorporated code. * refactor: tighten agent session runtime Make agent-core/runtime dependencies explicit, consolidate compaction and session transcript helpers, and move model/session helpers behind OpenClaw-owned contracts. * refactor: remove static model and pi auth paths Drop static model catalogs and Pi auth bridges, move model/provider facts to manifest-owned runtime contracts, and harden internal embedded-agent utilities. 
* refactor: remove legacy provider compat paths * docs: remove agent parity notes * fix: skip provider wildcard metadata parsing * refactor: share session extension sdk loading * refactor: inline acpx proxy error formatter * refactor: fold edit recovery into edit tool * fix: accept extension batch separator * test: align startup provider plugin expectations * fix: restore provider-scoped release discovery * test: align static asset packaging expectations * fix: run static provider catalogs during scoped discovery * fix: add provider entry catalogs for scoped live discovery * fix: load lightweight provider catalog entries * fix: refresh provider-scoped plugin metadata * fix: keep provider catalog entries on release live path * fix: keep static manifest models in release live checks * fix: harden release model discovery * fix: reduce OpenAI live cache probe reasoning * fix: disable OpenAI cache probe reasoning * ci: extend OpenAI gateway live timeout * fix: extend live gateway model budget * fix: stabilize release validation regressions * fix: honor provider aliases in model rows * fix: stabilize release validation lanes * fix: stabilize release memory qa * ci: stabilize release validation lanes * ci: prefer ipv4 for live docker node calls * fix: restore shared tool-call stream wrapper * ci: remove legacy pi test shard alias * fix: clean up embedded agent test drift * fix: stabilize runtime alias status * fix: clean up embedded agent ci drift * fix: restore release ci invariants * fix: clean up post-rebase runtime drift * fix: restore release ci checks * fix: restore release ci after rebase * fix: remove stale pi runtime path * test: align compaction runtime expectations * test: update plugin prerelease expectations * fix: handle claude live tool approvals * fix: stabilize release validation gates * fix: finish agent runtime import * test: finish post-rebase agent runtime mocks * fix: keep codex compaction native * fix: stabilize codex app-server hook tests * test: 
isolate codex diagnostic active run * test: remove codex diagnostic completion race # Conflicts: # extensions/codex/src/app-server/run-attempt.test.ts * ci: fix full release manifest performance run id * refactor: narrow llm plugin sdk boundary * chore: drop generated google boundary stamps * fix: repair rebase fallout * fix: clean up rebased runtime references * fix: decode codex jwt payloads as base64url * fix: preserve shipped pi runtime alias * fix: add scoped sdk virtual modules * fix: decode llm codex oauth jwt as base64url * fix: avoid stale vertex adc negative cache * fix: harden tool arg decoding and codeql path * fix: keep vertex adc negative checks live * refactor: consolidate codex jwt and edit helpers * fix: await codex oauth node runtime imports * fix: preserve sdk tool and notice contracts * fix: preserve shipped compat config boundaries * fix: align codex oauth callback host * fix: terminate agent-core loop streams on failure * fix: keep codex oauth callback alive during fallback * ci: include session tools in critical codeql scans * fix: keep Cloudflare Anthropic provider auth header * docs: redirect legacy pi runtime pages * fix: honor bundled web provider compat discovery * fix: protect session output spill files * fix: keep legacy agent dir env blocked * fix: contain auto-discovered skill symlinks * fix: harden agent core sdk proxy surfaces * fix: restore approval reaction sdk compat * fix: keep live docker runs bounded * fix: keep codex oauth redirect host aligned * fix: resolve post-rebase agent runtime drift * fix: redact anthropic oauth parse failures * fix: preserve responses strict tool shaping * fix: repair agent runtime rebase cleanup * docs: redirect retired parity pages * fix: bound auto-discovered resources to roots * fix: repair post-rebase agent test drift * fix: preserve bundled provider allowlist migration * fix: preserve manifest-owned provider aliases * fix: declare photon image dependency * fix: keep provider headers out of 
proxy body * fix: preserve shipped env aliases * fix: refresh control ui i18n generated state * fix: quote read fallback paths * fix: preview edits through configured backend * test: satisfy core test typecheck * fix: preserve ZAI usage auth fallback * test: repair codex diagnostic test * fix: repair agent runtime rebase drift * test: finish embedded runner import rename * fix: repair agent runtime rebase integrations * test: align compaction oauth fallback expectations * fix: allow sdk-auth session models * fix: update doctor tool schema import * fix: preserve bedrock plugin region * fix: stream harmony-like prose immediately * ci: include session runtime in codeql shards * fix: repair latest rebase integrations * fix: honor explicit codex websocket transport * fix: keep openai-compatible credentials provider-scoped * fix: refresh sdk api baseline after rebase * fix: route cli runtime aliases through openclaw harness * test: rename stale harness mock expectation * test: rename embedded agent overflow calls * test: clean embedded auth test wording * test: use openclaw stream types in deepinfra cache test * fix: refresh sdk api baseline on latest main * fix: honor bundled discovery compat allowlists * fix: refresh sdk api baseline after latest rebase * fix: remove stale rebase imports * test: rename stale model catalog mock * test: mock renamed doctor runtime modules * fix: map canonical kimi env auth * fix: use internal model registry in bench script * fix: migrate deepinfra provider catalog entry * fix: enforce builtin tool suppression * fix: route compaction auth and proxy payloads safely * refactor: prune unused llm registry leftovers * test: update codex hooks session import * test: fix model picker ci coverage * test: align model picker auth mock types
1239 lines
39 KiB
TypeScript
1239 lines
39 KiB
TypeScript
import fs from "node:fs/promises";
|
|
import path from "node:path";
|
|
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
|
import {
|
|
formatGatewayLogSentinelSummary,
|
|
type GatewayLogSentinelFinding,
|
|
} from "./gateway-log-sentinel.js";
|
|
import {
|
|
buildHarnessParityCell,
|
|
buildHarnessParityResult,
|
|
type HarnessParityDrift,
|
|
type HarnessRuntimeParityCell,
|
|
type RuntimeParitySystemPromptReport,
|
|
} from "./harness-parity.js";
|
|
import {
|
|
runRuntimeParityScenario,
|
|
type RuntimeParityCell,
|
|
type RuntimeParityDrift,
|
|
type RuntimeParityResult,
|
|
type RuntimeParityToolCall,
|
|
} from "./runtime-parity.js";
|
|
import { buildTokenEfficiencyReport } from "./token-efficiency-report.js";
|
|
|
|
/**
 * Every verdict string accepted by this module.
 *
 * Declared `as const` so the `QaConfidenceVerdict` union can be derived from
 * the tuple, and so the runtime checks in `isQaConfidenceVerdict` and the
 * error text in `readVerdict` stay in sync with the type.
 */
export const QA_CONFIDENCE_VERDICTS = [
  "pass",
  "product-bug",
  "qa-harness-bug",
  "fixture-bug",
  "optional-gap",
  "mock-limitation",
  "environment-blocked",
] as const;
|
|
|
|
/** Union of the string literals listed in `QA_CONFIDENCE_VERDICTS`. */
export type QaConfidenceVerdict = (typeof QA_CONFIDENCE_VERDICTS)[number];
|
|
|
|
/**
 * Artifact shapes a manifest lane may declare.
 *
 * The kind selects which evaluator `evaluateLaneArtifact` runs against the
 * lane's parsed JSON artifact; `readLaneKind` rejects any other string.
 */
export type QaConfidenceLaneKind =
  | "qa-suite-summary"
  | "runtime-parity-summary"
  | "harness-parity-summary"
  | "token-efficiency-summary"
  | "jsonl-replay-summary"
  | "self-test-summary"
  | "generic-pass-summary";
|
|
|
|
/** One lane entry of a confidence manifest, as produced by `normalizeManifestLane`. */
export type QaConfidenceManifestLane = {
  /** Lane identifier; `normalizeQaConfidenceManifest` rejects duplicates. */
  id: string;
  /** Non-empty display title. */
  title: string;
  /** Selects the evaluator applied to the artifact. */
  kind: QaConfidenceLaneKind;
  /** Artifact path; absolute, or relative to the artifact root (see `resolveArtifactPath`). */
  artifact: string;
  /** Defaults to `true` when the manifest does not provide a boolean. */
  required: boolean;
  /**
   * Verdict assigned when the lane fails and the evaluator did not supply its
   * own status/verdict (see `classifiedFailureResult`). Without it such a
   * failure is reported as `unknown`.
   */
  failureVerdict?: Exclude<QaConfidenceVerdict, "pass" | "environment-blocked">;
  /** Verdict assigned when the artifact file does not exist (see `resultForMissingLane`). */
  missingVerdict?: "environment-blocked" | "optional-gap";
  /** Details text used together with `missingVerdict`. */
  missingReason?: string;
  /** When set, every token-efficiency row must carry this `usageSource` label. */
  expectedTokenUsageSource?: "mock-estimate" | "live-usage";
  /** Id of the lane whose `pass` status backfills this lane's skipped rows. */
  skipBackfillLane?: string;
  // The remaining fields are copied verbatim onto the lane result by `baseLaneResult`.
  productImpact?: string;
  qaImpact?: string;
  issue?: string;
  ownerAction?: string;
  labels?: string[];
};
|
|
|
|
/**
 * Normalized confidence manifest.
 *
 * `normalizeQaConfidenceManifest` only accepts `version === 1`, a non-empty
 * `profile`, and at least one lane with unique ids.
 */
export type QaConfidenceManifest = {
  version: 1;
  profile: string;
  lanes: QaConfidenceManifestLane[];
};
|
|
|
|
/**
 * Outcome bucket of an evaluated lane.
 *
 * `missing` and `unknown` are both treated as unclassified by
 * `failuresForLaneResults`, and both contribute to `counts.unknown` in
 * `countLaneResults`.
 */
export type QaConfidenceLaneStatus = "pass" | "fail" | "blocked" | "missing" | "unknown";
|
|
|
|
/** Result of evaluating one manifest lane against its artifact. */
export type QaConfidenceLaneResult = {
  // Copied from the manifest lane by `baseLaneResult`.
  id: string;
  title: string;
  kind: QaConfidenceLaneKind;
  artifact: string;
  /**
   * Path reported for the artifact: the manifest's relative path, or only the
   * file name when the manifest path was absolute (see `baseLaneResult`).
   */
  artifactPath: string;
  required: boolean;
  status: QaConfidenceLaneStatus;
  /** Classification of the outcome; absent for unclassified lanes. */
  verdict?: QaConfidenceVerdict;
  /** Human-readable explanation of how the status was reached. */
  details: string;
  // Optional manifest annotations, copied through unchanged.
  productImpact?: string;
  qaImpact?: string;
  issue?: string;
  ownerAction?: string;
  labels?: string[];
  /** Number of skipped rows reported by the evaluator, when non-zero. */
  skippedCount?: number;
  skipBackfillLane?: string;
  /** Set by `applySkipBackfillState`: whether the backfill lane has status `pass`. */
  skipBackfilled?: boolean;
};
|
|
|
|
/**
 * Aggregate report over all evaluated lanes.
 *
 * NOTE(review): the code that builds this report is outside the visible part
 * of the file; the exact meaning of the strict/pass flags should be confirmed
 * against it.
 */
export type QaConfidenceReport = {
  generatedAt: string;
  profile: string;
  strictZeroUnknowns: boolean;
  strictGlobalPass: boolean;
  pass: boolean;
  zeroUnknowns: boolean;
  globalPass: boolean;
  /** Per-status tallies; `countLaneResults` counts `missing` lanes in `unknown` as well. */
  counts: {
    total: number;
    passed: number;
    failed: number;
    blocked: number;
    missing: number;
    unknown: number;
  };
  failures: string[];
  lanes: QaConfidenceLaneResult[];
};
|
|
|
|
/**
 * One canary entry of a confidence self-test summary.
 *
 * `evaluateSelfTestSummary` looks canaries up by `id` and requires
 * `detected === true` for every expected canary id.
 */
export type QaConfidenceSelfTestCanary = {
  id: string;
  category:
    | "prompt"
    | "tool-schema"
    | "tool-call"
    | "tool-result"
    | "failure-mode"
    | "token-efficiency"
    | "jsonl-replay";
  /** Whether the injected regression was caught. */
  detected: boolean;
  // NOTE(review): presumably the verdict the canary should have produced — confirm with the producer.
  expectedVerdict: Exclude<QaConfidenceVerdict, "pass" | "environment-blocked">;
  details: string;
};
|
|
|
|
/**
 * Self-test summary artifact shape.
 *
 * `evaluateSelfTestSummary` reads `pass` and `canaries` from the parsed
 * artifact of a `self-test-summary` lane.
 */
export type QaConfidenceSelfTestSummary = {
  generatedAt: string;
  pass: boolean;
  canaries: QaConfidenceSelfTestCanary[];
};
|
|
|
|
/**
 * Canary ids that a self-test summary must contain.
 *
 * `evaluateSelfTestSummary` reports the lane as `unknown` when any of these is
 * absent, and as not passed when any of them was not detected.
 */
const QA_CONFIDENCE_SELF_TEST_CANARY_IDS = [
  "prompt-drift",
  "tool-description-schema-drift",
  "runtime-tool-call-drop",
  "tool-result-mismatch",
  "failure-mode-drift",
  "token-efficiency-regression",
  "jsonl-replay-ordering-drift",
] as const;
|
|
|
|
/**
 * Type guard for object-like values: accepts any non-null `object` that is
 * not an array; rejects primitives, `null`, `undefined`, and arrays.
 */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
|
|
|
|
/**
 * Returns the trimmed text when `value` is a string with non-whitespace
 * content; otherwise `undefined`.
 */
function readString(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length === 0 ? undefined : trimmed;
}
|
|
|
|
/** Returns `value` when it is a finite number; otherwise `undefined` (NaN and ±Infinity are rejected). */
function readNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
|
|
|
/** Returns `value` when it is exactly `true` or `false`; otherwise `undefined` (no truthiness coercion). */
function readBoolean(value: unknown): boolean | undefined {
  if (value === true || value === false) {
    return value;
  }
  return undefined;
}
|
|
|
|
/**
 * Returns a fresh copy of `value` when it is an array made up entirely of
 * strings; returns `undefined` for non-arrays and for arrays holding any
 * non-string entry.
 */
function readStringArray(value: unknown): string[] | undefined {
  if (!Array.isArray(value)) {
    return undefined;
  }
  const strings: string[] = [];
  for (const entry of value) {
    if (typeof entry !== "string") {
      // A single non-string entry invalidates the whole array.
      return undefined;
    }
    strings.push(entry);
  }
  return strings;
}
|
|
|
|
/**
 * Type guard for gateway log sentinel findings: an object whose `kind` is a
 * non-blank string and whose `verdict` (after trimming) is one of the known
 * confidence verdicts.
 */
function isGatewayLogSentinelFinding(value: unknown): value is GatewayLogSentinelFinding {
  if (!isRecord(value)) {
    return false;
  }
  if (readString(value.kind) === undefined) {
    return false;
  }
  const verdictText = readString(value.verdict);
  return verdictText !== undefined && isQaConfidenceVerdict(verdictText);
}
|
|
|
|
/**
 * Recursively walks a parsed payload and gathers every valid sentinel finding
 * listed under a `gatewayLogSentinels` or `sentinelFindings` array, at any
 * nesting depth.
 *
 * Findings are returned in traversal order. The two sentinel keys themselves
 * are never descended into, so anything nested inside a finding is ignored.
 */
function collectGatewayLogSentinels(value: unknown): GatewayLogSentinelFinding[] {
  const sentinelKeys = ["gatewayLogSentinels", "sentinelFindings"];
  const collected: GatewayLogSentinelFinding[] = [];

  function walk(node: unknown): void {
    if (Array.isArray(node)) {
      for (const element of node) {
        walk(element);
      }
      return;
    }
    if (!isRecord(node)) {
      return;
    }
    // Harvest findings declared directly on this object first...
    for (const sentinelKey of sentinelKeys) {
      const listed = node[sentinelKey];
      if (!Array.isArray(listed)) {
        continue;
      }
      for (const entry of listed) {
        if (isGatewayLogSentinelFinding(entry)) {
          collected.push(entry);
        }
      }
    }
    // ...then descend into every other property.
    for (const key of Object.keys(node)) {
      if (!sentinelKeys.includes(key)) {
        walk(node[key]);
      }
    }
  }

  walk(value);
  return collected;
}
|
|
|
|
/** Type guard: true when `value` exactly matches one entry of `QA_CONFIDENCE_VERDICTS`. */
function isQaConfidenceVerdict(value: string): value is QaConfidenceVerdict {
  return QA_CONFIDENCE_VERDICTS.some((verdict) => verdict === value);
}
|
|
|
|
/**
 * Reads a mandatory, non-blank string property from a manifest lane record.
 *
 * @param record - Lane object to read from.
 * @param key - Property name; also used in the error message.
 * @returns The trimmed string.
 * @throws Error when the property is absent, not a string, or blank.
 */
function readRequiredString(record: Record<string, unknown>, key: string): string {
  const text = readString(record[key]);
  if (text === undefined) {
    throw new Error(`confidence manifest lane missing ${key}`);
  }
  return text;
}
|
|
|
|
/**
 * Parses an optional verdict field of a manifest lane.
 *
 * @param value - Raw field value.
 * @param key - Field name, used in the error message.
 * @returns The verdict, or `undefined` when the field is absent or blank.
 * @throws Error when the field holds a string that is not a known verdict.
 */
function readVerdict(value: unknown, key: string): QaConfidenceVerdict | undefined {
  const candidate = readString(value);
  if (candidate === undefined) {
    return undefined;
  }
  if (isQaConfidenceVerdict(candidate)) {
    return candidate;
  }
  const allowed = QA_CONFIDENCE_VERDICTS.join(", ");
  throw new Error(`confidence manifest ${key} must be one of ${allowed}`);
}
|
|
|
|
/**
 * Parses the mandatory `kind` field of a manifest lane.
 *
 * @returns The matching lane kind.
 * @throws Error when the value is absent, blank, or not a known lane kind.
 */
function readLaneKind(value: unknown): QaConfidenceLaneKind {
  const knownKinds: readonly QaConfidenceLaneKind[] = [
    "qa-suite-summary",
    "runtime-parity-summary",
    "harness-parity-summary",
    "token-efficiency-summary",
    "jsonl-replay-summary",
    "self-test-summary",
    "generic-pass-summary",
  ];
  const text = readString(value);
  const kind = knownKinds.find((candidate) => candidate === text);
  if (kind === undefined) {
    throw new Error(`unknown confidence manifest lane kind: ${text ?? "missing"}`);
  }
  return kind;
}
|
|
|
|
/**
 * Validates one raw manifest lane and converts it to a
 * `QaConfidenceManifestLane`.
 *
 * Optional fields are only present on the result when the manifest supplied a
 * usable value; `required` defaults to `true`.
 *
 * @throws Error when the lane is not an object, a verdict or token usage
 *   source has a disallowed value, or `id`/`title`/`kind`/`artifact` is missing.
 */
function normalizeManifestLane(value: unknown): QaConfidenceManifestLane {
  if (!isRecord(value)) {
    throw new Error("confidence manifest lanes must be objects");
  }

  // Constrained optional fields are validated before the required ones.
  const failureVerdict = readVerdict(value.failureVerdict, "failureVerdict");
  if (failureVerdict === "pass" || failureVerdict === "environment-blocked") {
    throw new Error("confidence manifest failureVerdict must classify an actual failure");
  }

  const missingVerdict = readVerdict(value.missingVerdict, "missingVerdict");
  if (
    !(
      missingVerdict === undefined ||
      missingVerdict === "environment-blocked" ||
      missingVerdict === "optional-gap"
    )
  ) {
    throw new Error(
      "confidence manifest missingVerdict must be environment-blocked or optional-gap",
    );
  }

  const expectedTokenUsageSource = readString(value.expectedTokenUsageSource);
  if (
    !(
      expectedTokenUsageSource === undefined ||
      expectedTokenUsageSource === "mock-estimate" ||
      expectedTokenUsageSource === "live-usage"
    )
  ) {
    throw new Error(
      "confidence manifest expectedTokenUsageSource must be mock-estimate or live-usage",
    );
  }

  const lane: QaConfidenceManifestLane = {
    id: readRequiredString(value, "id"),
    title: readRequiredString(value, "title"),
    kind: readLaneKind(value.kind),
    artifact: readRequiredString(value, "artifact"),
    required: readBoolean(value.required) ?? true,
  };

  // Optional fields are attached in a fixed order so the key order is stable.
  if (failureVerdict) {
    lane.failureVerdict = failureVerdict;
  }
  if (missingVerdict) {
    lane.missingVerdict = missingVerdict;
  }
  const missingReason = readString(value.missingReason);
  if (missingReason) {
    lane.missingReason = missingReason;
  }
  if (expectedTokenUsageSource) {
    lane.expectedTokenUsageSource = expectedTokenUsageSource;
  }
  const skipBackfillLane = readString(value.skipBackfillLane);
  if (skipBackfillLane) {
    lane.skipBackfillLane = skipBackfillLane;
  }
  const productImpact = readString(value.productImpact);
  if (productImpact) {
    lane.productImpact = productImpact;
  }
  const qaImpact = readString(value.qaImpact);
  if (qaImpact) {
    lane.qaImpact = qaImpact;
  }
  const issue = readString(value.issue);
  if (issue) {
    lane.issue = issue;
  }
  const ownerAction = readString(value.ownerAction);
  if (ownerAction) {
    lane.ownerAction = ownerAction;
  }
  const labels = readStringArray(value.labels);
  if (labels) {
    lane.labels = labels;
  }
  return lane;
}
|
|
|
|
/**
 * Validates an untyped value (typically parsed JSON) as a confidence manifest.
 *
 * @param value - Raw manifest payload.
 * @returns The normalized manifest with every lane validated.
 * @throws Error when the payload is not an object, `version` is not `1`,
 *   `profile` is missing, there are no lanes, a lane is invalid, or two lanes
 *   share an id.
 */
export function normalizeQaConfidenceManifest(value: unknown): QaConfidenceManifest {
  if (!isRecord(value)) {
    throw new Error("confidence manifest must be an object");
  }
  if (value.version !== 1) {
    throw new Error("confidence manifest version must be 1");
  }
  const profile = readString(value.profile);
  if (profile === undefined) {
    throw new Error("confidence manifest missing profile");
  }
  const rawLanes: unknown[] = Array.isArray(value.lanes) ? value.lanes : [];
  if (rawLanes.length === 0) {
    throw new Error("confidence manifest must include at least one lane");
  }

  const lanes = rawLanes.map((entry) => normalizeManifestLane(entry));

  // Lane ids double as lookup keys later on, so they must be unique.
  const seenIds = new Set<string>();
  lanes.forEach(({ id }) => {
    if (seenIds.has(id)) {
      throw new Error(`confidence manifest duplicate lane id: ${id}`);
    }
    seenIds.add(id);
  });

  return { version: 1, profile, lanes };
}
|
|
|
|
/**
 * Loads and validates a confidence manifest from a JSON file.
 *
 * @param filePath - Location of the manifest file.
 * @returns The normalized manifest.
 * @throws Error wrapping the original failure (as `cause`) when the file
 *   cannot be read or parsed; validation errors from
 *   `normalizeQaConfidenceManifest` propagate unwrapped.
 */
export async function readQaConfidenceManifestFile(
  filePath: string,
): Promise<QaConfidenceManifest> {
  const loadPayload = async (): Promise<unknown> => {
    try {
      const text = await fs.readFile(filePath, "utf8");
      return JSON.parse(text) as unknown;
    } catch (error) {
      throw new Error(
        `Could not read confidence manifest at ${filePath}: ${formatErrorMessage(error)}`,
        { cause: error },
      );
    }
  };
  return normalizeQaConfidenceManifest(await loadPayload());
}
|
|
|
|
/**
 * Resolves a lane's artifact location: absolute paths are returned untouched,
 * relative ones are resolved against `artifactRoot`.
 */
function resolveArtifactPath(artifactRoot: string, artifact: string): string {
  if (path.isAbsolute(artifact)) {
    return artifact;
  }
  return path.resolve(artifactRoot, artifact);
}
|
|
|
|
/**
 * Reads a UTF-8 file and parses it as JSON.
 *
 * Rejects with the underlying error when the file cannot be read or is not
 * valid JSON; the parsed value is returned as `unknown` for the caller to
 * validate.
 */
async function readJsonFile(filePath: string): Promise<unknown> {
  const text = await fs.readFile(filePath, "utf8");
  const parsed: unknown = JSON.parse(text);
  return parsed;
}
|
|
|
|
/** True when `error` is an object whose `code` property is `"ENOENT"`. */
function isMissingFileError(error: unknown): boolean {
  if (!isRecord(error)) {
    return false;
  }
  return error.code === "ENOENT";
}
|
|
|
|
/**
 * Maps a boolean outcome to lane status fields: a pass yields status and
 * verdict `pass`; anything else yields status `unknown` with no verdict.
 */
function statusFromPassed(passed: boolean): Pick<QaConfidenceLaneResult, "status" | "verdict"> {
  if (!passed) {
    return { status: "unknown" };
  }
  return { status: "pass", verdict: "pass" };
}
|
|
|
|
/**
 * Intermediate outcome returned by the per-kind artifact evaluators.
 *
 * When `passed` is false and neither `status` nor `verdict` is set, the lane's
 * manifest `failureVerdict` decides the final classification (see
 * `evaluatedFailureResult`).
 */
type QaConfidenceLaneEvaluation = {
  passed: boolean;
  details: string;
  /** Number of skipped rows, when the evaluator found any. */
  skippedCount?: number;
  /** Explicit status override for a non-passing evaluation. */
  status?: QaConfidenceLaneStatus;
  /** Explicit verdict override for a non-passing evaluation. */
  verdict?: QaConfidenceVerdict;
};
|
|
|
|
/**
 * Evaluates a `qa-suite-summary` artifact.
 *
 * Decision order:
 *  1. Non-object payload -> `unknown`.
 *  2. Any gateway log sentinel found anywhere in the payload decides the
 *     outcome (blocked / fail / unknown) before counts are considered.
 *  3. If `counts.failed` is a finite number, the counts decide; they must
 *     agree with the number of `scenarios[]` rows whose status is "fail".
 *  4. Otherwise the `scenarios[]` rows alone decide.
 *
 * @param payload - Parsed JSON artifact.
 * @returns The evaluation, including `skippedCount` when skipped rows exist.
 */
function evaluateQaSuiteSummary(payload: unknown): QaConfidenceLaneEvaluation {
  if (!isRecord(payload)) {
    return {
      passed: false,
      status: "unknown",
      details: "qa-suite-summary payload was not an object",
    };
  }
  const counts = isRecord(payload.counts) ? payload.counts : undefined;
  const totalCount = readNumber(counts?.total);
  const passedCount = readNumber(counts?.passed);
  const failedCount = readNumber(counts?.failed);
  const scenarios = Array.isArray(payload.scenarios) ? payload.scenarios : undefined;
  // Only rows with status exactly "fail" count as failed scenarios.
  const failedScenarios = scenarios?.filter(
    (scenario) => isRecord(scenario) && scenario.status === "fail",
  );
  // Skipped rows are recognised under either spelling.
  const skippedScenarioCount =
    scenarios?.filter(
      (scenario) =>
        isRecord(scenario) && (scenario.status === "skip" || scenario.status === "skipped"),
    ).length ?? 0;
  const hasScenarioRows = scenarios !== undefined && scenarios.length > 0;
  const gatewayLogSentinels = collectGatewayLogSentinels(payload);
  if (gatewayLogSentinels.length > 0) {
    const allEnvironmentBlocked = gatewayLogSentinels.every(
      (finding) => finding.verdict === "environment-blocked",
    );
    const suiteHasFailures =
      (failedCount !== undefined && failedCount > 0) || (failedScenarios?.length ?? 0) > 0;
    // Environment-only sentinels combined with reported failures are left
    // unclassified rather than attributed to the environment.
    if (allEnvironmentBlocked && suiteHasFailures) {
      return {
        passed: false,
        status: "unknown",
        details: `gateway log sentinel(s): ${formatGatewayLogSentinelSummary(
          gatewayLogSentinels,
        )}; suite also reports failures`,
      };
    }
    // Prefer the first sentinel that is not an environment block for the verdict.
    const firstBlockingSentinel =
      gatewayLogSentinels.find((finding) => finding.verdict !== "environment-blocked") ??
      gatewayLogSentinels[0];
    return {
      passed: false,
      status: allEnvironmentBlocked ? "blocked" : "fail",
      verdict: allEnvironmentBlocked
        ? "environment-blocked"
        : (firstBlockingSentinel?.verdict ?? "product-bug"),
      details: `gateway log sentinel(s): ${formatGatewayLogSentinelSummary(gatewayLogSentinels)}`,
    };
  }
  if (failedCount !== undefined) {
    // Zero failures with no positive total and no scenario rows means nothing ran.
    if (failedCount === 0 && !(totalCount !== undefined && totalCount > 0) && !hasScenarioRows) {
      return {
        passed: false,
        status: "unknown",
        details: "qa-suite-summary has no executed scenarios",
      };
    }
    // When scenario rows are present they must agree with counts.failed.
    if (failedScenarios !== undefined && Math.floor(failedCount) !== failedScenarios.length) {
      return {
        passed: false,
        status: "unknown",
        details: `qa-suite-summary count/scenario mismatch: counts.failed=${Math.max(
          0,
          Math.floor(failedCount),
        )}, failed scenarios=${failedScenarios.length}`,
      };
    }
    const explicitSkippedCount = readNumber(counts?.skipped);
    // total - passed - failed, when both total and passed are available.
    const inferredSkippedCount =
      totalCount === undefined || passedCount === undefined
        ? undefined
        : Math.max(0, Math.floor(totalCount) - Math.floor(passedCount) - Math.floor(failedCount));
    // Use the largest of the explicit, inferred, and row-counted skip figures.
    const skippedCount = Math.max(
      0,
      ...[explicitSkippedCount, inferredSkippedCount, skippedScenarioCount].filter(
        (count): count is number => count !== undefined,
      ),
    );
    const shouldReportSkippedCount = explicitSkippedCount !== undefined || skippedCount > 0;
    const skippedDetails = shouldReportSkippedCount
      ? ` counts.skipped=${Math.max(0, Math.floor(skippedCount))}`
      : "";
    const totalDetails =
      totalCount === undefined ? "" : ` counts.total=${Math.max(0, Math.floor(totalCount))}`;
    return {
      passed: failedCount === 0,
      details: `qa-suite-summary counts.failed=${Math.max(0, Math.floor(failedCount))}${totalDetails}${skippedDetails}`,
      ...(skippedCount === 0 ? {} : { skippedCount: Math.max(0, Math.floor(skippedCount)) }),
    };
  }
  // No usable counts.failed: fall back to the scenario rows.
  if (!Array.isArray(payload.scenarios)) {
    return {
      passed: false,
      status: "unknown",
      details: "qa-suite-summary missing counts.failed and scenarios[]",
    };
  }
  if (payload.scenarios.length === 0) {
    return {
      passed: false,
      status: "unknown",
      details: "qa-suite-summary has no executed scenarios",
    };
  }
  const fallbackFailedScenarios = payload.scenarios.filter(
    (scenario) => isRecord(scenario) && scenario.status === "fail",
  );
  return {
    passed: fallbackFailedScenarios.length === 0,
    details: `qa-suite-summary failed scenarios=${fallbackFailedScenarios.length}`,
  };
}
|
|
|
|
/**
 * Evaluates a summary artifact that exposes a generic pass signal.
 *
 * The first signal present wins, in this order:
 *  1. boolean `pass`
 *  2. non-blank string `verdict` (passes only when it equals "pass")
 *  3. non-blank string `status` (pass/passed/success/succeeded pass;
 *     fail/failed/error fail; anything else is `unknown`)
 *
 * A payload that is not an object, or that exposes none of these signals, is
 * reported with status `unknown`: a malformed artifact carries no evidence of
 * a real failure, so it must not be classified with the lane's
 * `failureVerdict`.
 *
 * @param payload - Parsed JSON artifact.
 * @returns The evaluation; `status` is only set for unclassifiable payloads.
 */
function evaluatePassSummary(payload: unknown): QaConfidenceLaneEvaluation {
  if (!isRecord(payload)) {
    return {
      passed: false,
      // Same treatment as a summary without any pass signal.
      status: "unknown",
      details: "summary payload was not an object",
    };
  }
  const pass = readBoolean(payload.pass);
  if (pass !== undefined) {
    return { passed: pass, details: `summary pass=${String(pass)}` };
  }
  const verdict = readString(payload.verdict);
  if (verdict) {
    return { passed: verdict === "pass", details: `summary verdict=${verdict}` };
  }
  const status = readString(payload.status);
  if (status) {
    if (
      status === "pass" ||
      status === "passed" ||
      status === "success" ||
      status === "succeeded"
    ) {
      return { passed: true, details: `summary status=${status}` };
    }
    if (status === "fail" || status === "failed" || status === "error") {
      return { passed: false, details: `summary status=${status}` };
    }
    // Unrecognised status text is neither a pass nor a classified failure.
    return {
      passed: false,
      status: "unknown",
      details: `summary status=${status}`,
    };
  }
  return {
    passed: false,
    status: "unknown",
    details: "summary did not expose an explicit pass signal",
  };
}
|
|
|
|
/**
 * Evaluates a `token-efficiency-summary` artifact.
 *
 * The generic pass signal is checked first. Only when it passes and the lane
 * declares an expected usage source are the summary `rows` inspected: they
 * must exist, be non-empty, not belong to a "skipped" summary, and every row
 * must carry the expected `usageSource`.
 *
 * @param payload - Parsed JSON artifact.
 * @param expectedTokenUsageSource - Usage source each row must be labeled with, if any.
 */
function evaluateTokenEfficiencySummary(
  payload: unknown,
  expectedTokenUsageSource: QaConfidenceManifestLane["expectedTokenUsageSource"],
): QaConfidenceLaneEvaluation {
  const summary = evaluatePassSummary(payload);
  if (!(summary.passed && expectedTokenUsageSource)) {
    return summary;
  }

  const missingRows: QaConfidenceLaneEvaluation = {
    passed: false,
    details: `token summary missing rows for expected usageSource=${expectedTokenUsageSource}`,
  };
  if (!isRecord(payload)) {
    return missingRows;
  }
  const { rows, status } = payload;
  if (!Array.isArray(rows)) {
    return missingRows;
  }

  if (rows.length === 0 || readString(status) === "skipped") {
    return {
      passed: false,
      details: `token summary has no ${expectedTokenUsageSource} rows`,
    };
  }

  const mismatchedCount = rows.reduce<number>(
    (count, row) =>
      isRecord(row) && row.usageSource === expectedTokenUsageSource ? count : count + 1,
    0,
  );
  if (mismatchedCount === 0) {
    return {
      passed: true,
      details: `token summary rows all usageSource=${expectedTokenUsageSource}`,
    };
  }
  return {
    passed: false,
    details: `token summary has ${mismatchedCount} row(s) not labeled ${expectedTokenUsageSource}`,
  };
}
|
|
|
|
/**
 * Evaluates a `jsonl-replay-summary` artifact.
 *
 * Every transcript must be an object with a `drift` array whose length matches
 * `userTurnCount` when that count is present. A transcript counts as drifted
 * when it reports a `firstDriftAtTurn` or when any `drift` entry differs from
 * "none". The lane passes only when at least one user turn was replayed and
 * no transcript drifted.
 *
 * `firstDriftAtTurn` is treated as absent when it is either missing or `null`:
 * the payload is parsed JSON, which cannot represent `undefined`, so `null` is
 * a valid way for a producer to say "no drift" and must not be counted as one.
 *
 * @param payload - Parsed JSON artifact.
 * @returns The evaluation; structural problems are reported as `unknown`.
 */
function evaluateJsonlReplaySummary(payload: unknown): QaConfidenceLaneEvaluation {
  if (!isRecord(payload) || !Array.isArray(payload.transcripts)) {
    return {
      passed: false,
      status: "unknown",
      details: "jsonl replay summary missing transcripts array",
    };
  }
  if (payload.transcripts.length === 0) {
    return {
      passed: false,
      status: "unknown",
      details: "jsonl replay summary has no transcripts",
    };
  }
  let drifted = 0;
  let replayedUserTurns = 0;
  for (const transcript of payload.transcripts) {
    if (!isRecord(transcript)) {
      return {
        passed: false,
        status: "unknown",
        details: "jsonl replay summary has an invalid transcript row",
      };
    }
    const userTurnCount = readNumber(transcript.userTurnCount);
    if (userTurnCount !== undefined && userTurnCount > 0) {
      replayedUserTurns += userTurnCount;
    }
    // JSON null means "no first drift", exactly like an omitted key.
    const hasFirstDrift =
      transcript.firstDriftAtTurn !== undefined && transcript.firstDriftAtTurn !== null;
    if (!Array.isArray(transcript.drift)) {
      return {
        passed: false,
        status: "unknown",
        details: "jsonl replay transcript missing drift array",
      };
    }
    if (userTurnCount !== undefined && transcript.drift.length !== userTurnCount) {
      return {
        passed: false,
        status: "unknown",
        details: "jsonl replay transcript drift count does not match userTurnCount",
      };
    }
    const hasDrift = transcript.drift.some((entry) => entry !== "none");
    if (hasFirstDrift || hasDrift) {
      drifted += 1;
    }
  }
  // A summary that replayed nothing proves nothing.
  if (replayedUserTurns === 0) {
    return {
      passed: false,
      status: "unknown",
      details: "jsonl replay summary has no replayed user turns",
    };
  }
  return {
    passed: drifted === 0,
    details: `jsonl replay turns=${replayedUserTurns}, drifted transcripts=${drifted}`,
  };
}
|
|
|
|
/**
 * Evaluates a `self-test-summary` artifact.
 *
 * The summary must list every id in `QA_CONFIDENCE_SELF_TEST_CANARY_IDS`;
 * otherwise the result is `unknown`. It passes only when each expected canary
 * has `detected === true` and the summary's own boolean `pass` (if present)
 * is not `false`.
 *
 * @param payload - Parsed JSON artifact.
 */
function evaluateSelfTestSummary(payload: unknown): QaConfidenceLaneEvaluation {
  const missingCanaries: QaConfidenceLaneEvaluation = {
    passed: false,
    status: "unknown",
    details: "confidence self-test summary missing canaries array",
  };
  if (!isRecord(payload)) {
    return missingCanaries;
  }
  const { canaries, pass: declaredPass } = payload;
  if (!Array.isArray(canaries)) {
    return missingCanaries;
  }
  if (canaries.length === 0) {
    return {
      passed: false,
      status: "unknown",
      details: "confidence self-test summary has no canaries",
    };
  }

  // Index object entries by id; a later duplicate replaces an earlier one.
  const canariesById = new Map<string | undefined, Record<string, unknown>>();
  for (const canary of canaries) {
    if (isRecord(canary)) {
      canariesById.set(readString(canary.id), canary);
    }
  }

  const absentIds: string[] = [];
  let undetectedCount = 0;
  for (const canaryId of QA_CONFIDENCE_SELF_TEST_CANARY_IDS) {
    const canary = canariesById.get(canaryId);
    if (canary === undefined) {
      absentIds.push(canaryId);
    } else if (canary.detected !== true) {
      undetectedCount += 1;
    }
  }
  if (absentIds.length > 0) {
    return {
      passed: false,
      status: "unknown",
      details: `confidence self-test missing expected canaries: ${absentIds.join(", ")}`,
    };
  }

  const expectedTotal = QA_CONFIDENCE_SELF_TEST_CANARY_IDS.length;
  const allDetected = undetectedCount === 0;
  const declared = readBoolean(declaredPass) ?? allDetected;
  return {
    passed: declared && allDetected,
    details: `confidence self-test detected=${expectedTotal - undetectedCount}/${expectedTotal}`,
  };
}
|
|
|
|
/**
 * Dispatches a parsed artifact to the evaluator that matches the lane kind.
 *
 * Runtime-parity, harness-parity, and generic summaries share the generic
 * pass evaluator. A kind outside the known set yields a non-passing
 * evaluation instead of throwing.
 */
function evaluateLaneArtifact(
  lane: QaConfidenceManifestLane,
  payload: unknown,
): QaConfidenceLaneEvaluation {
  // Widened so an unexpected runtime value can still be reported below.
  const kind: string | undefined = lane.kind;
  if (kind === "qa-suite-summary") {
    return evaluateQaSuiteSummary(payload);
  }
  if (
    kind === "runtime-parity-summary" ||
    kind === "harness-parity-summary" ||
    kind === "generic-pass-summary"
  ) {
    return evaluatePassSummary(payload);
  }
  if (kind === "token-efficiency-summary") {
    return evaluateTokenEfficiencySummary(payload, lane.expectedTokenUsageSource);
  }
  if (kind === "jsonl-replay-summary") {
    return evaluateJsonlReplaySummary(payload);
  }
  if (kind === "self-test-summary") {
    return evaluateSelfTestSummary(payload);
  }
  return {
    passed: false,
    details: `unknown confidence lane kind: ${kind ?? "missing"}`,
  };
}
|
|
|
|
/**
 * Builds the result for a lane whose artifact file does not exist.
 *
 * With a configured `missingVerdict` the lane is `blocked`
 * (environment-blocked) or `fail` (any other verdict) and carries that
 * verdict; without one it is reported as `missing` with no verdict.
 */
function resultForMissingLane(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
): QaConfidenceLaneResult {
  const shared = baseLaneResult(lane, artifactPath);
  const { missingVerdict } = lane;
  if (!missingVerdict) {
    return {
      ...shared,
      status: "missing",
      details: "artifact missing and no missingVerdict was configured",
    };
  }
  const status: QaConfidenceLaneStatus =
    missingVerdict === "environment-blocked" ? "blocked" : "fail";
  return {
    ...shared,
    status,
    verdict: missingVerdict,
    details: lane.missingReason ?? "artifact missing with explicit missing verdict",
  };
}
|
|
|
|
/**
 * Produces the lane result fields that do not depend on the evaluation
 * outcome, copying identity and annotation fields from the manifest lane.
 *
 * The reported `artifactPath` is the manifest's own relative path, or just
 * the file name when the manifest path was absolute. Optional annotations are
 * only included when they are truthy on the lane.
 */
function baseLaneResult(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
): Omit<QaConfidenceLaneResult, "status" | "details"> {
  let reportedPath = lane.artifact;
  if (path.isAbsolute(lane.artifact)) {
    reportedPath = path.basename(artifactPath);
  }

  const result: Omit<QaConfidenceLaneResult, "status" | "details"> = {
    id: lane.id,
    title: lane.title,
    kind: lane.kind,
    artifact: lane.artifact,
    artifactPath: reportedPath,
    required: lane.required,
  };
  // Optional fields are appended in a fixed order to keep key order stable.
  if (lane.productImpact) {
    result.productImpact = lane.productImpact;
  }
  if (lane.qaImpact) {
    result.qaImpact = lane.qaImpact;
  }
  if (lane.issue) {
    result.issue = lane.issue;
  }
  if (lane.ownerAction) {
    result.ownerAction = lane.ownerAction;
  }
  if (lane.labels) {
    result.labels = lane.labels;
  }
  if (lane.skipBackfillLane) {
    result.skipBackfillLane = lane.skipBackfillLane;
  }
  return result;
}
|
|
|
|
/**
 * Builds the result for a failed lane using the manifest's classification:
 * status `fail` with the lane's `failureVerdict` when one is configured,
 * otherwise status `unknown` without a verdict.
 */
function classifiedFailureResult(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
  details: string,
): QaConfidenceLaneResult {
  const shared = baseLaneResult(lane, artifactPath);
  const { failureVerdict } = lane;
  return failureVerdict
    ? { ...shared, status: "fail", verdict: failureVerdict, details }
    : { ...shared, status: "unknown", details };
}
|
|
|
|
/**
 * Converts a non-passing evaluation into a lane result.
 *
 * When the evaluator supplied a status or verdict, those take precedence
 * (status defaults to `fail` if only a verdict was given). Otherwise the
 * lane's manifest classification is applied via `classifiedFailureResult`.
 */
function evaluatedFailureResult(
  lane: QaConfidenceManifestLane,
  artifactPath: string,
  evaluated: QaConfidenceLaneEvaluation,
): QaConfidenceLaneResult {
  const { status, verdict, details } = evaluated;
  if (!status && !verdict) {
    return classifiedFailureResult(lane, artifactPath, details);
  }
  const verdictField = verdict ? { verdict } : {};
  return {
    ...baseLaneResult(lane, artifactPath),
    status: status ?? "fail",
    ...verdictField,
    details,
  };
}
|
|
|
|
/**
 * Evaluates one manifest lane: resolves and reads its JSON artifact, runs the
 * kind-specific evaluator, and assembles the lane result.
 *
 * A missing file is handled by `resultForMissingLane`; any other read or
 * parse failure yields status `unknown`. `skippedCount` is carried over from
 * the evaluation for both passing and failing lanes.
 *
 * @param lane - Normalized manifest lane.
 * @param artifactRoot - Directory that relative artifact paths resolve against.
 */
async function evaluateLane(
  lane: QaConfidenceManifestLane,
  artifactRoot: string,
): Promise<QaConfidenceLaneResult> {
  const artifactPath = resolveArtifactPath(artifactRoot, lane.artifact);

  let payload: unknown;
  try {
    payload = await readJsonFile(artifactPath);
  } catch (error) {
    if (isMissingFileError(error)) {
      return resultForMissingLane(lane, artifactPath);
    }
    return {
      ...baseLaneResult(lane, artifactPath),
      status: "unknown",
      details: `artifact unreadable: ${formatErrorMessage(error)}`,
    };
  }

  const evaluation = evaluateLaneArtifact(lane, payload);
  const skippedField =
    evaluation.skippedCount === undefined ? {} : { skippedCount: evaluation.skippedCount };

  if (evaluation.passed) {
    return {
      ...baseLaneResult(lane, artifactPath),
      ...statusFromPassed(true),
      details: evaluation.details,
      ...skippedField,
    };
  }
  return {
    ...evaluatedFailureResult(lane, artifactPath, evaluation),
    ...skippedField,
  };
}
|
|
|
|
/**
 * Annotates lanes that skipped rows with whether their backfill lane passed.
 *
 * A lane is annotated only when it has a positive `skippedCount` and names a
 * `skipBackfillLane`; every other lane is returned untouched (same object).
 * Annotated lanes get `skipBackfilled` set to whether the named lane exists in
 * `lanes` with status `"pass"`, and a note appended to `details`.
 *
 * @param lanes - Evaluated lane results; not mutated.
 * @returns A new array in the same order as `lanes`.
 */
function applySkipBackfillState(
  lanes: readonly QaConfidenceLaneResult[],
): QaConfidenceLaneResult[] {
  const lanesById = new Map<QaConfidenceLaneResult["id"], QaConfidenceLaneResult>();
  for (const lane of lanes) {
    lanesById.set(lane.id, lane);
  }

  const annotated: QaConfidenceLaneResult[] = [];
  for (const lane of lanes) {
    const backfillId = lane.skipBackfillLane;
    const hasSkippedRows = (lane.skippedCount ?? 0) > 0;
    if (!hasSkippedRows || !backfillId) {
      annotated.push(lane);
      continue;
    }

    const skipBackfilled = lanesById.get(backfillId)?.status === "pass";
    const answer = skipBackfilled ? "yes" : "no";
    annotated.push({
      ...lane,
      skipBackfilled,
      details: `${lane.details}; skipped rows backfilled by ${backfillId}: ${answer}`,
    });
  }
  return annotated;
}
|
|
|
|
/**
 * Tallies lane results by status.
 *
 * `missing` lanes are counted twice on purpose: once under `missing` and once
 * under `unknown`, so `unknown` is the number of lanes that are either
 * `"unknown"` or `"missing"`.
 *
 * @param lanes - Lane results to count.
 * @returns Totals for each status bucket plus the overall lane count.
 */
function countLaneResults(lanes: readonly QaConfidenceLaneResult[]): QaConfidenceReport["counts"] {
  const tally = {
    total: lanes.length,
    passed: 0,
    failed: 0,
    blocked: 0,
    missing: 0,
    unknown: 0,
  };
  for (const { status } of lanes) {
    switch (status) {
      case "pass":
        tally.passed += 1;
        break;
      case "fail":
        tally.failed += 1;
        break;
      case "blocked":
        tally.blocked += 1;
        break;
      case "missing":
        tally.missing += 1;
        tally.unknown += 1;
        break;
      case "unknown":
        tally.unknown += 1;
        break;
      default:
        break;
    }
  }
  return tally;
}
|
|
|
|
/**
 * Lists a failure message for every lane whose status is `"unknown"` or
 * `"missing"`; both are reported as "unclassified".
 *
 * @param lanes - Lane results to inspect.
 * @returns One message per unclassified lane, in input order.
 */
function failuresForLaneResults(lanes: readonly QaConfidenceLaneResult[]): string[] {
  const messages: string[] = [];
  for (const lane of lanes) {
    const unclassified = lane.status === "unknown" || lane.status === "missing";
    if (unclassified) {
      messages.push(`${lane.id} is unclassified: ${lane.details}`);
    }
  }
  return messages;
}
|
|
|
|
/**
 * Lists every reason the given lanes fail the global gate.
 *
 * Each lane contributes at most one message: blocked, missing, unknown, and
 * failed lanes are reported by status; any other lane is reported only when it
 * skipped rows and `skipBackfilled` is not `true`.
 *
 * @param lanes - Lane results to inspect.
 * @returns Failure messages in input order.
 */
function globalFailuresForLaneResults(lanes: readonly QaConfidenceLaneResult[]): string[] {
  const messages: string[] = [];
  for (const lane of lanes) {
    const { id, details } = lane;
    switch (lane.status) {
      case "blocked":
        messages.push(`${id} is blocked: ${details}`);
        break;
      case "missing":
        messages.push(`${id} is missing: ${details}`);
        break;
      case "unknown":
        messages.push(`${id} is unclassified: ${details}`);
        break;
      case "fail":
        messages.push(`${id} is classified ${lane.verdict ?? "unclassified"}: ${details}`);
        break;
      default: {
        // Remaining statuses only fail the gate through un-backfilled skips.
        const skippedRows = lane.skippedCount ?? 0;
        if (skippedRows > 0 && lane.skipBackfilled !== true) {
          messages.push(
            `${id} has ${lane.skippedCount} skipped row(s) with no passing backfill lane`,
          );
        }
        break;
      }
    }
  }
  return messages;
}
|
|
|
|
/**
 * Evaluates every manifest lane against its artifact and assembles the
 * confidence report.
 *
 * Lanes are evaluated one at a time in manifest order and then annotated with
 * skip-backfill state. Counts, failure lists, and the gate booleans are derived
 * from required lanes only, while `lanes` in the report contains every lane.
 *
 * The overall `pass` is chosen by mode: `globalPass` under `strictGlobalPass`,
 * otherwise `zeroUnknowns` under `strictZeroUnknowns`, otherwise "no
 * unclassified failures". `failures` lists global gate failures under
 * `strictGlobalPass` and unclassified failures otherwise.
 *
 * @param params.manifest - Manifest supplying the profile and lanes.
 * @param params.artifactRoot - Root directory used to resolve lane artifacts.
 * @param params.strictZeroUnknowns - Only an explicit `true` enables the mode.
 * @param params.strictGlobalPass - Only an explicit `true` enables the mode.
 * @param params.generatedAt - Timestamp to record; defaults to the current time.
 * @returns The assembled report.
 */
export async function buildQaConfidenceReport(params: {
  manifest: QaConfidenceManifest;
  artifactRoot: string;
  strictZeroUnknowns?: boolean;
  strictGlobalPass?: boolean;
  generatedAt?: string;
}): Promise<QaConfidenceReport> {
  const evaluated: QaConfidenceLaneResult[] = [];
  for (const manifestLane of params.manifest.lanes) {
    const laneResult = await evaluateLane(manifestLane, params.artifactRoot);
    evaluated.push(laneResult);
  }

  const lanes = applySkipBackfillState(evaluated);
  const gatedLanes = lanes.filter(({ required }) => required);

  const counts = countLaneResults(gatedLanes);
  const unclassifiedFailures = failuresForLaneResults(gatedLanes);
  const globalFailures = globalFailuresForLaneResults(gatedLanes);

  const zeroUnknowns = counts.unknown === 0;
  const globalPass = zeroUnknowns && globalFailures.length === 0;
  const strictZeroUnknowns = params.strictZeroUnknowns === true;
  const strictGlobalPass = params.strictGlobalPass === true;

  let pass: boolean;
  if (strictGlobalPass) {
    pass = globalPass;
  } else if (strictZeroUnknowns) {
    pass = zeroUnknowns;
  } else {
    pass = unclassifiedFailures.length === 0;
  }

  let failures = unclassifiedFailures;
  if (strictGlobalPass) {
    failures = globalFailures;
  }

  return {
    generatedAt: params.generatedAt ?? new Date().toISOString(),
    profile: params.manifest.profile,
    strictZeroUnknowns,
    strictGlobalPass,
    pass,
    zeroUnknowns,
    globalPass,
    counts,
    failures,
    lanes,
  };
}
|
|
|
|
/**
 * Returns the lane's verdict for display, or `"unclassified"` when the lane
 * has no verdict.
 */
function formatVerdict(lane: QaConfidenceLaneResult): string {
  const { verdict } = lane;
  if (verdict == null) {
    return "unclassified";
  }
  return verdict;
}
|
|
|
|
/**
 * Makes free text safe to place in a single Markdown table cell.
 *
 * - Every `|` is escaped as `\|` so it cannot end the cell.
 * - Backslashes directly before a pipe are doubled first. Without that, input
 *   such as `a\|b` would become `a\\|b`, where the backslash pair reads as an
 *   escaped backslash and the pipe splits the cell after all.
 * - Runs of whitespace (including newlines) collapse to one space and the
 *   result is trimmed, keeping the cell on one row.
 *
 * @param value - Raw cell text.
 * @returns Text that renders as one table cell.
 */
function escapeTableCell(value: string): string {
  return value
    // `$1$1` doubles the captured backslash run, then `\|` escapes the pipe.
    .replace(/(\\*)\|/gu, "$1$1\\|")
    .replace(/\s+/gu, " ")
    .trim();
}
|
|
|
|
/**
 * Renders a confidence report as Markdown: a summary bullet list, one table
 * row per lane, and a failures section when the report has any failures.
 *
 * The lane id, product impact, QA impact, and details cells are passed through
 * `escapeTableCell`, so pipes or line breaks in them cannot break the table
 * layout. The failures heading depends on `report.strictGlobalPass`.
 *
 * @param report - Report to render.
 * @returns Markdown text ending with a newline.
 */
export function renderQaConfidenceMarkdownReport(report: QaConfidenceReport): string {
  const lines = [
    `# OpenClaw QA Confidence Report - ${report.profile}`,
    "",
    `- Generated at: ${report.generatedAt}`,
    `- Verdict: ${report.pass ? "pass" : "fail"}`,
    `- Strict zero unknowns: ${report.strictZeroUnknowns ? "yes" : "no"}`,
    `- Strict global pass: ${report.strictGlobalPass ? "yes" : "no"}`,
    `- Zero unknowns: ${report.zeroUnknowns ? "yes" : "no"}`,
    `- Global pass: ${report.globalPass ? "yes" : "no"}`,
    `- Counts: ${report.counts.passed} pass, ${report.counts.failed} classified fail, ${report.counts.blocked} blocked, ${report.counts.unknown} unknown`,
    "",
    "| Lane | Status | Verdict | Product impact | QA impact | Details |",
    "| --- | --- | --- | --- | --- | --- |",
  ];

  for (const lane of report.lanes) {
    const cells = [
      escapeTableCell(lane.id),
      lane.status,
      formatVerdict(lane),
      // Impact fields are escaped like the other text cells so a stray `|`
      // or newline cannot split the row.
      escapeTableCell(lane.productImpact ?? ""),
      escapeTableCell(lane.qaImpact ?? ""),
      escapeTableCell(lane.details),
    ];
    lines.push(`| ${cells.join(" | ")} |`);
  }

  if (report.failures.length > 0) {
    lines.push(
      "",
      report.strictGlobalPass ? "## Global Gate Failures" : "## Unclassified Failures",
      "",
    );
    for (const failure of report.failures) {
      lines.push(`- ${failure}`);
    }
  }

  return `${lines.join("\n")}\n`;
}
|
|
|
|
/**
 * Creates a synthetic runtime cell for self-test canaries.
 *
 * The defaults describe a trivial run: no tool calls, final text `"ok"`, and
 * small fixed token usage. Properties in `overrides` replace defaults
 * shallowly.
 *
 * @param runtime - Runtime identifier stored on the cell.
 * @param overrides - Fields that replace or extend the defaults.
 * @returns The synthetic cell.
 */
function syntheticRuntimeCell(
  runtime: RuntimeParityCell["runtime"],
  overrides: Partial<HarnessRuntimeParityCell> = {},
): HarnessRuntimeParityCell {
  const assistantTurn = { message: { role: "assistant", content: "ok" } };
  const defaults = {
    runtime,
    transcriptBytes: JSON.stringify(assistantTurn),
    toolCalls: [],
    finalText: "ok",
    usage: { inputTokens: 10, outputTokens: 5, totalTokens: 15 },
    wallClockMs: 10,
    bootStateLines: [],
  };
  return { ...defaults, ...overrides };
}
|
|
|
|
/**
 * Creates a synthetic tool call record with fixed tool name and hashes.
 *
 * @param overrides - Fields that replace the defaults.
 * @returns The synthetic tool call.
 */
function syntheticToolCall(overrides: Partial<RuntimeParityToolCall> = {}): RuntimeParityToolCall {
  const defaults = {
    tool: "openclaw.synthetic",
    argsHash: "args-a",
    resultHash: "result-a",
  };
  return { ...defaults, ...overrides };
}
|
|
|
|
/**
 * Runs a runtime parity scenario over two prepared cells and checks that the
 * reported drift equals the expected one.
 *
 * The scenario's `runCell` callback always reports a passing scenario and
 * returns the `openclaw` cell for the `"openclaw"` runtime and the `codex`
 * cell for any other runtime.
 *
 * @param params.scenarioId - Scenario identifier passed to the parity runner.
 * @param params.openclaw - Cell returned for the `"openclaw"` runtime.
 * @param params.codex - Cell returned for every other runtime.
 * @param params.expectedDrift - Drift the parity runner is expected to report.
 * @returns `true` when the reported drift matches `expectedDrift`.
 */
async function detectRuntimeDrift(params: {
  scenarioId: string;
  openclaw: RuntimeParityCell;
  codex: RuntimeParityCell;
  expectedDrift: RuntimeParityDrift;
}): Promise<boolean> {
  const { scenarioId, openclaw, codex, expectedDrift } = params;
  const { drift } = await runRuntimeParityScenario({
    scenarioId,
    runCell: async (runtime) => {
      const cell = runtime === "openclaw" ? openclaw : codex;
      return { scenarioStatus: "pass", cell };
    },
  });
  return drift === expectedDrift;
}
|
|
|
|
/**
 * Creates a synthetic system-prompt report for self-test canaries.
 *
 * Overrides are applied shallowly: supplying `systemPrompt`, `skills`, or
 * `tools` replaces that whole section rather than merging into it.
 *
 * @param overrides - Top-level sections that replace the defaults.
 * @returns The synthetic report.
 */
function syntheticPromptReport(
  overrides: Partial<RuntimeParitySystemPromptReport> = {},
): RuntimeParitySystemPromptReport {
  const systemPrompt = {
    chars: 100,
    projectContextChars: 10,
    nonProjectContextChars: 90,
    hash: "system-prompt-a",
  };
  const skills = {
    promptChars: 20,
    hash: "skills-a",
  };
  const syntheticTool = {
    name: "openclaw.synthetic",
    summaryChars: 12,
    summaryHash: "summary-a",
    schemaChars: 18,
    schemaHash: "schema-a",
    propertiesCount: 2,
  };
  const tools = {
    listChars: 30,
    schemaChars: 40,
    entries: [syntheticTool],
  };
  return { systemPrompt, skills, tools, ...overrides };
}
|
|
|
|
/**
 * Builds two harness parity cells from the given prompt reports and checks
 * that comparing them reports the expected drift.
 *
 * The left cell wraps a synthetic `"openclaw"` runtime cell and the right cell
 * a synthetic `"codex"` one; both use the `"mock-estimate"` token usage source.
 *
 * @param params.leftReport - Prompt report attached to the left cell.
 * @param params.rightReport - Prompt report attached to the right cell.
 * @param params.expectedDrift - Drift the comparison is expected to report.
 * @returns `true` when the reported drift matches `expectedDrift`.
 */
function detectHarnessDrift(params: {
  leftReport: RuntimeParitySystemPromptReport;
  rightReport: RuntimeParitySystemPromptReport;
  expectedDrift: HarnessParityDrift;
}): boolean {
  const { leftReport, rightReport, expectedDrift } = params;

  const left = buildHarnessParityCell({
    variant: { id: "left", label: "Left" },
    cell: syntheticRuntimeCell("openclaw", { systemPromptReport: leftReport }),
    tokenUsageSource: "mock-estimate",
  });
  const right = buildHarnessParityCell({
    variant: { id: "right", label: "Right" },
    cell: syntheticRuntimeCell("codex", { systemPromptReport: rightReport }),
    tokenUsageSource: "mock-estimate",
  });

  const { drift } = buildHarnessParityResult({
    scenarioId: "confidence-self-test",
    left,
    right,
  });
  return drift === expectedDrift;
}
|
|
|
|
/**
 * Canary check for the token-efficiency report.
 *
 * Feeds the report builder one passing scenario in which the synthetic codex
 * cell uses twice the tokens of the openclaw cell (240 vs 120 total), with a
 * 15 percent threshold.
 *
 * @returns `true` when the report fails with exactly one failure.
 */
function detectTokenEfficiencyRegression(): boolean {
  const scenarioName = "token-efficiency-regression";
  const tokenUsage = (inputTokens: number, outputTokens: number) => ({
    inputTokens,
    outputTokens,
    totalTokens: inputTokens + outputTokens,
  });

  const runtimeParity: RuntimeParityResult = {
    scenarioId: scenarioName,
    cells: {
      openclaw: syntheticRuntimeCell("openclaw", { usage: tokenUsage(100, 20) }),
      codex: syntheticRuntimeCell("codex", { usage: tokenUsage(200, 40) }),
    },
    drift: "none",
  };

  const report = buildTokenEfficiencyReport({
    summary: {
      run: {
        providerMode: "live-frontier",
        runtimePair: ["openclaw", "codex"],
      },
      scenarios: [
        {
          name: scenarioName,
          status: "pass",
          runtimeParity,
        },
      ],
    },
    thresholdPercent: 15,
    generatedAt: "2026-05-12T00:00:00.000Z",
  });

  if (report.pass) {
    return false;
  }
  return report.failures.length === 1;
}
|
|
|
|
/**
 * Canary check for JSONL replay evaluation.
 *
 * Evaluates a synthetic two-turn transcript whose second turn reports
 * `"tool-result-shape"` drift.
 *
 * @returns `true` when the evaluation does not pass.
 */
function detectJsonlReplayDrift(): boolean {
  const { passed } = evaluateJsonlReplaySummary({
    transcripts: [
      {
        transcriptPath: "synthetic.jsonl",
        userTurnCount: 2,
        drift: ["none", "tool-result-shape"],
        firstDriftAtTurn: 2,
      },
    ],
  });
  return !passed;
}
|
|
|
|
/**
 * Runs every self-test canary and summarizes whether each one was detected.
 *
 * Each canary feeds a deliberately drifted synthetic input to one of the
 * detectors in this file. The summary passes only when every canary was
 * detected.
 *
 * @param generatedAt - Timestamp to record; defaults to the current time.
 * @returns The summary with one entry per canary.
 */
export async function buildQaConfidenceSelfTestSummary(
  generatedAt = new Date().toISOString(),
): Promise<QaConfidenceSelfTestSummary> {
  // Same tools section as the baseline report, with selectable entry hashes.
  const toolsWithHashes = (
    summaryHash: string,
    schemaHash: string,
  ): RuntimeParitySystemPromptReport["tools"] => ({
    listChars: 30,
    schemaChars: 40,
    entries: [
      {
        name: "openclaw.synthetic",
        summaryChars: 12,
        summaryHash,
        schemaChars: 18,
        schemaHash,
        propertiesCount: 2,
      },
    ],
  });

  const promptDriftDetected = detectHarnessDrift({
    leftReport: syntheticPromptReport(),
    rightReport: syntheticPromptReport({
      systemPrompt: {
        chars: 100,
        projectContextChars: 10,
        nonProjectContextChars: 90,
        hash: "system-prompt-b",
      },
    }),
    expectedDrift: "system-prompt",
  });
  const toolDescriptionDetected = detectHarnessDrift({
    leftReport: syntheticPromptReport(),
    rightReport: syntheticPromptReport({ tools: toolsWithHashes("summary-b", "schema-a") }),
    expectedDrift: "tool-description",
  });
  const toolSchemaDetected = detectHarnessDrift({
    leftReport: syntheticPromptReport(),
    rightReport: syntheticPromptReport({ tools: toolsWithHashes("summary-a", "schema-b") }),
    expectedDrift: "tool-schema",
  });

  const runtimeToolCallDropDetected = await detectRuntimeDrift({
    scenarioId: "runtime-tool-call-drop",
    openclaw: syntheticRuntimeCell("openclaw", { toolCalls: [syntheticToolCall()] }),
    codex: syntheticRuntimeCell("codex", { toolCalls: [] }),
    expectedDrift: "tool-call-shape",
  });
  const toolResultMismatchDetected = await detectRuntimeDrift({
    scenarioId: "tool-result-mismatch",
    openclaw: syntheticRuntimeCell("openclaw", { toolCalls: [syntheticToolCall()] }),
    codex: syntheticRuntimeCell("codex", {
      toolCalls: [syntheticToolCall({ resultHash: "result-b" })],
    }),
    expectedDrift: "tool-result-shape",
  });
  const failureModeDriftDetected = await detectRuntimeDrift({
    scenarioId: "failure-mode-drift",
    openclaw: syntheticRuntimeCell("openclaw"),
    codex: syntheticRuntimeCell("codex", { transportErrorClass: "synthetic-transport" }),
    expectedDrift: "failure-mode",
  });

  const tokenEfficiencyRegressionDetected = detectTokenEfficiencyRegression();
  const jsonlReplayDriftDetected = detectJsonlReplayDrift();

  const canaries: QaConfidenceSelfTestCanary[] = [
    {
      id: "prompt-drift",
      category: "prompt",
      detected: promptDriftDetected,
      expectedVerdict: "qa-harness-bug",
      details: "synthetic harness prompt hash changed",
    },
    {
      id: "tool-description-schema-drift",
      category: "tool-schema",
      // One canary covers both the description and the schema variants.
      detected: toolDescriptionDetected && toolSchemaDetected,
      expectedVerdict: "qa-harness-bug",
      details: "synthetic tool description/schema hash changed",
    },
    {
      id: "runtime-tool-call-drop",
      category: "tool-call",
      detected: runtimeToolCallDropDetected,
      expectedVerdict: "product-bug",
      details: "synthetic runtime transcript omitted a required tool call",
    },
    {
      id: "tool-result-mismatch",
      category: "tool-result",
      detected: toolResultMismatchDetected,
      expectedVerdict: "product-bug",
      details: "synthetic runtime transcript returned a mismatched tool result",
    },
    {
      id: "failure-mode-drift",
      category: "failure-mode",
      detected: failureModeDriftDetected,
      expectedVerdict: "product-bug",
      details: "synthetic runtime failed with a different failure mode",
    },
    {
      id: "token-efficiency-regression",
      category: "token-efficiency",
      detected: tokenEfficiencyRegressionDetected,
      expectedVerdict: "qa-harness-bug",
      details: "synthetic token row exceeded the configured efficiency threshold",
    },
    {
      id: "jsonl-replay-ordering-drift",
      category: "jsonl-replay",
      detected: jsonlReplayDriftDetected,
      expectedVerdict: "fixture-bug",
      details: "synthetic JSONL replay drifted after turn ordering changed",
    },
  ];

  const anyMissed = canaries.some((canary) => !canary.detected);
  return {
    generatedAt,
    pass: !anyMissed,
    canaries,
  };
}
|
|
|
|
/**
 * Renders the self-test summary as Markdown: a short header followed by a
 * table with one row per canary. Only the details cell is passed through
 * `escapeTableCell`.
 *
 * @param summary - Self-test summary to render.
 * @returns Markdown text ending with a newline.
 */
export function renderQaConfidenceSelfTestMarkdownReport(
  summary: QaConfidenceSelfTestSummary,
): string {
  const header = [
    "# OpenClaw QA Confidence Self-Test",
    "",
    `- Generated at: ${summary.generatedAt}`,
    `- Verdict: ${summary.pass ? "pass" : "fail"}`,
    "",
    "| Canary | Category | Detected | Expected verdict | Details |",
    "| --- | --- | --- | --- | --- |",
  ];
  const rows = summary.canaries.map((canary) => {
    const detected = canary.detected ? "yes" : "no";
    const details = escapeTableCell(canary.details);
    return `| ${canary.id} | ${canary.category} | ${detected} | ${canary.expectedVerdict} | ${details} |`;
  });
  return `${[...header, ...rows].join("\n")}\n`;
}
|
|
|
|
/**
 * Runs the self-test and writes its Markdown report and JSON summary into
 * `outputDir`, creating the directory (recursively) first.
 *
 * The report is written to `qa-confidence-self-test-report.md` and the summary
 * to `qa-confidence-self-test-summary.json` (two-space indented JSON with a
 * trailing newline).
 *
 * @param params.outputDir - Directory that receives both files.
 * @param params.generatedAt - Optional timestamp forwarded to the summary.
 * @returns Paths of both written files together with the summary.
 */
export async function writeQaConfidenceSelfTestArtifacts(params: {
  outputDir: string;
  generatedAt?: string;
}): Promise<{ reportPath: string; summaryPath: string; summary: QaConfidenceSelfTestSummary }> {
  const { outputDir, generatedAt } = params;
  const reportPath = path.join(outputDir, "qa-confidence-self-test-report.md");
  const summaryPath = path.join(outputDir, "qa-confidence-self-test-summary.json");

  await fs.mkdir(outputDir, { recursive: true });

  const summary = await buildQaConfidenceSelfTestSummary(generatedAt);
  const markdown = renderQaConfidenceSelfTestMarkdownReport(summary);
  const summaryJson = `${JSON.stringify(summary, null, 2)}\n`;

  await fs.writeFile(reportPath, markdown, "utf8");
  await fs.writeFile(summaryPath, summaryJson, "utf8");

  return { reportPath, summaryPath, summary };
}
|