mirror of
https://github.com/openclaw/openclaw.git
synced 2026-06-03 05:34:41 +08:00
* feat(browser): add optional vision understanding to screenshot tool
* fix(browser): wrap vision output as external content, enforce maxBytes, forward auth profiles
* fix(browser): remove no-op scope/attachments config, drop profile pass-through lacking runtime support
* feat(media-understanding): add profile/preferredProfile to DescribeImageFileWithModelParams and forward to describeImage
* style(browser): add curly braces to satisfy eslint curly rule
* fix(browser): correct tools.browser.enabled help text to match actual behavior
* fix(browser): thread agentDir/workspaceDir from plugin tool context into browser vision
* refactor(browser): move vision config from tools.browser to browser.models
The browser plugin's vision configuration now lives on the top-level
`browser` config namespace (browser.models, browser.visionEnabled,
browser.visionPrompt, etc.) instead of `tools.browser`. This aligns
with the plugin's existing config location and avoids confusion between
tool-level and plugin-level settings.
- Remove tools.browser from ToolsSchema and ToolsConfig
- Add models/vision* fields to BrowserConfig and its zod schema
- Update getBrowserVisionConfig to read from cfg.browser
- Update schema help, labels, and quality test
- Update vision.test.ts to use new config shape
* docs(browser): add screenshot vision configuration section
Document the new browser.models config for automatic screenshot
description via vision models, enabling text-only main models to
reason about web page content.
* fix(browser): remove deliverable media markers from vision result, drop unused import
P1: Vision-success path no longer exposes the raw screenshot as
deliverable media (removes MEDIA: line and details.media.mediaUrl).
This prevents channel delivery from auto-sending sensitive page content
when the intended output is a text description.
P2: Remove unused ToolsMediaUnderstandingSchema import that would fail
noUnusedLocals typecheck.
* fix(browser): add command/args fields to browser models schema
The browser vision model schema uses .strict(), so CLI-type entries
with command/args were rejected by TypeScript. Add these fields to
align with MediaUnderstandingModelSchema.
* chore(browser): remove debug console.log statements
* fix(browser): harden screenshot vision result against MEDIA: directive injection and restore image sanitization on failure fallback
ClawSweeper #84247 review round 2:
P1 (security, high): neutralize line-start MEDIA: directives in vision descriptions
before wrapping with wrapExternalContent. The agent media extractor scans every
browser tool-result text block via splitMediaFromOutput which treats line-start
MEDIA: as a trusted local-media delivery directive, and browser is on the
trusted-media allowlist. Without neutralization, page or vision-provider output
containing 'MEDIA:/tmp/secret.png' could synthesize a channel-deliverable media
artifact from untrusted content. wrapExternalContent itself does not strip
line-start directives. Introduce neutralizeMediaDirectives in vision.ts that
prepends '[neutralized] ' to any line whose trimStart() begins with MEDIA:
(case-insensitive), defanging the parser anchor while keeping the original
text human-readable.
P2 (compatibility): pass resolveRuntimeImageSanitization() to imageResultFromFile
in the vision-failure catch fallback. The non-vision screenshot path already
forwards this option (d5cc0d53b7) so configured agents.defaults.imageMaxDimensionPx
takes effect. Without this fix, any provider timeout/error silently bypasses the
sanitization guard and returns a raw full-resolution screenshot.
Regression coverage:
- vision.test.ts: 6 unit cases for neutralizeMediaDirectives (no-op fast path,
mid-line MEDIA: untouched, line-start defanged, leading-whitespace defanged,
case-insensitive, multiple directives per blob).
- browser-tool.test.ts: 2 integration cases that drive the full screenshot
tool execute path:
- 'neutralizes MEDIA: directives in vision text and does not attach media'
asserts no line matches /^\s*MEDIA:/i in returned text, secret path text
is preserved verbatim, details.media is absent, and imageResultFromFile
is not called on the success path.
- 'preserves screenshot image sanitization on vision failure fallback'
mocks describeImageFileWithModel to reject and asserts the fallback
imageResultFromFile call receives imageSanitization: {maxDimensionPx:1600}
plus the 'browser screenshot vision failed' extraText.
* fix(browser): apply clawsweeper fallback media fix from PR #84247
* refactor: reuse media image understanding for browser screenshots
* refactor: use structured media delivery
* test: update music completion media instruction expectation
* fix: trim buffered reply directive padding
* test: refresh codex prompt snapshots for message media aliases
---------
Co-authored-by: scotthuang <scotthuang@tencent.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
336 lines
12 KiB
TypeScript
import fs from "node:fs";
import path from "node:path";
import { createTestPluginApi } from "openclaw/plugin-sdk/plugin-test-api";
import { afterEach, beforeAll, beforeEach, describe, expect, it, vi } from "vitest";
import {
  browserPluginNodeHostCommands,
  browserPluginReload,
  browserSecurityAuditCollectors,
  registerBrowserPlugin,
} from "./plugin-registration.js";
import type { OpenClawPluginApi } from "./runtime-api.js";
import setupPlugin from "./setup-api.js";

/** Type of the first argument accepted by `OpenClawPluginApi["registerAutoEnableProbe"]`. */
type BrowserAutoEnableProbe = Parameters<OpenClawPluginApi["registerAutoEnableProbe"]>[0];

/**
 * Mock implementations shared by the `vi.mock` factories in this file.
 *
 * Created through `vi.hoisted` so the object is already initialised when the
 * hoisted `vi.mock` factories reference it.
 */
const runtimeApiMocks = vi.hoisted(() => ({
  createBrowserPluginService: vi.fn(() => ({ id: "browser-control", start: vi.fn() })),
  createBrowserTool: vi.fn(() => ({
    name: "browser",
    description: "browser",
    parameters: { type: "object", properties: {} },
    execute: vi.fn(async () => ({ type: "json", value: { ok: true } })),
  })),
  collectBrowserSecurityAuditFindings: vi.fn(() => []),
  handleBrowserGatewayRequest: vi.fn(),
  registerBrowserCli: vi.fn(),
  runBrowserProxyCommand: vi.fn(async () => "ok"),
  stopBrowserControlService: vi.fn(async () => undefined),
}));

// Partially mock the runtime module: keep every real export, but swap the
// entry points the tests observe for the shared spies in `runtimeApiMocks`.
vi.mock("./register.runtime.js", async () => {
  const actual =
    await vi.importActual<typeof import("./register.runtime.js")>("./register.runtime.js");
  return {
    ...actual,
    collectBrowserSecurityAuditFindings: runtimeApiMocks.collectBrowserSecurityAuditFindings,
    createBrowserPluginService: runtimeApiMocks.createBrowserPluginService,
    createBrowserTool: runtimeApiMocks.createBrowserTool,
    handleBrowserGatewayRequest: runtimeApiMocks.handleBrowserGatewayRequest,
    runBrowserProxyCommand: runtimeApiMocks.runBrowserProxyCommand,
  };
});

// Fully replace the CLI module so the test can observe the lazy
// `registerBrowserCli` call through the shared spy.
vi.mock("./src/cli/browser-cli.js", () => ({
  registerBrowserCli: runtimeApiMocks.registerBrowserCli,
}));

// Fully replace the control-service module so `service.stop` resolves to the
// shared `stopBrowserControlService` spy.
vi.mock("./src/control-service.js", () => ({
  stopBrowserControlService: runtimeApiMocks.stopBrowserControlService,
}));

// Import the (mocked) runtime module once before any test runs.
// NOTE(review): presumably this warms the module cache so the plugin's lazy
// dynamic imports resolve quickly inside individual tests — confirm.
beforeAll(async () => {
  await import("./register.runtime.js");
});

// Clear recorded calls on every mock so `not.toHaveBeenCalled()` style
// assertions are not affected by earlier tests.
beforeEach(() => {
  vi.clearAllMocks();
});

// Undo any `vi.stubEnv` overrides made by a test.
afterEach(() => {
  vi.unstubAllEnvs();
});

/**
 * Builds a test plugin API for the browser plugin whose registration hooks are spies.
 *
 * @returns The API together with the `vi.fn()` spies passed as `registerCli`,
 *   `registerGatewayMethod`, `registerService` and `registerTool`, so a test can
 *   inspect what the plugin registered.
 */
function createApi() {
  const registerCli = vi.fn();
  const registerGatewayMethod = vi.fn();
  const registerService = vi.fn();
  const registerTool = vi.fn();
  const api = createTestPluginApi({
    id: "browser",
    name: "Browser",
    source: "test",
    config: {},
    runtime: {} as OpenClawPluginApi["runtime"],
    registerCli,
    registerGatewayMethod,
    registerService,
    registerTool,
  });
  return { api, registerCli, registerGatewayMethod, registerService, registerTool };
}

/**
 * Returns a single argument from one recorded call of a mock.
 *
 * @param mock - Object exposing `mock.calls`, one argument array per recorded call.
 * @param index - Which recorded call to read. Resolved with `Array.prototype.at`,
 *   so negative values count from the end. Defaults to the first call.
 * @param argIndex - Position of the argument within that call. Defaults to the
 *   first argument.
 * @returns The argument at `argIndex`, or `undefined` when the call recorded
 *   fewer arguments.
 * @throws Error when no call was recorded at `index`.
 */
function mockCallArg(mock: { mock: { calls: unknown[][] } }, index = 0, argIndex = 0): unknown {
  const call = mock.mock.calls.at(index);
  if (!call) {
    throw new Error(`expected mock call ${index}`);
  }
  return call[argIndex];
}

/**
 * Runs the setup plugin's `register` against a test API that captures every
 * auto-enable probe it registers, and returns the first captured probe.
 *
 * @returns The first probe passed to `registerAutoEnableProbe`.
 * @throws Error when the setup plugin registered no probe.
 */
function registerBrowserAutoEnableProbe(): BrowserAutoEnableProbe {
  const probes: BrowserAutoEnableProbe[] = [];
  setupPlugin.register(
    createTestPluginApi({
      registerAutoEnableProbe(probe) {
        probes.push(probe);
      },
    }),
  );
  const probe = probes[0];
  if (!probe) {
    throw new Error("expected browser setup plugin to register an auto-enable probe");
  }
  return probe;
}

// Registration-level tests for the browser plugin: static metadata, lazy
// loading of runtime modules, and the options forwarded to the browser tool.
describe("browser plugin", () => {
  it("exposes static browser metadata on the plugin definition", () => {
    expect(browserPluginReload).toEqual({ restartPrefixes: ["browser"] });
    expect(browserPluginNodeHostCommands).toHaveLength(1);
    expect(browserPluginNodeHostCommands[0]?.command).toBe("browser.proxy");
    expect(browserPluginNodeHostCommands[0]?.cap).toBe("browser");
    expect(typeof browserPluginNodeHostCommands[0]?.handle).toBe("function");
    expect(browserSecurityAuditCollectors).toHaveLength(1);
  });

  it("bundles the browser automation skill with the plugin", () => {
    // Reads the real manifest and skill file that sit next to this test.
    const manifest = JSON.parse(
      fs.readFileSync(path.join(__dirname, "openclaw.plugin.json"), "utf8"),
    ) as { skills?: string[] };
    const skillPath = path.join(__dirname, "skills", "browser-automation", "SKILL.md");

    expect(manifest.skills).toEqual(["./skills"]);
    expect(fs.readFileSync(skillPath, "utf8")).toContain("name: browser-automation");
  });

  it("keeps browser tool registration synchronous while loading runtime on execute", async () => {
    const { api, registerTool } = createApi();
    registerBrowserPlugin(api);

    const factory = mockCallArg(registerTool);
    if (typeof factory !== "function") {
      throw new Error("expected browser plugin to register a tool factory");
    }

    const tool = factory({
      sessionKey: "agent:main:webchat:direct:123",
      browser: {
        sandboxBridgeUrl: "http://127.0.0.1:9999",
        allowHostControl: true,
      },
    });
    if (!tool || Array.isArray(tool)) {
      throw new Error("expected browser plugin to return a single tool");
    }

    expect(tool.name).toBe("browser");
    // Building the tool must not touch the runtime factory; only execute does.
    expect(runtimeApiMocks.createBrowserTool).not.toHaveBeenCalled();
    await tool.execute("call-1", { action: "status" });
    expect(runtimeApiMocks.createBrowserTool).toHaveBeenCalledWith({
      sandboxBridgeUrl: "http://127.0.0.1:9999",
      allowHostControl: true,
      agentSessionKey: "agent:main:webchat:direct:123",
      mediaScope: {
        sessionKey: "agent:main:webchat:direct:123",
        chatType: "direct",
      },
    });
  });

  it("passes runtime context needed for screenshot image understanding", async () => {
    const { api, registerTool } = createApi();
    registerBrowserPlugin(api);

    const factory = mockCallArg(registerTool);
    if (typeof factory !== "function") {
      throw new Error("expected browser plugin to register a tool factory");
    }

    const tool = factory({
      sessionKey: "agent:main:webchat:direct:123",
      agentDir: "/tmp/agent",
      workspaceDir: "/tmp/workspace",
      activeModel: { provider: "openai", modelId: "gpt-5.5" },
      deliveryContext: { channel: "telegram" },
    });
    if (!tool || Array.isArray(tool)) {
      throw new Error("expected browser plugin to return a single tool");
    }

    await tool.execute("call-1", { action: "status" });
    // The context's `modelId` is expected to reach the tool as `model`, and
    // the delivery channel is expected inside `mediaScope`.
    expect(runtimeApiMocks.createBrowserTool).toHaveBeenCalledWith({
      agentSessionKey: "agent:main:webchat:direct:123",
      agentDir: "/tmp/agent",
      workspaceDir: "/tmp/workspace",
      activeModel: { provider: "openai", model: "gpt-5.5" },
      mediaScope: {
        sessionKey: "agent:main:webchat:direct:123",
        channel: "telegram",
        chatType: "direct",
      },
    });
  });

  it("derives group chat type for browser media scope", async () => {
    const { api, registerTool } = createApi();
    registerBrowserPlugin(api);

    const factory = mockCallArg(registerTool);
    if (typeof factory !== "function") {
      throw new Error("expected browser plugin to register a tool factory");
    }

    const tool = factory({
      sessionKey: "agent:main:telegram:group:chat-123",
      messageChannel: "telegram",
    });
    if (!tool || Array.isArray(tool)) {
      throw new Error("expected browser plugin to return a single tool");
    }

    await tool.execute("call-1", { action: "status" });
    expect(runtimeApiMocks.createBrowserTool).toHaveBeenCalledWith({
      agentSessionKey: "agent:main:telegram:group:chat-123",
      mediaScope: {
        sessionKey: "agent:main:telegram:group:chat-123",
        channel: "telegram",
        chatType: "group",
      },
    });
  });

  it("registers CLI descriptors and lazy-loads the lightweight browser CLI", async () => {
    const { api, registerCli } = createApi();
    registerBrowserPlugin(api);

    expect(registerCli).toHaveBeenCalledTimes(1);
    const registrar = mockCallArg(registerCli) as (params: { program: never }) => unknown;
    expect(typeof registrar).toBe("function");
    expect(mockCallArg(registerCli, 0, 1)).toEqual({
      commands: ["browser"],
      descriptors: [
        {
          name: "browser",
          description: "Manage OpenClaw's dedicated browser (Chrome/Chromium)",
          hasSubcommands: true,
        },
      ],
    });
    // Invoking the registrar forwards the program to the mocked CLI module.
    await registrar({ program: {} as never });
    expect(runtimeApiMocks.registerBrowserCli).toHaveBeenCalledWith({});
  });

  it("registers browser.request as an admin gateway method and lazy-loads handler", async () => {
    const { api, registerGatewayMethod } = createApi();
    registerBrowserPlugin(api);

    expect(registerGatewayMethod).toHaveBeenCalledTimes(1);
    expect(mockCallArg(registerGatewayMethod)).toBe("browser.request");
    const handler = mockCallArg(registerGatewayMethod, 0, 1) as (request: {
      method: string;
    }) => unknown;
    expect(typeof handler).toBe("function");
    expect(mockCallArg(registerGatewayMethod, 0, 2)).toEqual({
      scope: "operator.admin",
    });
    await handler({ method: "browser.request" });
    expect(runtimeApiMocks.handleBrowserGatewayRequest).toHaveBeenCalledWith({
      method: "browser.request",
    });
  });

  it("lazy-loads node host and audit runtime handlers", async () => {
    await expect(browserPluginNodeHostCommands[0]?.handle("{}")).resolves.toBe("ok");
    expect(runtimeApiMocks.runBrowserProxyCommand).toHaveBeenCalledWith("{}");

    await expect(browserSecurityAuditCollectors[0]?.({} as never)).resolves.toStrictEqual([]);
    expect(runtimeApiMocks.collectBrowserSecurityAuditFindings).toHaveBeenCalled();
  });

  it("registers a lazy browser control service", async () => {
    const { api, registerService } = createApi();
    registerBrowserPlugin(api);

    const service = mockCallArg(registerService) as {
      id: string;
      start: (...args: unknown[]) => unknown;
      stop: (...args: unknown[]) => unknown;
    };
    expect(service?.id).toBe("browser-control");
    expect(typeof service?.start).toBe("function");
    expect(typeof service?.stop).toBe("function");
    expect(runtimeApiMocks.createBrowserPluginService).not.toHaveBeenCalled();

    // With no eager flag stubbed, start() must not create the runtime service.
    await service.start({ config: {}, stateDir: "/tmp/openclaw", logger: { warn: vi.fn() } });
    expect(runtimeApiMocks.createBrowserPluginService).not.toHaveBeenCalled();

    await service.stop({ config: {}, stateDir: "/tmp/openclaw", logger: { warn: vi.fn() } });
    expect(runtimeApiMocks.stopBrowserControlService).toHaveBeenCalledOnce();
  });

  it("eager-loads the browser control service when explicitly requested", async () => {
    vi.stubEnv("OPENCLAW_EAGER_BROWSER_CONTROL_SERVER", "1");
    const { api, registerService } = createApi();
    registerBrowserPlugin(api);

    const service = mockCallArg(registerService) as {
      id: string;
      start: (...args: unknown[]) => unknown;
    };

    await service.start({ config: {}, stateDir: "/tmp/openclaw", logger: { warn: vi.fn() } });
    expect(runtimeApiMocks.createBrowserPluginService).toHaveBeenCalledOnce();
  });

  // Each of these env values is expected to leave the service lazy.
  for (const value of ["false", "", "disabled"]) {
    it(`keeps browser control service env value ${JSON.stringify(value)} lazy`, async () => {
      vi.stubEnv("OPENCLAW_EAGER_BROWSER_CONTROL_SERVER", value);
      const { api, registerService } = createApi();
      registerBrowserPlugin(api);

      const service = mockCallArg(registerService) as {
        id: string;
        start: (...args: unknown[]) => unknown;
      };

      await service.start({ config: {}, stateDir: "/tmp/openclaw", logger: { warn: vi.fn() } });
      expect(runtimeApiMocks.createBrowserPluginService).not.toHaveBeenCalled();
    });
  }

  it("declares setup auto-enable reasons for browser config surfaces", () => {
    const probe = registerBrowserAutoEnableProbe();

    expect(probe({ config: { browser: { defaultProfile: "openclaw" } }, env: {} })).toBe(
      "browser configured",
    );
    expect(probe({ config: { tools: { alsoAllow: ["browser"] } }, env: {} })).toBe(
      "browser tool referenced",
    );
    // An explicit `enabled: false` is expected to suppress auto-enable.
    expect(
      probe({ config: { browser: { defaultProfile: "openclaw", enabled: false } }, env: {} }),
    ).toBeNull();
  });
});