canary(common): drain gateway/mock_llm stdout pipes (was deadlocking CI)

scripts/live_canary/common.py spawns the IronClaw gateway and the mock LLM with stdout=PIPE + stderr=STDOUT, reads one line of mock_llm output to discover its bound port, then never reads from either pipe again. On Linux the kernel pipe buffer caps at 64 KiB; once a sustained chat request fills it with `RUST_LOG=info` output, the child blocks on its next stdout write and the request handler freezes mid-response. That's why every auth-browser-consent CI run got stuck on "Thinking (step 1)..." for the full chat-wait budget while the same test passes locally — macOS pipe buffers are larger and the test completes before the buffer fills. Fix: spawn a daemon thread per subprocess that drains the pipe to a log file under the run's output_dir. Two wins: (1) the pipes never fill, so the child never blocks; (2) gateway.log and mock_llm.log become CI artifacts, so the next failure that doesn't have a clear runner-side error message is immediately debuggable from IronClaw's own logs. Verified locally that the lane still passes after the change and that both log files are produced. Locally each is < 10 KiB; CI runs may be larger but should stay well under any artifact size limit. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 16:24:32 +08:00 · 2026-04-27 14:29:28 -07:00
parent 98abeebe0d
commit f59981d362
2 changed files with 41 additions and 0 deletions
--- a/scripts/auth_live_canary/run_live_canary.py
+++ b/scripts/auth_live_canary/run_live_canary.py
@@ -1136,6 +1136,7 @@ async def async_main(args: argparse.Namespace) -> int:
        gateway_token_prefix=mode_cfg["gateway_token_prefix"],
        extra_gateway_env=extra_gateway_env,
        oauth_proxy=(args.mode == "seeded"),
+        log_dir=args.output_dir,
    )
    try:
        if args.mode == "seeded":
--- a/scripts/live_canary/common.py
+++ b/scripts/live_canary/common.py
@@ -11,6 +11,7 @@ import socket
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import uuid
 from dataclasses import asdict, dataclass, field
@@ -314,6 +315,30 @@ def build_gateway_env(
    return env


+def _drain_to_file(stream: Any, path: Path) -> threading.Thread:
+    """Drain a subprocess stdout/stderr stream to a file in a daemon thread.
+
+    Without this, ``subprocess.Popen(stdout=PIPE)`` deadlocks: the kernel
+    pipe buffer (64 KiB on Linux, varies on macOS) fills under sustained
+    log output and the child blocks on its next write. That manifests on
+    CI as IronClaw freezing mid-request.
+
+    Logging is best-effort, draining is not: if ``path`` cannot be opened
+    or written (missing directory, full disk, ...), the failure is
+    reported once on stderr and the stream is still read to EOF with its
+    output discarded, so a logging problem cannot bring the deadlock back.
+
+    Args:
+        stream: Iterable of text lines to consume, e.g. ``proc.stdout``.
+        path: Log file the lines are appended to (UTF-8).
+
+    Returns:
+        The already-started daemon thread; it finishes when ``stream``
+        reaches EOF.
+    """
+
+    def _drain() -> None:
+        try:
+            with path.open("a", encoding="utf-8", errors="replace") as fh:
+                for line in stream:
+                    fh.write(line)
+                    # Flush per line so the log is current even if the
+                    # run is killed before the file is closed.
+                    fh.flush()
+            return
+        except Exception as exc:  # noqa: BLE001 - the drainer must never raise
+            print(
+                f"[live_canary] log drain to {path} stopped: {exc!r}",
+                file=sys.stderr,
+            )
+        # The log is unusable, but the child must still never block on a
+        # full pipe: keep consuming its output and throw it away.
+        try:
+            for _ in stream:
+                pass
+        except Exception:  # noqa: BLE001 - stream closed/broken: nothing left to drain
+            pass
+
+    thread = threading.Thread(target=_drain, daemon=True)
+    thread.start()
+    return thread
+
+
 async def start_gateway_stack(
    *,
    venv_dir: Path,
@@ -323,6 +348,7 @@ async def start_gateway_stack(
    gateway_token_prefix: str,
    extra_gateway_env: dict[str, str] | None = None,
    oauth_proxy: bool = False,
+    log_dir: Path | None = None,
 ) -> GatewayStack:
    secrets_master_key = secrets_master_key or generate_secrets_master_key()
    python = venv_python(venv_dir)
@@ -358,6 +384,14 @@ async def start_gateway_stack(
        mock_llm_url = f"http://127.0.0.1:{match.group(1)}"
        await wait_for_ready(f"{mock_llm_url}/v1/models", timeout=30.0)

+        # Now that the port-discovery line has been consumed, drain the
+        # rest of mock_llm.py's stdout to a log file so the pipe never
+        # fills (64 KiB pipe buffers on Linux deadlock the child once
+        # full).
+        if log_dir is not None and mock_llm_proc.stdout is not None:
+            log_dir.mkdir(parents=True, exist_ok=True)
+            _drain_to_file(mock_llm_proc.stdout, log_dir / "mock_llm.log")
+
        if oauth_proxy:
            proxy_env = {
                "IRONCLAW_OAUTH_EXCHANGE_URL": mock_llm_url,
@@ -393,6 +427,12 @@ async def start_gateway_stack(
            bufsize=1,
            env=env,
        )
+        # Same deadlock guard as mock_llm above — drain ironclaw's
+        # stdout/stderr so a chatty `RUST_LOG=info` doesn't fill the pipe
+        # buffer and freeze the request handler mid-response.
+        if log_dir is not None and gateway_proc.stdout is not None:
+            log_dir.mkdir(parents=True, exist_ok=True)
+            _drain_to_file(gateway_proc.stdout, log_dir / "gateway.log")
        base_url = f"http://127.0.0.1:{gateway_port}"
        await wait_for_ready(f"{base_url}/api/health", timeout=60.0)
        return GatewayStack(