canary(common): drain gateway/mock_llm stdout pipes (was deadlocking CI)

scripts/live_canary/common.py spawns the IronClaw gateway and the
mock LLM with stdout=PIPE + stderr=STDOUT, reads one line of mock_llm
output to discover its bound port, then never reads from either pipe
again. On Linux the kernel pipe buffer defaults to 64 KiB; once a
sustained chat request fills it with `RUST_LOG=info` output, the
child blocks on its next stdout write and the request handler
freezes mid-response.

That's why every auth-browser-consent CI run got stuck on
"Thinking (step 1)..." for the full chat-wait budget while the same
test passes locally — macOS pipe buffers are larger and the test
completes before the buffer fills.

Fix: spawn a daemon thread per subprocess that drains the pipe to a
log file under the run's output_dir. Two wins:

- Pipes never fill, child never blocks.
- gateway.log and mock_llm.log become CI artifacts, so the next
  failure that doesn't have a clear runner-side error message is
  immediately debuggable from IronClaw's own logs.

Verified locally that the lane still passes after the change and
both log files are produced. Locally each is < 10 KiB; CI runs may
be larger but well under any artifact size limit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Nikolay Pismenkov
2026-04-27 14:29:28 -07:00
parent 98abeebe0d
commit f59981d362
2 changed files with 41 additions and 0 deletions

View File

@@ -1136,6 +1136,7 @@ async def async_main(args: argparse.Namespace) -> int:
gateway_token_prefix=mode_cfg["gateway_token_prefix"],
extra_gateway_env=extra_gateway_env,
oauth_proxy=(args.mode == "seeded"),
log_dir=args.output_dir,
)
try:
if args.mode == "seeded":

View File

@@ -11,6 +11,7 @@ import socket
import subprocess
import sys
import tempfile
import threading
import time
import uuid
from dataclasses import asdict, dataclass, field
@@ -314,6 +315,30 @@ def build_gateway_env(
return env
def _drain_to_file(stream: Any, path: Path) -> threading.Thread:
    """Drain a subprocess stdout/stderr stream to a file in a daemon thread.

    Without a reader, ``subprocess.Popen(stdout=PIPE)`` deadlocks: the kernel
    pipe buffer (64 KiB on Linux, varies on macOS) fills under sustained
    log output and the child blocks on its next write. On CI that shows up
    as IronClaw freezing mid-request, while locally the pipe fills more
    slowly so the symptom is masked.

    Args:
        stream: Iterable of text lines, typically ``proc.stdout`` of a
            subprocess opened in text mode.
        path: Log file to append to. It is opened in append mode as UTF-8
            with ``errors="replace"``.

    Returns:
        The already-started daemon thread performing the drain. Callers may
        ``join`` it, but are not required to.

    Logging is best-effort: if the file cannot be opened or written, the
    remaining output is read and discarded instead, because keeping the
    pipe empty matters more than keeping the log.
    """

    def _discard_rest() -> None:
        # Keep consuming so the child can never block on a full pipe, even
        # though the output can no longer be recorded.
        try:
            for _ in stream:
                pass
        except Exception:  # noqa: BLE001
            # The stream itself is broken or closed; there is nothing left
            # to read, so the child cannot be blocked on it either.
            pass

    def _drain() -> None:
        try:
            with path.open("a", encoding="utf-8", errors="replace") as fh:
                for line in stream:
                    fh.write(line)
                    # Flush per line so the log is complete up to the last
                    # line even if the run is killed abruptly.
                    fh.flush()
        except Exception:  # noqa: BLE001
            # Giving up here would stop reading the pipe and silently bring
            # the deadlock back; fall back to discarding the output.
            _discard_rest()

    thread = threading.Thread(
        target=_drain,
        name=f"drain-{path.name}",
        daemon=True,
    )
    thread.start()
    return thread
async def start_gateway_stack(
*,
venv_dir: Path,
@@ -323,6 +348,7 @@ async def start_gateway_stack(
gateway_token_prefix: str,
extra_gateway_env: dict[str, str] | None = None,
oauth_proxy: bool = False,
log_dir: Path | None = None,
) -> GatewayStack:
secrets_master_key = secrets_master_key or generate_secrets_master_key()
python = venv_python(venv_dir)
@@ -358,6 +384,14 @@ async def start_gateway_stack(
mock_llm_url = f"http://127.0.0.1:{match.group(1)}"
await wait_for_ready(f"{mock_llm_url}/v1/models", timeout=30.0)
# Now that the port-discovery line has been consumed, drain the
# rest of mock_llm.py's stdout to a log file so the pipe never
# fills (64 KiB pipe buffers on Linux deadlock the child once
# full).
if log_dir is not None and mock_llm_proc.stdout is not None:
log_dir.mkdir(parents=True, exist_ok=True)
_drain_to_file(mock_llm_proc.stdout, log_dir / "mock_llm.log")
if oauth_proxy:
proxy_env = {
"IRONCLAW_OAUTH_EXCHANGE_URL": mock_llm_url,
@@ -393,6 +427,12 @@ async def start_gateway_stack(
bufsize=1,
env=env,
)
# Same deadlock guard as mock_llm above — drain ironclaw's
# stdout/stderr so a chatty `RUST_LOG=info` doesn't fill the pipe
# buffer and freeze the request handler mid-response.
if log_dir is not None and gateway_proc.stdout is not None:
log_dir.mkdir(parents=True, exist_ok=True)
_drain_to_file(gateway_proc.stdout, log_dir / "gateway.log")
base_url = f"http://127.0.0.1:{gateway_port}"
await wait_for_ready(f"{base_url}/api/health", timeout=60.0)
return GatewayStack(