mirror of
https://github.com/nearai/ironclaw.git
synced 2026-05-19 16:24:32 +08:00
canary(common): drain gateway/mock_llm stdout pipes (was deadlocking CI)
scripts/live_canary/common.py spawns the IronClaw gateway and the mock LLM with stdout=PIPE + stderr=STDOUT, reads one line of mock_llm output to discover its bound port, then never reads from either pipe again. On Linux the kernel pipe buffer caps at 64 KiB; once a sustained chat request fills it with `RUST_LOG=info` output, the child blocks on its next stdout write and the request handler freezes mid-response. That's why every auth-browser-consent CI run got stuck on "Thinking (step 1)..." for the full chat-wait budget while the same test passes locally — macOS pipe buffers are larger and the test completes before the buffer fills.

Fix: spawn a daemon thread per subprocess that drains the pipe to a log file under the run's output_dir. Two wins:

- Pipes never fill, so the child never blocks.
- gateway.log and mock_llm.log become CI artifacts, so the next failure that doesn't have a clear runner-side error message is immediately debuggable from IronClaw's own logs.

Verified locally that the lane still passes after the change and that both log files are produced. Locally each is < 10 KiB; CI runs may be larger but well under any artifact size limit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1136,6 +1136,7 @@ async def async_main(args: argparse.Namespace) -> int:
|
||||
gateway_token_prefix=mode_cfg["gateway_token_prefix"],
|
||||
extra_gateway_env=extra_gateway_env,
|
||||
oauth_proxy=(args.mode == "seeded"),
|
||||
log_dir=args.output_dir,
|
||||
)
|
||||
try:
|
||||
if args.mode == "seeded":
|
||||
|
||||
@@ -11,6 +11,7 @@ import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import asdict, dataclass, field
|
||||
@@ -314,6 +315,30 @@ def build_gateway_env(
|
||||
return env
|
||||
|
||||
|
||||
def _drain_to_file(stream: Any, path: Path) -> threading.Thread:
|
||||
"""Drain a subprocess stdout/stderr stream to a file in a daemon thread.
|
||||
|
||||
Without this, ``subprocess.Popen(stdout=PIPE)`` deadlocks: the kernel
|
||||
pipe buffer (64 KiB on Linux, varies on macOS) fills under sustained
|
||||
log output and the child blocks on its next write. That manifests on
|
||||
CI as IronClaw freezing mid-request — locally the pipe fills more
|
||||
slowly so the symptom is masked. See PR #2978-ish (this fix).
|
||||
"""
|
||||
|
||||
def _drain() -> None:
|
||||
try:
|
||||
with path.open("a", encoding="utf-8", errors="replace") as fh:
|
||||
for line in stream:
|
||||
fh.write(line)
|
||||
fh.flush()
|
||||
except Exception: # noqa: BLE001
|
||||
pass
|
||||
|
||||
thread = threading.Thread(target=_drain, daemon=True)
|
||||
thread.start()
|
||||
return thread
|
||||
|
||||
|
||||
async def start_gateway_stack(
|
||||
*,
|
||||
venv_dir: Path,
|
||||
@@ -323,6 +348,7 @@ async def start_gateway_stack(
|
||||
gateway_token_prefix: str,
|
||||
extra_gateway_env: dict[str, str] | None = None,
|
||||
oauth_proxy: bool = False,
|
||||
log_dir: Path | None = None,
|
||||
) -> GatewayStack:
|
||||
secrets_master_key = secrets_master_key or generate_secrets_master_key()
|
||||
python = venv_python(venv_dir)
|
||||
@@ -358,6 +384,14 @@ async def start_gateway_stack(
|
||||
mock_llm_url = f"http://127.0.0.1:{match.group(1)}"
|
||||
await wait_for_ready(f"{mock_llm_url}/v1/models", timeout=30.0)
|
||||
|
||||
# Now that the port-discovery line has been consumed, drain the
|
||||
# rest of mock_llm.py's stdout to a log file so the pipe never
|
||||
# fills (64 KiB pipe buffers on Linux deadlock the child once
|
||||
# full).
|
||||
if log_dir is not None and mock_llm_proc.stdout is not None:
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
_drain_to_file(mock_llm_proc.stdout, log_dir / "mock_llm.log")
|
||||
|
||||
if oauth_proxy:
|
||||
proxy_env = {
|
||||
"IRONCLAW_OAUTH_EXCHANGE_URL": mock_llm_url,
|
||||
@@ -393,6 +427,12 @@ async def start_gateway_stack(
|
||||
bufsize=1,
|
||||
env=env,
|
||||
)
|
||||
# Same deadlock guard as mock_llm above — drain ironclaw's
|
||||
# stdout/stderr so a chatty `RUST_LOG=info` doesn't fill the pipe
|
||||
# buffer and freeze the request handler mid-response.
|
||||
if log_dir is not None and gateway_proc.stdout is not None:
|
||||
log_dir.mkdir(parents=True, exist_ok=True)
|
||||
_drain_to_file(gateway_proc.stdout, log_dir / "gateway.log")
|
||||
base_url = f"http://127.0.0.1:{gateway_port}"
|
||||
await wait_for_ready(f"{base_url}/api/health", timeout=60.0)
|
||||
return GatewayStack(
|
||||
|
||||
Reference in New Issue
Block a user