mirror of
https://github.com/anthropic-experimental/sandbox-runtime.git
synced 2026-05-07 06:01:25 +08:00
apply-seccomp now creates a nested userns and writes /proc/self/setgroups
and uid_map before applying the seccomp filter. That broke
enableWeakerNestedSandbox in two ways:
1. Without --proc, bwrap's --ro-bind / / leaves /proc read-only.
apply-seccomp's setgroups write dies with EROFS.
2. In unprivileged Docker (the flag's target), apply-seccomp's proc
remount fails the kernel domination check — Docker's /proc masks
are MNT_LOCKED in the less-privileged nested userns.
And the reason bwrap never got that far in Docker: bwrap only auto-adds
--unshare-user when EUID != 0. Docker's default is EUID=0 without
CAP_SYS_ADMIN; bwrap assumes it has caps, tries direct clone(NEWPID),
and EPERMs before apply-seccomp runs.
Changes:
- bwrap args for weak mode: --unshare-user (force userns even as
EUID=0) and --bind /proc /proc (restore rw /proc for setgroups)
- apply-seccomp: tolerate mount(/proc) EPERM. The nested userns is
the isolation boundary; the proc remount only hides outer PIDs
from `ls /proc`.
Fixes the two failing mandatory-deny-paths tests that exercise
enableWeakerNestedSandbox. No test changes required.
Bump version to 0.0.46.
292 lines
9.9 KiB
C
292 lines
9.9 KiB
C
/*
|
|
* apply-seccomp.c - Apply seccomp BPF filter in an isolated PID namespace
|
|
*
|
|
* Usage: apply-seccomp <filter.bpf> <command> [args...]
|
|
*
|
|
* This program reads a pre-compiled BPF filter from a file, isolates the
|
|
* target command in a nested user+PID+mount namespace so it cannot see or
|
|
* ptrace any process that lacks the filter, applies the filter with
|
|
* prctl(PR_SET_SECCOMP), and execs the command.
|
|
*
|
|
* Process layout inside the outer bwrap sandbox:
|
|
*
|
|
* bwrap init (PID 1) <- outer PID ns, no seccomp
|
|
* \_ bash / socat ... <- outer PID ns, no seccomp
|
|
* \_ apply-seccomp [outer] <- outer PID ns, waits for inner init
|
|
* ================================================= PID ns boundary
|
|
* \_ apply-seccomp [inner init] <- inner PID 1, PR_SET_DUMPABLE=0
|
|
* \_ user command <- inner PID 2, seccomp applied
|
|
*
|
|
* From the user command's point of view /proc contains only its own process
|
|
* tree. The bwrap init, bash wrapper, and socat helpers are not addressable,
|
|
* so they cannot be ptraced or patched via /proc/N/mem even on systems with
|
|
* kernel.yama.ptrace_scope=0. The inner init (PID 1) sets PR_SET_DUMPABLE=0
|
|
* so it cannot be ptraced either.
|
|
*
|
|
* Any failure to set up the nested namespaces aborts with a non-zero exit
|
|
* status; we never fall back to running the command without isolation.
|
|
*
|
|
* Compile: gcc -static -O2 -o apply-seccomp apply-seccomp.c
|
|
*/
|
|
|
|
#define _GNU_SOURCE
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stdarg.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include <fcntl.h>
|
|
#include <errno.h>
|
|
#include <sched.h>
|
|
#include <signal.h>
|
|
#include <sys/prctl.h>
|
|
#include <sys/wait.h>
|
|
#include <sys/mount.h>
|
|
#include <linux/seccomp.h>
|
|
#include <linux/filter.h>
|
|
|
|
#ifndef PR_SET_NO_NEW_PRIVS
|
|
#define PR_SET_NO_NEW_PRIVS 38
|
|
#endif
|
|
|
|
#ifndef PR_CAP_AMBIENT
|
|
#define PR_CAP_AMBIENT 47
|
|
#define PR_CAP_AMBIENT_CLEAR_ALL 4
|
|
#endif
|
|
|
|
#ifndef SECCOMP_MODE_FILTER
|
|
#define SECCOMP_MODE_FILTER 2
|
|
#endif
|
|
|
|
#define MAX_FILTER_SIZE 4096
|
|
|
|
/*
 * Report a fatal error and terminate the process.
 *
 * msg - prefix handed to perror(), which appends the text for the current
 *       errno.
 *
 * Leaves through _exit(1), so no atexit handlers run and no stdio buffers
 * are flushed. Never returns.
 */
static void die(const char *msg)
{
    perror(msg);
    _exit(1);
}
|
|
|
|
/*
 * Format a short string and write it to an already-existing file with a
 * single write() call.
 *
 * path - file to open write-only; it is neither created nor truncated.
 * fmt  - printf-style format string, followed by its arguments.
 *
 * Returns 0 if the complete formatted text was written. Returns -1 with
 * errno set otherwise:
 *   EOVERFLOW       formatting failed or the text does not fit the
 *                   256-byte local buffer (nothing is opened in that case);
 *   open()'s errno  the file could not be opened;
 *   write()'s errno the write failed;
 *   EIO             fewer bytes than requested were written.
 */
static int write_file(const char *path, const char *fmt, ...) {
    char text[256];
    va_list args;
    int text_len;
    int out_fd;

    va_start(args, fmt);
    text_len = vsnprintf(text, sizeof(text), fmt, args);
    va_end(args);

    if (text_len < 0 || (size_t)text_len >= sizeof(text)) {
        errno = EOVERFLOW;
        return -1;
    }

    out_fd = open(path, O_WRONLY);
    if (out_fd < 0) {
        return -1;
    }

    ssize_t written = write(out_fd, text, (size_t)text_len);
    /* close() may overwrite errno, so remember write()'s value first. */
    int write_errno = errno;
    close(out_fd);

    if (written == text_len) {
        return 0;
    }

    errno = (written < 0) ? write_errno : EIO;
    return -1;
}
|
|
|
|
/* PID the current process forwards signals to. Used by both the outer stub
 * (forwards to inner init) and the inner init (forwards to the worker).
 * PID 1 ignores signals it has no handler for, so the inner init MUST install
 * these or SIGTERM from the outside is silently dropped. */
static volatile pid_t forward_target = -1;

/*
 * Signal handler: relay `sig` to forward_target when one has been set
 * (values <= 0 mean "no target" and nothing is sent).
 *
 * errno is saved on entry and restored on exit. kill() may fail -- for
 * instance with ESRCH once the target has already exited -- and the handler
 * can run between a failing call in the interrupted code and that code's
 * inspection of errno, so leaking kill()'s errno would corrupt it.
 */
static void forward_signal(int sig) {
    int saved_errno = errno;
    /* Read the volatile once so the test and the kill() use the same PID. */
    pid_t target = forward_target;

    if (target > 0) {
        kill(target, sig);
    }

    errno = saved_errno;
}
|
|
|
|
static void install_forwarders(pid_t target) {
|
|
forward_target = target;
|
|
struct sigaction sa = { .sa_handler = forward_signal };
|
|
sigemptyset(&sa.sa_mask);
|
|
sigaction(SIGTERM, &sa, NULL);
|
|
sigaction(SIGINT, &sa, NULL);
|
|
sigaction(SIGHUP, &sa, NULL);
|
|
sigaction(SIGQUIT, &sa, NULL);
|
|
sigaction(SIGUSR1, &sa, NULL);
|
|
sigaction(SIGUSR2, &sa, NULL);
|
|
}
|
|
|
|
/*
 * Block until `main_child` terminates, collecting any other children that
 * happen to exit first.
 *
 * Returns as soon as `main_child` has been reaped -- the caller then
 * _exit()s, which as PID 1 tears down the namespace and SIGKILLs any
 * stragglers.
 *
 * Returns an exit(3)-style status: the child's exit code if it exited
 * normally, 128+signal if a signal killed it, and 1 if its status is neither
 * or if waitpid() fails with anything other than EINTR.
 */
static int reap_until(pid_t main_child) {
    int wstatus = 0;

    for (;;) {
        pid_t reaped = waitpid(-1, &wstatus, 0);

        if (reaped >= 0) {
            if (reaped == main_child) {
                break;
            }
            /* An orphan died before main_child; keep waiting. */
            continue;
        }
        if (errno != EINTR) {
            /* ECHILD without seeing main_child -- shouldn't happen. */
            return 1;
        }
    }

    if (WIFEXITED(wstatus)) {
        return WEXITSTATUS(wstatus);
    }
    return WIFSIGNALED(wstatus) ? 128 + WTERMSIG(wstatus) : 1;
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
if (argc < 3) {
|
|
fprintf(stderr, "Usage: %s <filter.bpf> <command> [args...]\n", argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
const char *filter_path = argv[1];
|
|
char **command_argv = &argv[2];
|
|
|
|
/* ---- Load the BPF filter up front so we fail before any namespace work. ---- */
|
|
int fd = open(filter_path, O_RDONLY);
|
|
if (fd < 0) {
|
|
die("apply-seccomp: open(filter)");
|
|
}
|
|
static unsigned char filter_bytes[MAX_FILTER_SIZE];
|
|
ssize_t filter_size = read(fd, filter_bytes, MAX_FILTER_SIZE);
|
|
close(fd);
|
|
if (filter_size <= 0 || filter_size % 8 != 0) {
|
|
fprintf(stderr, "apply-seccomp: invalid BPF filter (size=%zd)\n", filter_size);
|
|
return 1;
|
|
}
|
|
struct sock_fprog prog = {
|
|
.len = (unsigned short)(filter_size / 8),
|
|
.filter = (struct sock_filter *)filter_bytes,
|
|
};
|
|
|
|
/* ---- New PID + mount namespaces. Children (not us) enter the PID ns. ----
|
|
*
|
|
* Two paths to get CAP_SYS_ADMIN for the unshare:
|
|
* (a) The caller (bwrap) kept CAP_SYS_ADMIN in this user namespace via
|
|
* --cap-add. Just unshare directly.
|
|
* (b) We don't have the cap. Create a nested user namespace to get it,
|
|
* map uid/gid, then unshare. This also works when apply-seccomp is
|
|
* run standalone outside bwrap.
|
|
*
|
|
* Path (a) is tried first. If the caller didn't give us the cap, the
|
|
* kernel returns EPERM and we fall through to (b). Path (b) can itself
|
|
* fail on hosts where unprivileged user namespaces are gated by an LSM
|
|
* (Ubuntu 24.04's AppArmor restriction, for example) — the unshare
|
|
* succeeds but the new namespace grants no capabilities, so the setgroups
|
|
* write fails. In that case we abort: the caller must supply CAP_SYS_ADMIN.
|
|
*/
|
|
if (unshare(CLONE_NEWPID | CLONE_NEWNS) < 0) {
|
|
if (errno != EPERM) {
|
|
die("apply-seccomp: unshare(CLONE_NEWPID|CLONE_NEWNS)");
|
|
}
|
|
|
|
uid_t uid = geteuid();
|
|
gid_t gid = getegid();
|
|
|
|
if (unshare(CLONE_NEWUSER) < 0) {
|
|
die("apply-seccomp: unshare(CLONE_NEWUSER)");
|
|
}
|
|
if (write_file("/proc/self/setgroups", "deny") < 0) {
|
|
die("apply-seccomp: write /proc/self/setgroups "
|
|
"(nested userns is capability-restricted; "
|
|
"caller must provide CAP_SYS_ADMIN)");
|
|
}
|
|
if (write_file("/proc/self/uid_map", "%u %u 1\n", uid, uid) < 0) {
|
|
die("apply-seccomp: write /proc/self/uid_map");
|
|
}
|
|
if (write_file("/proc/self/gid_map", "%u %u 1\n", gid, gid) < 0) {
|
|
die("apply-seccomp: write /proc/self/gid_map");
|
|
}
|
|
if (unshare(CLONE_NEWPID | CLONE_NEWNS) < 0) {
|
|
die("apply-seccomp: unshare(CLONE_NEWPID|CLONE_NEWNS) after userns");
|
|
}
|
|
}
|
|
|
|
pid_t child = fork();
|
|
if (child < 0) {
|
|
die("apply-seccomp: fork");
|
|
}
|
|
|
|
if (child > 0) {
|
|
/* Outer stub: still in bwrap's PID namespace. Forward signals and
|
|
* wait so the caller sees the real exit status. */
|
|
install_forwarders(child);
|
|
|
|
int status;
|
|
for (;;) {
|
|
pid_t r = waitpid(child, &status, 0);
|
|
if (r < 0 && errno == EINTR) continue;
|
|
if (r < 0) die("apply-seccomp: waitpid");
|
|
break;
|
|
}
|
|
if (WIFSIGNALED(status)) return 128 + WTERMSIG(status);
|
|
return WIFEXITED(status) ? WEXITSTATUS(status) : 1;
|
|
}
|
|
|
|
/* ================================================================
|
|
* Inner init — PID 1 in the nested PID namespace.
|
|
* ================================================================ */
|
|
|
|
/* Block ptrace and /proc/1/mem writes against this process. */
|
|
if (prctl(PR_SET_DUMPABLE, 0) < 0) {
|
|
die("apply-seccomp: prctl(PR_SET_DUMPABLE)");
|
|
}
|
|
|
|
/* Don't let our /proc mount propagate anywhere. */
|
|
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
|
|
die("apply-seccomp: mount(MS_PRIVATE)");
|
|
}
|
|
/* EPERM here means a masked /proc is underneath (unprivileged Docker)
|
|
* and the kernel domination check refused the overmount. The nested
|
|
* userns above is the isolation boundary; this remount only hides
|
|
* outer PIDs from `ls /proc`. enableWeakerNestedSandbox targets
|
|
* exactly this environment. */
|
|
if (mount("proc", "/proc", "proc", MS_NOSUID | MS_NODEV | MS_NOEXEC, NULL) < 0
|
|
&& errno != EPERM) {
|
|
die("apply-seccomp: mount(/proc)");
|
|
}
|
|
|
|
/* bwrap --cap-add places CAP_SYS_ADMIN in the ambient set so it survives
|
|
* exec. Clear it now that the mount is done; combined with
|
|
* PR_SET_NO_NEW_PRIVS, the worker's execve drops to zero capabilities. */
|
|
if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) < 0) {
|
|
die("apply-seccomp: prctl(PR_CAP_AMBIENT_CLEAR_ALL)");
|
|
}
|
|
|
|
/* Fork the real workload so PID 1 can stay as a non-dumpable reaper. */
|
|
pid_t worker = fork();
|
|
if (worker < 0) {
|
|
die("apply-seccomp: fork(worker)");
|
|
}
|
|
|
|
if (worker > 0) {
|
|
/* Inner init: reap everything, exit with the worker's status.
|
|
* When PID 1 exits the kernel tears down the whole namespace.
|
|
* PID 1 drops signals without handlers, so install forwarders. */
|
|
install_forwarders(worker);
|
|
_exit(reap_until(worker));
|
|
}
|
|
|
|
/* ---- Worker (inner PID 2): apply seccomp and exec. ---- */
|
|
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
|
|
die("apply-seccomp: prctl(PR_SET_NO_NEW_PRIVS)");
|
|
}
|
|
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) < 0) {
|
|
die("apply-seccomp: prctl(PR_SET_SECCOMP)");
|
|
}
|
|
|
|
execvp(command_argv[0], command_argv);
|
|
die("apply-seccomp: execvp");
|
|
return 1;
|
|
}
|