[PATCH 6/7] um: skas: kill stubs that block SIGALRM via a watchdog thread

Fri Jun 19 20:22:23 PDT 2026

From: Cong Wang <cwang at multikernel.io>

A hijacked stub can block SIGALRM with a crafted rt_sigreturn (the mask is
restored from the stack it controls), so it is never preempted and never
reports back. SIGALRM goes to the stub, not the monitor, so the monitor
then blocks indefinitely in wait_stub_done_seccomp(). This is the one item
("blocking e.g. SIGALRM") of the stub.c "Known security issues" list that a
seccomp filter cannot address, since rt_sigreturn is required and its
effect lives on the stack rather than in register arguments.

Detect it out of band. A helper thread blocks on its own timerfd and
watches a per-vCPU (pid, seq) pair the monitor updates around each wait:
pid is the stub being waited on, seq advances every time the stub reports.
A stub that stops reporting leaves pid pinned and seq frozen; after
SECCOMP_WD_STALL_TICKS ticks of no progress the watchdog SIGKILLs it, and
the resulting SIGCHLD unblocks the monitor through the existing "pid < 0"
teardown.

Each monitor writes only its own slot and the watchdog only reads, so the
word-sized state needs just __READ_ONCE()/__WRITE_ONCE(), no lock; the
watchdog scans every slot, covering all CPUs under SMP. A false kill cannot
happen unless the same pid shows an unchanged seq on SECCOMP_WD_STALL_TICKS
consecutive ticks. The
thread runs with all signals blocked (os_run_helper_thread()), uses
write(2) rather than printk() from its non-kernel context, and is started
once via a compare-and-swap guard. It is preferred over a bounded
FUTEX_WAIT timeout: it costs only a few plain word-sized stores per wait on
the per-syscall hot path and catches a stall anywhere on stub input, not
just the one futex.

Verified on UML (UP and 2-CPU SMP): heavy mmap/munmap churn and CPU-bound
loops on every vCPU run with zero false kills; a stub with SIGALRM blocked
is killed in ~5s and the monitor recovers, while syscall-making processes
are untouched.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang at multikernel.io>
---
 arch/um/os-Linux/skas/process.c | 129 ++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)

diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 2010b4529c41..7ffde2b00b61 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -21,6 +21,7 @@
 #include <as-layout.h>
 #include <init.h>
 #include <kern_util.h>
+#include <smp.h>
 #include <mem.h>
 #include <os.h>
 #include <ptrace_user.h>
@@ -33,6 +34,8 @@
 #include <linux/threads.h>
 #include <sys/ioctl.h>
 #include <poll.h>
+#include <signal.h>
+#include <sys/timerfd.h>
 #include <timetravel.h>
 #include <asm-generic/rwonce.h>
 #include "../internal.h"
@@ -254,11 +257,131 @@ static void seccomp_serve_stub_syscalls(struct mm_id *mm_idp)
 		seccomp_notify_serve(mm_idp);
 }
 
+#define SECCOMP_WD_TICK_SECS	1	/* watchdog timer period, in seconds */
+#define SECCOMP_WD_STALL_TICKS	5	/* ~5s of no progress before killing */
+
+static int seccomp_wd_pid[CONFIG_NR_CPUS] = { [0 ... CONFIG_NR_CPUS - 1] = -1 };
+static unsigned long seccomp_wd_seq[CONFIG_NR_CPUS];	/* per-CPU progress count */
+
+static inline void seccomp_wd_enter(int pid)
+{
+	int cpu = uml_curr_cpu();	/* slot index: the current CPU */
+
+	__WRITE_ONCE(seccomp_wd_seq[cpu], seccomp_wd_seq[cpu] + 1);
+	__WRITE_ONCE(seccomp_wd_pid[cpu], pid);	/* stub now being waited on */
+}
+
+static inline void seccomp_wd_progress(void)
+{
+	int cpu = uml_curr_cpu();	/* the stub reported: advance our seq */
+
+	__WRITE_ONCE(seccomp_wd_seq[cpu], seccomp_wd_seq[cpu] + 1);
+}
+
+static inline void seccomp_wd_exit(void)
+{
+	__WRITE_ONCE(seccomp_wd_pid[uml_curr_cpu()], -1);	/* -1: slot idle */
+}
+
+/* Per-CPU snapshot the watchdog compares against the next tick. */
+struct seccomp_wd_cpu {
+	int prev_pid;		/* pid read on the previous tick (-1 initially) */
+	unsigned long prev_seq;	/* seq read on the previous tick */
+	int stall;		/* consecutive ticks with pid and seq unchanged */
+};
+
+/* Compare @cpu's slot with the last tick; SIGKILL a stub stalled too long. */
+static void seccomp_wd_check_cpu(int cpu, struct seccomp_wd_cpu *st)
+{
+	static const char kill_msg[] =
+		"seccomp watchdog: killing unresponsive stub (SIGALRM blocked?)\n";
+	int pid = __READ_ONCE(seccomp_wd_pid[cpu]);
+	unsigned long seq = __READ_ONCE(seccomp_wd_seq[cpu]);
+
+	/* Never pass pid 0 to kill(): that signals the whole process group. */
+	if (pid <= 0 || pid != st->prev_pid || seq != st->prev_seq) {
+		st->stall = 0;
+	} else if (++st->stall >= SECCOMP_WD_STALL_TICKS) {
+		/* printk() is unsafe from this thread. */
+		(void)!write(2, kill_msg, sizeof(kill_msg) - 1);
+		kill(pid, SIGKILL);
+		st->stall = 0;
+	}
+
+	st->prev_pid = pid;
+	st->prev_seq = seq;
+}
+
+/*
+ * Watchdog thread: one check of every CPU slot per timerfd wakeup. Ignoring
+ * the expiration count makes a descheduled watchdog slower, never faster.
+ */
+static void *seccomp_watchdog(void *arg)
+{
+	int tfd = (int)(long)arg;	/* timerfd handed over by the starter */
+	struct seccomp_wd_cpu st[CONFIG_NR_CPUS];
+	int cpu;
+
+	for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++)
+		st[cpu] = (struct seccomp_wd_cpu){ .prev_pid = -1 };
+
+	for (;;) {
+		unsigned long long expirations;
+
+		if (read(tfd, &expirations, sizeof(expirations)) !=
+		    sizeof(expirations))
+			continue;	/* failed or short read: skip this round */
+
+		for (cpu = 0; cpu < CONFIG_NR_CPUS; cpu++)
+			seccomp_wd_check_cpu(cpu, &st[cpu]);
+	}
+
+	return NULL;	/* not reached: the loop above never exits */
+}
+
+/* Start watchdog once (first caller wins); a failed start is not retried. */
+static void start_seccomp_watchdog(void)
+{
+	static int started;
+	static struct os_helper_thread *wd_td;
+	struct itimerspec its = {
+		.it_value    = { .tv_sec = SECCOMP_WD_TICK_SECS },
+		.it_interval = { .tv_sec = SECCOMP_WD_TICK_SECS },
+	};
+	int tfd;
+
+	if (!__sync_bool_compare_and_swap(&started, 0, 1))
+		return;
+
+	/* Close-on-exec: only the watchdog thread ever reads this timer. */
+	tfd = timerfd_create(CLOCK_MONOTONIC, TFD_CLOEXEC);
+	if (tfd < 0) {
+		printk(UM_KERN_ERR "%s : timerfd_create failed, errno = %d\n",
+		       __func__, errno);
+		return;
+	}
+
+	if (timerfd_settime(tfd, 0, &its, NULL) < 0) {
+		printk(UM_KERN_ERR "%s : timerfd_settime failed, errno = %d\n",
+		       __func__, errno);
+		close(tfd);
+		return;
+	}
+
+	if (os_run_helper_thread(&wd_td, seccomp_watchdog, (void *)(long)tfd) < 0) {
+		printk(UM_KERN_ERR "%s : failed to start watchdog thread\n", __func__);
+		close(tfd);
+		wd_td = NULL;
+	}
+}
+
 void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 {
 	struct stub_data *data = (void *)mm_idp->stack;
 	int ret;
 
+	seccomp_wd_enter(mm_idp->pid);
+
 	do {
 		if (!running) {
 			wake_seccomp_stub(mm_idp);
@@ -296,6 +419,9 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 		if (__READ_ONCE(mm_idp->pid) < 0)
 			goto out_kill;
 
+		/* The stub reported back: record progress for the watchdog. */
+		seccomp_wd_progress();
+
 		running = 0;
 
 		/* We may receive a SIGALRM before SIGSYS, iterate again. */
@@ -312,9 +438,11 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 		goto out_kill;
 	}
 
+	seccomp_wd_exit();
 	return;
 
 out_kill:
+	seccomp_wd_exit();
 	printk(UM_KERN_ERR "%s : failed to wait for stub, pid = %d, errno = %d\n",
 	       __func__, mm_idp->pid, errno);
 	/* This is not true inside start_userspace */
@@ -633,6 +761,7 @@ int start_userspace(struct mm_id *mm_id)
 		err = get_stub_listener(mm_id);
 		if (err)
 			goto out_kill;
+		start_seccomp_watchdog();
 	} else {
 		close(tramp_data.sockpair[1]);
 	}
-- 
2.43.0