[PATCH 2/7] um: skas: gate stub mmap() through the USER_NOTIF monitor

Cong Wang xiyou.wangcong at gmail.com
Fri Jun 19 20:22:19 PDT 2026


From: Cong Wang <cwang at multikernel.io>

Route the stub's mmap to SECCOMP_RET_USER_NOTIF instead of
SECCOMP_RET_ALLOW, and have the monitor service the resulting notifications
inline. This is the mechanism that a following change uses to validate each
mmap's arguments; for now every stub mmap is allowed (the monitor responds
with SECCOMP_USER_NOTIF_FLAG_CONTINUE), so behaviour is unchanged.

CONTINUE is safe for mmap: its arguments are all scalars captured in
seccomp_data, so there is no TOCTOU re-read of user memory.

The stub runs queued mmap batches in two places: syscall_stub_flush()
and the userspace() resume path, both of which wake the stub via
wait_stub_done_seccomp(). Servicing therefore lives there: after waking,
the monitor issues one NOTIF_RECV/CONTINUE per STUB_SYSCALL_MMAP in the
batch the stub is about to run. Signals are masked while the stub
executes the batch inside its SIGSYS handler, so notifications arrive in
queued order with nothing interleaved, and a simple counted loop is
sufficient. The pre-existing wake logic is factored into
wake_seccomp_stub(); only the wake (!running) path calls it and then
services the batch, while the wait-only path does neither.

Verified on UML: guest boots and survives a fork/exec storm plus heavy
demand paging with every stub mmap round-tripping through the monitor.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang at multikernel.io>
---
 arch/um/kernel/skas/stub_exe.c  |   5 +-
 arch/um/os-Linux/skas/process.c | 121 ++++++++++++++++++++++----------
 2 files changed, 89 insertions(+), 37 deletions(-)

diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index b5432f6ccbc7..65ea2af5ca73 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -173,7 +173,7 @@ noinline static void real_init(void)
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K,__NR_close,
 				 5, 0),
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
-				 4, 0),
+				 5, 0),
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
 				 3, 0),
 #ifdef __i386__
@@ -191,6 +191,9 @@ noinline static void real_init(void)
 
 			/* [18] Permitted call for the stub */
 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+
+			/* [19] mmap: route to the monitor for validation */
+			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
 		};
 		struct sock_fprog prog = {
 			.len = sizeof(filter) / sizeof(filter[0]),
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 0ab1d109a68d..63b426b2c523 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -29,7 +29,9 @@
 #include <sysdep/stub.h>
 #include <sysdep/mcontext.h>
 #include <linux/futex.h>
+#include <linux/seccomp.h>
 #include <linux/threads.h>
+#include <sys/ioctl.h>
 #include <timetravel.h>
 #include <asm-generic/rwonce.h>
 #include "../internal.h"
@@ -147,49 +149,89 @@ void wait_stub_done(int pid)
 	fatal_sigsegv();
 }
 
+/* Wake the stub: hand over queued FDs via SCM_RIGHTS, then FUTEX_WAKE it. */
+static void wake_seccomp_stub(struct mm_id *mm_idp)
+{
+	struct stub_data *stub = (void *)mm_idp->stack;
+
+	if (mm_idp->syscall_fd_num) {
+		unsigned int fds_size = sizeof(int) * mm_idp->syscall_fd_num;
+		const char dummy = 0;	/* one payload byte for the message */
+		struct iovec vec = {
+			.iov_base = (void *)&dummy,
+			.iov_len = sizeof(dummy),
+		};
+		union {
+			char buf[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))];
+			struct cmsghdr align;
+		} cbuf;
+		struct msghdr msg = {
+			.msg_iov = &vec,
+			.msg_iovlen = 1,
+			.msg_control = cbuf.buf,
+			.msg_controllen = CMSG_SPACE(fds_size),
+		};
+		struct cmsghdr *hdr = CMSG_FIRSTHDR(&msg);
+
+		hdr->cmsg_level = SOL_SOCKET;
+		hdr->cmsg_type = SCM_RIGHTS;
+		hdr->cmsg_len = CMSG_LEN(fds_size);
+		memcpy(CMSG_DATA(hdr), mm_idp->syscall_fd_map, fds_size);
+
+		CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock, &msg, 0));
+	}
+
+	stub->signal = 0;
+	stub->futex = FUTEX_IN_CHILD;
+	CATCH_EINTR(syscall(__NR_futex, &stub->futex,
+			    FUTEX_WAKE, 1, NULL, NULL, 0));
+}
+
+/*
+ * Receive one notification on @notify_fd and reply CONTINUE; 0 or -errno.
+ */
+static int seccomp_notify_serve(int notify_fd)
+{
+	struct seccomp_notif notif = {};
+	struct seccomp_notif_resp reply = {};
+	int rc;
+
+	CATCH_EINTR(rc = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &notif));
+	if (rc < 0)
+		return -errno;
+
+	reply.id = notif.id;
+	reply.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+
+	CATCH_EINTR(rc = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, &reply));
+	return rc < 0 ? -errno : 0;
+}
+
+/* Serve one notification per queued STUB_SYSCALL_MMAP; stop at first error. */
+static void seccomp_serve_mmaps(struct mm_id *mm_idp)
+{
+	struct stub_data *data = (void *)mm_idp->stack;
+	int i, n_mmaps = 0;
+
+	if (mm_idp->seccomp_notify_fd < 0)
+		return;
+	for (i = 0; i < data->syscall_data_len; i++)
+		n_mmaps += data->syscall_data[i].syscall == STUB_SYSCALL_MMAP;
+	/* On failure stop: another NOTIF_RECV could block with nothing pending. */
+	for (i = 0; i < n_mmaps; i++)
+		if (seccomp_notify_serve(mm_idp->seccomp_notify_fd) < 0)
+			break;
+}
+
 void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 {
 	struct stub_data *data = (void *)mm_idp->stack;
 	int ret;
 
 	do {
-		const char byte = 0;
-		struct iovec iov = {
-			.iov_base = (void *)&byte,
-			.iov_len = sizeof(byte),
-		};
-		union {
-			char data[CMSG_SPACE(sizeof(mm_idp->syscall_fd_map))];
-			struct cmsghdr align;
-		} ctrl;
-		struct msghdr msgh = {
-			.msg_iov = &iov,
-			.msg_iovlen = 1,
-		};
-
 		if (!running) {
-			if (mm_idp->syscall_fd_num) {
-				unsigned int fds_size =
-					sizeof(int) * mm_idp->syscall_fd_num;
-				struct cmsghdr *cmsg;
-
-				msgh.msg_control = ctrl.data;
-				msgh.msg_controllen = CMSG_SPACE(fds_size);
-				cmsg = CMSG_FIRSTHDR(&msgh);
-				cmsg->cmsg_level = SOL_SOCKET;
-				cmsg->cmsg_type = SCM_RIGHTS;
-				cmsg->cmsg_len = CMSG_LEN(fds_size);
-				memcpy(CMSG_DATA(cmsg), mm_idp->syscall_fd_map,
-				       fds_size);
-
-				CATCH_EINTR(syscall(__NR_sendmsg, mm_idp->sock,
-						&msgh, 0));
-			}
-
-			data->signal = 0;
-			data->futex = FUTEX_IN_CHILD;
-			CATCH_EINTR(syscall(__NR_futex, &data->futex,
-					    FUTEX_WAKE, 1, NULL, NULL, 0));
+			wake_seccomp_stub(mm_idp);
+			seccomp_serve_mmaps(mm_idp);
 		}
 
 		do {
@@ -246,6 +288,13 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 		fatal_sigsegv();
 }
 
+/*
+ * seccomp_notify_serve() (above) services one SECCOMP_RET_USER_NOTIF
+ * notification from a stub mmap: it reads the suspended call, then responds
+ * CONTINUE so the stub's real mmap runs. CONTINUE is safe because mmap takes
+ * only scalar arguments (no TOCTOU on user memory). Validation of (addr, len,
+ * prot, fd, offset) is added later; for now every stub mmap is allowed.
+ */
 extern unsigned long current_stub_stack(void);
 
 static void get_skas_faultinfo(int pid, struct faultinfo *fi)
-- 
2.43.0




More information about the linux-um mailing list