[PATCH 1/7] um: skas: create a seccomp USER_NOTIF listener and hand it to the monitor

Cong Wang xiyou.wangcong at gmail.com
Fri Jun 19 20:22:18 PDT 2026


From: Cong Wang <cwang at multikernel.io>

First step toward validating stub mmap() calls (the intra-guest /
host-escape disclosure issue in stub.c "Known security issues"): give the
monitor a SECCOMP_RET_USER_NOTIF listener for each stub's filter, so a
later change can route mmap to the monitor for per-call validation.

The stub installs its seccomp filter with NEW_LISTENER (plus TSYNC_ESRCH,
which the kernel requires to combine NEW_LISTENER with TSYNC); the
seccomp() return value is the listener fd. mmap still returns RET_ALLOW,
so there is no behavioural change yet.

The stub cannot hand the fd over itself: once the filter is installed,
every syscall it makes from outside the stub page traps with SIGSYS
instead of executing, so it can neither sendmsg() nor close() the fd.
Instead the monitor pulls it with pidfd_getfd(): the listener is, by
construction, fd 1 in the stub (close_range() left only fd 0 open before
seccomp() allocated it). Leaving the stub's copy open is harmless:
ioctl (NOTIF_RECV/SEND) is not on the syscall allowlist, so a hijacked
stub cannot self-approve. The monitor stores the fd in mm_id.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang at multikernel.io>
---
 arch/um/include/shared/skas/mm_id.h |  1 +
 arch/um/kernel/skas/stub_exe.c      | 12 ++++++---
 arch/um/os-Linux/skas/process.c     | 38 +++++++++++++++++++++++++++--
 3 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/arch/um/include/shared/skas/mm_id.h b/arch/um/include/shared/skas/mm_id.h
index 18c0621430d2..46164d71554b 100644
--- a/arch/um/include/shared/skas/mm_id.h
+++ b/arch/um/include/shared/skas/mm_id.h
@@ -17,6 +17,7 @@ struct mm_id {
 
 	/* Only used with SECCOMP mode */
 	int sock;
+	int seccomp_notify_fd;
 	int syscall_fd_num;
 	int syscall_fd_map[STUB_MAX_FDS];
 };
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index cbafaa684e66..b5432f6ccbc7 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -196,10 +196,14 @@ noinline static void real_init(void)
 			.len = sizeof(filter) / sizeof(filter[0]),
 			.filter = filter,
 		};
-
-		if (stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
-				  SECCOMP_FILTER_FLAG_TSYNC,
-				  (unsigned long)&prog) != 0)
+		long listener;
+
+		listener = stub_syscall3(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
+					 SECCOMP_FILTER_FLAG_TSYNC |
+					 SECCOMP_FILTER_FLAG_NEW_LISTENER |
+					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH,
+					 (unsigned long)&prog);
+		if (listener < 0)
 			stub_syscall1(__NR_exit, 21);
 
 		/* Fall through, the exit syscall will cause SIGSYS */
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index d6c22f8aa06d..0ab1d109a68d 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -428,6 +428,34 @@ __initcall(init_stub_exe_fd);
 
 int using_seccomp;
 
+/*
+ * Obtain the SECCOMP_RET_USER_NOTIF listener fd the stub created. The stub
+ * cannot hand it over itself: after installing the filter, every syscall it
+ * makes traps with SIGSYS rather than executing (so it can neither sendmsg()
+ * it nor close() it). Instead the listener is, by construction, the first free
+ * fd in the stub -- fd 1, since close_range() left only fd 0 open before
+ * seccomp() allocated it -- so the monitor duplicates it directly with
+ * pidfd_getfd(). ptrace_may_access() holds because the monitor is the stub's
+ * parent. Returns 0 and sets mm_id->seccomp_notify_fd, or -errno on failure.
+ */
+#define STUB_LISTENER_FD 1
+static int get_stub_listener(struct mm_id *mm_id)
+{
+	int pidfd, lfd, err;
+
+	pidfd = syscall(__NR_pidfd_open, mm_id->pid, 0);
+	if (pidfd < 0)
+		return -errno;
+
+	lfd = syscall(__NR_pidfd_getfd, pidfd, STUB_LISTENER_FD, 0);
+	err = errno;	/* sample before close(), which may overwrite it */
+	close(pidfd);
+	if (lfd < 0)
+		return -err;
+	mm_id->seccomp_notify_fd = lfd;
+	return 0;
+}
+
 /**
  * start_userspace() - prepare a new userspace process
  * @mm_id: The corresponding struct mm_id
@@ -449,6 +477,8 @@ int start_userspace(struct mm_id *mm_id)
 	unsigned long sp;
 	int status, n, err;
 
+	mm_id->seccomp_notify_fd = -1;
+
 	/* setup a temporary stack page */
 	stack = mmap(NULL, UM_KERN_PAGE_SIZE,
 		     PROT_READ | PROT_WRITE | PROT_EXEC,
@@ -522,10 +552,14 @@ int start_userspace(struct mm_id *mm_id)
 	}
 
 	close(tramp_data.sockpair[0]);
-	if (using_seccomp)
+	if (using_seccomp) {
 		mm_id->sock = tramp_data.sockpair[1];
-	else
+		err = get_stub_listener(mm_id);
+		if (err)
+			goto out_kill;
+	} else {
 		close(tramp_data.sockpair[1]);
+	}
 
 	return 0;
 
-- 
2.43.0




More information about the linux-um mailing list