[PATCH 5/7] um: skas: validate stub munmap() against the guest address range

Cong Wang xiyou.wangcong at gmail.com
Fri Jun 19 20:22:22 PDT 2026


From: Cong Wang <cwang at multikernel.io>

Route stub munmap() through the USER_NOTIF monitor too, and validate it
before letting it run. munmap() was previously SECCOMP_RET_ALLOW, so a
hijacked stub (jumping to the in-stub munmap with crafted registers) could
unmap arbitrary ranges. That includes the stub's own code/data pages,
which would sever the monitor's control over the stub, as well as guest
mappings outside what the stub is allowed to manage. With mmap() already
validated by the monitor, munmap() was the remaining memory primitive a
hijacked stub could invoke with arbitrary arguments.

Unlike mmap(), there is no PTE left to check: by the time the stub unmaps
a guest page the kernel has already cleared the corresponding entry. So
stub_munmap_allowed() is range-based instead: the request must be
non-empty (len != 0), must not wrap (addr + len >= addr), and must lie
entirely below the stub region (addr + len <= STUB_START). That confines
the stub to the guest address space and keeps its own reserved region
off-limits. Both arguments are scalars captured in seccomp_data, so
CONTINUE carries no TOCTOU risk, same as mmap().

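stub_munmap_allowed() lives in skas/uaccess.c next to stub_mmap_allowed();
the os-Linux notify handler dispatches on the syscall number and responds
CONTINUE or -EPERM (any other syscall number is denied). The batch server,
seccomp_serve_mmaps(), is renamed to seccomp_serve_stub_syscalls() and now
counts STUB_SYSCALL_MUNMAP as well as STUB_SYSCALL_MMAP entries, so that
one notification is served for every syscall routed to the monitor.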

Verified on UML: guest boots and survives heavy mmap/munmap churn with
zero false denials; the legitimate boot-time clear of the whole user
address space [0, STUB_START) is allowed (end == STUB_START), while a
range overlapping the stub region is denied.

Assisted-by: Claude:claude-opus-4.8
Signed-off-by: Cong Wang <cwang at multikernel.io>
---
 arch/um/include/shared/skas/skas.h |  2 ++
 arch/um/kernel/skas/stub_exe.c     |  4 ++--
 arch/um/kernel/skas/uaccess.c      | 12 ++++++++++++
 arch/um/os-Linux/skas/process.c    | 31 ++++++++++++++++++------------
 4 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h
index ce1b67b06b4b..ca2a62cef0c1 100644
--- a/arch/um/include/shared/skas/skas.h
+++ b/arch/um/include/shared/skas/skas.h
@@ -20,5 +20,7 @@ void initial_jmpbuf_unlock(void);
 
 int stub_mmap_allowed(struct mm_id *id, unsigned long addr,
 		      unsigned long prot, unsigned long offset);
+int stub_munmap_allowed(struct mm_id *id, unsigned long addr,
+			unsigned long len);
 
 #endif
diff --git a/arch/um/kernel/skas/stub_exe.c b/arch/um/kernel/skas/stub_exe.c
index 65ea2af5ca73..00eea0cb9463 100644
--- a/arch/um/kernel/skas/stub_exe.c
+++ b/arch/um/kernel/skas/stub_exe.c
@@ -175,7 +175,7 @@ noinline static void real_init(void)
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, STUB_MMAP_NR,
 				 5, 0),
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_munmap,
-				 3, 0),
+				 4, 0),
 #ifdef __i386__
 			BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_set_thread_area,
 				 2, 0),
@@ -192,7 +192,7 @@ noinline static void real_init(void)
 			/* [18] Permitted call for the stub */
 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 
-			/* [19] mmap: route to the monitor for validation */
+			/* [19] mmap and munmap: route to the monitor for validation */
 			BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_USER_NOTIF),
 		};
 		struct sock_fprog prog = {
diff --git a/arch/um/kernel/skas/uaccess.c b/arch/um/kernel/skas/uaccess.c
index 9359ede8a04b..feb267637735 100644
--- a/arch/um/kernel/skas/uaccess.c
+++ b/arch/um/kernel/skas/uaccess.c
@@ -14,6 +14,7 @@
 #include <asm/futex.h>
 #include <os.h>
 #include <skas.h>
+#include <as-layout.h>
 
 /*
  * Same mapping as MMAP_OFFSET() in <sysdep/stub.h>, but usable from kernel
@@ -79,6 +80,17 @@ int stub_mmap_allowed(struct mm_id *id, unsigned long addr,
 	return 1;
 }
 
+int stub_munmap_allowed(struct mm_id *id, unsigned long addr, unsigned long len)
+{
+	if (len == 0 || addr + len < addr)
+		return 0;
+
+	if (addr + len > STUB_START)
+		return 0;
+
+	return 1;
+}
+
 static pte_t *maybe_map(unsigned long virt, int is_write)
 {
 	pte_t *pte = virt_to_pte(current->mm, virt);
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 0987eb79ce76..2010b4529c41 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -193,7 +193,7 @@ static int seccomp_notify_serve(struct mm_id *mm_idp)
 	struct seccomp_notif req = {};
 	struct seccomp_notif_resp resp = {};
 	int notify_fd = mm_idp->seccomp_notify_fd;
-	int ret;
+	int allowed, ret;
 
 	CATCH_EINTR(ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_RECV, &req));
 	if (ret < 0)
@@ -201,13 +201,19 @@ static int seccomp_notify_serve(struct mm_id *mm_idp)
 
 	resp.id = req.id;
 
-	if (req.data.nr == STUB_MMAP_NR &&
-	    stub_mmap_allowed(mm_idp, req.data.args[0], req.data.args[2],
-			      req.data.args[5])) {
+	if (req.data.nr == STUB_MMAP_NR)
+		allowed = stub_mmap_allowed(mm_idp, req.data.args[0],
+					    req.data.args[2], req.data.args[5]);
+	else if (req.data.nr == __NR_munmap)
+		allowed = stub_munmap_allowed(mm_idp, req.data.args[0],
+					      req.data.args[1]);
+	else
+		allowed = 0;
+
+	if (allowed)
 		resp.flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
-	} else {
+	else
 		resp.error = -EPERM;
-	}
 
 	CATCH_EINTR(ret = ioctl(notify_fd, SECCOMP_IOCTL_NOTIF_SEND, &resp));
 	if (ret < 0)
@@ -231,19 +237,20 @@ static void seccomp_notify_drain(struct mm_id *mm_idp)
 			break;
 }
 
-static void seccomp_serve_mmaps(struct mm_id *mm_idp)
+static void seccomp_serve_stub_syscalls(struct mm_id *mm_idp)
 {
 	struct stub_data *data = (void *)mm_idp->stack;
-	int i, n_mmaps = 0;
+	int i, n_notif = 0;
 
 	if (mm_idp->seccomp_notify_fd < 0)
 		return;
 
 	for (i = 0; i < data->syscall_data_len; i++)
-		if (data->syscall_data[i].syscall == STUB_SYSCALL_MMAP)
-			n_mmaps++;
+		if (data->syscall_data[i].syscall == STUB_SYSCALL_MMAP ||
+		    data->syscall_data[i].syscall == STUB_SYSCALL_MUNMAP)
+			n_notif++;
 
-	for (i = 0; i < n_mmaps; i++)
+	for (i = 0; i < n_notif; i++)
 		seccomp_notify_serve(mm_idp);
 }
 
@@ -255,7 +262,7 @@ void wait_stub_done_seccomp(struct mm_id *mm_idp, int running, int wait_sigsys)
 	do {
 		if (!running) {
 			wake_seccomp_stub(mm_idp);
-			seccomp_serve_mmaps(mm_idp);
+			seccomp_serve_stub_syscalls(mm_idp);
 		}
 
 		do {
-- 
2.43.0




More information about the linux-um mailing list