[PATCH v9 02/10] um: use execveat to create userspace MMs
Benjamin Berg
benjamin at sipsolutions.net
Thu Oct 17 02:01:39 PDT 2024
Hi,
On Thu, 2024-10-17 at 15:17 +0800, David Gow wrote:
> On Thu, 19 Sept 2024 at 20:45, Benjamin Berg <benjamin at sipsolutions.net> wrote:
> >
> > [SNIP]
>
> It turns out that this breaks the KUnit user alloc helpers on x86_64,
> at least on my machine.
>
> This can be reproduced with:
> ./tools/testing/kunit/kunit.py run usercopy
>
> Though the 32-bit version works:
> ./tools/testing/kunit/kunit.py run usercopy --kconfig_add CONFIG_64BIT=n
>
> The error we're getting is:
> start_userspace : expected SIGSTOP, got status = 139
> Could not create userspace mm
>
> This basically is the result of the stub_exe segfaulting very early on
> in its execution.
>
> It seems that this is due to the stack being misaligned, and so the
> generated SSE instructions are faulting. The workarounds I've tested
> here include:
> a) Build the stub with -mno-sse
> b) Decorate real_init() with __attribute__((force_align_arg_pointer))
> c) Decorate __start() with __attribute__((naked))
>
> The last one seems to validate my theory as to why this is occurring:
> __start's prologue is misaligning the stack, as __start is not
> actually _called_ from anything, so there's no 8-byte misalignment to
> hold the return address.
>
> If this makes sense, I'll send a patch out with whichever the
> preferred fix(es) are. My guess is that (c) is the "proper" fix,
> though I'd not _miss_ SSE if we chose to disable it for the handful of
> instructions here anyway.
Yeah, SSE is pointless here, but we probably should ensure the
alignment is as expected.
So, c) seems like the right thing to do here. I am just a bit confused
about the alignment guarantees from the kernel. Are we correct to
assume a 16 byte alignment there or do we also need b) just to be on
the safe side?
Unfortunately I have not managed to reproduce the issue.
Benjamin
>
> Cheers,
> -- David
>
> > arch/um/Makefile | 3 +-
> > arch/um/include/shared/skas/stub-data.h | 11 ++
> > arch/um/kernel/skas/.gitignore | 2 +
> > arch/um/kernel/skas/Makefile | 33 ++++-
> > arch/um/kernel/skas/stub_exe.c | 88 ++++++++++++
> > arch/um/kernel/skas/stub_exe_embed.S | 11 ++
> > arch/um/os-Linux/mem.c | 2 +-
> > arch/um/os-Linux/skas/process.c | 181 ++++++++++++++++----
> > ----
> > 8 files changed, 271 insertions(+), 60 deletions(-)
> > create mode 100644 arch/um/kernel/skas/.gitignore
> > create mode 100644 arch/um/kernel/skas/stub_exe.c
> > create mode 100644 arch/um/kernel/skas/stub_exe_embed.S
> >
> > diff --git a/arch/um/Makefile b/arch/um/Makefile
> > index 00b63bac5eff..31e367e8ab4d 100644
> > --- a/arch/um/Makefile
> > +++ b/arch/um/Makefile
> > @@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -
> > D__arch_um__ \
> > $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap \
> > -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \
> > -Din6addr_loopback=kernel_in6addr_loopback \
> > - -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr
> > + -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \
> > + -D__close_range=kernel__close_range
> >
> > KBUILD_RUSTFLAGS += -Crelocation-model=pie
> >
> > diff --git a/arch/um/include/shared/skas/stub-data.h
> > b/arch/um/include/shared/skas/stub-data.h
> > index 2b6b44759dfa..3fbdda727373 100644
> > --- a/arch/um/include/shared/skas/stub-data.h
> > +++ b/arch/um/include/shared/skas/stub-data.h
> > @@ -12,6 +12,17 @@
> > #include <as-layout.h>
> > #include <sysdep/tls.h>
> >
> > +struct stub_init_data {
> > + unsigned long stub_start;
> > +
> > + int stub_code_fd;
> > + unsigned long stub_code_offset;
> > + int stub_data_fd;
> > + unsigned long stub_data_offset;
> > +
> > + unsigned long segv_handler;
> > +};
> > +
> > #define STUB_NEXT_SYSCALL(s) \
> > ((struct stub_syscall *) (((unsigned long) s) + (s)-
> > >cmd_len))
> >
> > diff --git a/arch/um/kernel/skas/.gitignore
> > b/arch/um/kernel/skas/.gitignore
> > new file mode 100644
> > index 000000000000..c3409ced0f38
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/.gitignore
> > @@ -0,0 +1,2 @@
> > +stub_exe
> > +stub_exe.dbg
> > diff --git a/arch/um/kernel/skas/Makefile
> > b/arch/um/kernel/skas/Makefile
> > index 6f86d53e3d69..fbb61968055f 100644
> > --- a/arch/um/kernel/skas/Makefile
> > +++ b/arch/um/kernel/skas/Makefile
> > @@ -3,14 +3,43 @@
> > # Copyright (C) 2002 - 2007 Jeff Dike
> > (jdike@{addtoit,linux.intel}.com)
> > #
> >
> > -obj-y := stub.o mmu.o process.o syscall.o uaccess.o
> > +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \
> > + stub_exe_embed.o
> > +
> > +# Stub executable
> > +
> > +stub_exe_objs-y := stub_exe.o
> > +
> > +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F)
> > +
> > +# Object file containing the ELF executable
> > +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe
> > +
> > +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE
> > + $(call if_changed,stub_exe)
> > +
> > +$(obj)/stub_exe: OBJCOPYFLAGS := -S
> > +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE
> > + $(call if_changed,objcopy)
> > +
> > +quiet_cmd_stub_exe = STUB_EXE $@
> > + cmd_stub_exe = $(CC) -nostdlib -o $@ \
> > + $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \
> > + $(filter %.o,$^)
> > +
> > +STUB_EXE_LDFLAGS = -n -static
> > +
> > +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y)
> > +
> > +# end
> >
> > # stub.o is in the stub, so it can't be built with profiling
> > # GCC hardened also auto-enables -fpic, but we need %ebx so it
> > can't work ->
> > # disable it
> >
> > CFLAGS_stub.o := $(CFLAGS_NO_HARDENING)
> > -UNPROFILE_OBJS := stub.o
> > +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING)
> > +UNPROFILE_OBJS := stub.o stub_exe.o
> > KCOV_INSTRUMENT := n
> >
> > include $(srctree)/arch/um/scripts/Makefile.rules
> > diff --git a/arch/um/kernel/skas/stub_exe.c
> > b/arch/um/kernel/skas/stub_exe.c
> > new file mode 100644
> > index 000000000000..bc6ba2e4d805
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/stub_exe.c
> > @@ -0,0 +1,88 @@
> > +#include <sys/ptrace.h>
> > +#include <sys/prctl.h>
> > +#include <asm/unistd.h>
> > +#include <sysdep/stub.h>
> > +#include <stub-data.h>
> > +
> > +void _start(void);
> > +
> > +noinline static void real_init(void)
> > +{
> > + struct stub_init_data init_data;
> > + unsigned long res;
> > + struct {
> > + void *ss_sp;
> > + int ss_flags;
> > + size_t ss_size;
> > + } stack = {
> > + .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
> > + };
> > + struct {
> > + void *sa_handler_;
> > + unsigned long sa_flags;
> > + void *sa_restorer;
> > + unsigned long long sa_mask;
> > + } sa = {
> > + /* Need to set SA_RESTORER (but the handler never
> > returns) */
> > + .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO |
> > 0x04000000,
> > + /* no need to mask any signals */
> > + .sa_mask = 0,
> > + };
> > +
> > + /* set a nice name */
> > + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-
> > userspace");
> > +
> > + /* read information from STDIN and close it */
> > + res = stub_syscall3(__NR_read, 0,
> > + (unsigned long)&init_data,
> > sizeof(init_data));
> > + if (res != sizeof(init_data))
> > + stub_syscall1(__NR_exit, 10);
> > +
> > + stub_syscall1(__NR_close, 0);
> > +
> > + /* map stub code + data */
> > + res = stub_syscall6(STUB_MMAP_NR,
> > + init_data.stub_start,
> > UM_KERN_PAGE_SIZE,
> > + PROT_READ | PROT_EXEC, MAP_FIXED |
> > MAP_SHARED,
> > + init_data.stub_code_fd,
> > init_data.stub_code_offset);
> > + if (res != init_data.stub_start)
> > + stub_syscall1(__NR_exit, 11);
> > +
> > + res = stub_syscall6(STUB_MMAP_NR,
> > + init_data.stub_start +
> > UM_KERN_PAGE_SIZE,
> > + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
> > + PROT_READ | PROT_WRITE, MAP_FIXED |
> > MAP_SHARED,
> > + init_data.stub_data_fd,
> > init_data.stub_data_offset);
> > + if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
> > + stub_syscall1(__NR_exit, 12);
> > +
> > + /* setup signal stack inside stub data */
> > + stack.ss_sp = (void *)init_data.stub_start +
> > UM_KERN_PAGE_SIZE;
> > + stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
> > +
> > + /* register SIGSEGV handler */
> > + sa.sa_handler_ = (void *) init_data.segv_handler;
> > + res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned
> > long)&sa, 0,
> > + sizeof(sa.sa_mask));
> > + if (res != 0)
> > + stub_syscall1(__NR_exit, 13);
> > +
> > + stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> > +
> > + stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid),
> > SIGSTOP);
> > +
> > + stub_syscall1(__NR_exit, 14);
> > +
> > + __builtin_unreachable();
> > +}
> > +
> > +void _start(void)
> > +{
> > + char *alloc;
> > +
> > + /* Make enough space for the stub (including space for
> > alignment) */
> > + alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) *
> > UM_KERN_PAGE_SIZE);
> > + asm volatile("" : "+r,m"(alloc) : : "memory");
> > +
> > + real_init();
> > +}
> > diff --git a/arch/um/kernel/skas/stub_exe_embed.S
> > b/arch/um/kernel/skas/stub_exe_embed.S
> > new file mode 100644
> > index 000000000000..6d8914fbe8f1
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/stub_exe_embed.S
> > @@ -0,0 +1,11 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#include <linux/init.h>
> > +#include <linux/linkage.h>
> > +
> > +__INITDATA
> > +
> > +SYM_DATA_START(stub_exe_start)
> > + .incbin "arch/um/kernel/skas/stub_exe"
> > +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end)
> > +
> > +__FINIT
> > diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
> > index cf44d386f23c..857e3deab293 100644
> > --- a/arch/um/os-Linux/mem.c
> > +++ b/arch/um/os-Linux/mem.c
> > @@ -42,7 +42,7 @@ void kasan_map_memory(void *start, size_t len)
> > }
> >
> > /* Set by make_tempfile() during early boot. */
> > -static char *tempdir = NULL;
> > +char *tempdir = NULL;
> >
> > /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error.
> > */
> > static int __init check_tmpfs(const char *dir)
> > diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
> > Linux/skas/process.c
> > index b6f656bcffb1..2caaa9e85752 100644
> > --- a/arch/um/os-Linux/skas/process.c
> > +++ b/arch/um/os-Linux/skas/process.c
> > @@ -10,8 +10,11 @@
> > #include <sched.h>
> > #include <errno.h>
> > #include <string.h>
> > +#include <fcntl.h>
> > +#include <mem_user.h>
> > #include <sys/mman.h>
> > #include <sys/wait.h>
> > +#include <sys/stat.h>
> > #include <asm/unistd.h>
> > #include <as-layout.h>
> > #include <init.h>
> > @@ -189,69 +192,135 @@ static void handle_trap(int pid, struct
> > uml_pt_regs *regs)
> >
> > extern char __syscall_stub_start[];
> >
> > -/**
> > - * userspace_tramp() - userspace trampoline
> > - * @stack: pointer to the new userspace stack page
> > - *
> > - * The userspace trampoline is used to setup a new userspace
> > process in start_userspace() after it was clone()'ed.
> > - * This function will run on a temporary stack page.
> > - * It ptrace()'es itself, then
> > - * Two pages are mapped into the userspace address space:
> > - * - STUB_CODE (with EXEC), which contains the skas stub code
> > - * - STUB_DATA (with R/W), which contains a data page that is used
> > to transfer certain data between the UML userspace process and the
> > UML kernel.
> > - * Also for the userspace process a SIGSEGV handler is installed
> > to catch pagefaults in the userspace process.
> > - * And last the process stops itself to give control to the UML
> > kernel for this userspace process.
> > - *
> > - * Return: Always zero, otherwise the current userspace process is
> > ended with non null exit() call
> > - */
> > +static int stub_exe_fd;
> > +
> > static int userspace_tramp(void *stack)
> > {
> > - struct sigaction sa;
> > - void *addr;
> > - int fd;
> > + char *const argv[] = { "uml-userspace", NULL };
> > + int pipe_fds[2];
> > unsigned long long offset;
> > - unsigned long segv_handler = STUB_CODE +
> > - (unsigned long)
> > stub_segv_handler -
> > - (unsigned long)
> > __syscall_stub_start;
> > -
> > - ptrace(PTRACE_TRACEME, 0, 0, 0);
> > -
> > - signal(SIGTERM, SIG_DFL);
> > - signal(SIGWINCH, SIG_IGN);
> > -
> > - fd = phys_mapping(uml_to_phys(__syscall_stub_start),
> > &offset);
> > - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
> > - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd,
> > offset);
> > - if (addr == MAP_FAILED) {
> > - os_info("mapping mmap stub at 0x%lx failed, errno =
> > %d\n",
> > - STUB_CODE, errno);
> > - exit(1);
> > + struct stub_init_data init_data = {
> > + .stub_start = STUB_START,
> > + .segv_handler = STUB_CODE +
> > + (unsigned long) stub_segv_handler -
> > + (unsigned long)
> > __syscall_stub_start,
> > + };
> > + struct iomem_region *iomem;
> > + int ret;
> > +
> > + init_data.stub_code_fd =
> > phys_mapping(uml_to_phys(__syscall_stub_start),
> > + &offset);
> > + init_data.stub_code_offset = MMAP_OFFSET(offset);
> > +
> > + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack),
> > &offset);
> > + init_data.stub_data_offset = MMAP_OFFSET(offset);
> > +
> > + /* Set CLOEXEC on all FDs and then unset on all memory
> > related FDs */
> > + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC);
> > +
> > + fcntl(init_data.stub_data_fd, F_SETFD, 0);
> > + for (iomem = iomem_regions; iomem; iomem = iomem->next)
> > + fcntl(iomem->fd, F_SETFD, 0);
> > +
> > + /* Create a pipe for init_data (no CLOEXEC) and dup2 to
> > STDIN */
> > + if (pipe2(pipe_fds, 0))
> > + exit(2);
> > +
> > + close(0);
> > + if (dup2(pipe_fds[0], 0) < 0) {
> > + close(pipe_fds[0]);
> > + close(pipe_fds[1]);
> > + exit(3);
> > }
> > + close(pipe_fds[0]);
> > +
> > + /* Write init_data and close write side */
> > + ret = write(pipe_fds[1], &init_data, sizeof(init_data));
> > + close(pipe_fds[1]);
> > +
> > + if (ret != sizeof(init_data))
> > + exit(4);
> > +
> > + execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH);
> >
> > - fd = phys_mapping(uml_to_phys(stack), &offset);
> > - addr = mmap((void *) STUB_DATA,
> > - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ
> > | PROT_WRITE,
> > - MAP_FIXED | MAP_SHARED, fd, offset);
> > - if (addr == MAP_FAILED) {
> > - os_info("mapping segfault stack at 0x%lx failed,
> > errno = %d\n",
> > - STUB_DATA, errno);
> > - exit(1);
> > + exit(5);
> > +}
> > +
> > +extern char stub_exe_start[];
> > +extern char stub_exe_end[];
> > +
> > +extern char *tempdir;
> > +
> > +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"
> > +
> > +#ifndef MFD_EXEC
> > +#define MFD_EXEC 0x0010U
> > +#endif
> > +
> > +static int __init init_stub_exe_fd(void)
> > +{
> > + size_t written = 0;
> > + char *tmpfile = NULL;
> > +
> > + stub_exe_fd = memfd_create("uml-userspace",
> > + MFD_EXEC | MFD_CLOEXEC |
> > MFD_ALLOW_SEALING);
> > +
> > + if (stub_exe_fd < 0) {
> > + printk(UM_KERN_INFO "Could not create executable
> > memfd, using temporary file!");
> > +
> > + tmpfile = malloc(strlen(tempdir) +
> > + strlen(STUB_EXE_NAME_TEMPLATE) +
> > 1);
> > + if (tmpfile == NULL)
> > + panic("Failed to allocate memory for stub
> > binary name");
> > +
> > + strcpy(tmpfile, tempdir);
> > + strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);
> > +
> > + stub_exe_fd = mkstemp(tmpfile);
> > + if (stub_exe_fd < 0)
> > + panic("Could not create temporary file for
> > stub binary: %d",
> > + -errno);
> > }
> >
> > - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES *
> > UM_KERN_PAGE_SIZE);
> > - sigemptyset(&sa.sa_mask);
> > - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
> > - sa.sa_sigaction = (void *) segv_handler;
> > - sa.sa_restorer = NULL;
> > - if (sigaction(SIGSEGV, &sa, NULL) < 0) {
> > - os_info("%s - setting SIGSEGV handler failed -
> > errno = %d\n",
> > - __func__, errno);
> > - exit(1);
> > + while (written < stub_exe_end - stub_exe_start) {
> > + ssize_t res = write(stub_exe_fd, stub_exe_start +
> > written,
> > + stub_exe_end - stub_exe_start -
> > written);
> > + if (res < 0) {
> > + if (errno == EINTR)
> > + continue;
> > +
> > + if (tmpfile)
> > + unlink(tmpfile);
> > + panic("Failed write stub binary: %d", -
> > errno);
> > + }
> > +
> > + written += res;
> > + }
> > +
> > + if (!tmpfile) {
> > + fcntl(stub_exe_fd, F_ADD_SEALS,
> > + F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW |
> > F_SEAL_SEAL);
> > + } else {
> > + if (fchmod(stub_exe_fd, 00500) < 0) {
> > + unlink(tmpfile);
> > + panic("Could not make stub binary
> > executable: %d",
> > + -errno);
> > + }
> > +
> > + close(stub_exe_fd);
> > + stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC |
> > O_NOFOLLOW);
> > + if (stub_exe_fd < 0) {
> > + unlink(tmpfile);
> > + panic("Could not reopen stub binary: %d", -
> > errno);
> > + }
> > +
> > + unlink(tmpfile);
> > + free(tmpfile);
> > }
> >
> > - kill(os_getpid(), SIGSTOP);
> > return 0;
> > }
> > +__initcall(init_stub_exe_fd);
> >
> > int userspace_pid[NR_CPUS];
> >
> > @@ -270,7 +339,7 @@ int start_userspace(unsigned long stub_stack)
> > {
> > void *stack;
> > unsigned long sp;
> > - int pid, status, n, flags, err;
> > + int pid, status, n, err;
> >
> > /* setup a temporary stack page */
> > stack = mmap(NULL, UM_KERN_PAGE_SIZE,
> > @@ -286,10 +355,10 @@ int start_userspace(unsigned long stub_stack)
> > /* set stack pointer to the end of the stack page, so it
> > can grow downwards */
> > sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
> >
> > - flags = CLONE_FILES | SIGCHLD;
> > -
> > /* clone into new userspace process */
> > - pid = clone(userspace_tramp, (void *) sp, flags, (void *)
> > stub_stack);
> > + pid = clone(userspace_tramp, (void *) sp,
> > + CLONE_VFORK | CLONE_VM | SIGCHLD,
> > + (void *)stub_stack);
> > if (pid < 0) {
> > err = -errno;
> > printk(UM_KERN_ERR "%s : clone failed, errno =
> > %d\n",
> > --
> > 2.46.0
> >
> >
More information about the linux-um
mailing list