[PATCH v9 02/10] um: use execveat to create userspace MMs

Benjamin Berg benjamin at sipsolutions.net
Thu Oct 17 02:01:39 PDT 2024


Hi,

On Thu, 2024-10-17 at 15:17 +0800, David Gow wrote:
> On Thu, 19 Sept 2024 at 20:45, Benjamin Berg <benjamin at sipsolutions.net> wrote:
> > 
> > [SNIP]
> 
> It turns out that this breaks the KUnit user alloc helpers on x86_64,
> at least on my machine.
> 
> This can be reproduced with:
> ./tools/testing/kunit/kunit.py run usercopy
> 
> Though the 32-bit version works:
> ./tools/testing/kunit/kunit.py run usercopy --kconfig_add CONFIG_64BIT=n
> 
> The error we're getting is:
> start_userspace : expected SIGSTOP, got status = 139
> Could not create userspace mm
> 
> This basically is the result of the stub_exe segfaulting very early on
> in its execution.
> 
> It seems that this is due to the stack being misaligned, and so the
> generated SSE instructions are faulting. The workarounds I've tested
> here include:
> a) Build the stub with -mno-sse
> b) Decorate real_init() with __attribute__((force_align_arg_pointer))
> c) Decorate __start() with __attribute__((naked))
> 
> The last one seems to validate my theory as to why this is occurring:
> __start's prologue is misaligning the stack, as __start is not
> actually _called_ from anything, so there's no 8-byte misalignment to
> hold the return address.
> 
> If this makes sense, I'll send a patch out with whichever the
> preferred fix(es) are. My guess is that (c) is the "proper" fix,
> though I'd not _miss_ SSE if we chose to disable it for the handful of
> instructions here anyway.

Yeah, SSE is pointless here, but we probably should ensure the
alignment is as expected.

So, c) seems like the right thing to do here. I am just a bit confused
about the alignment guarantees from the kernel. Are we correct to
assume a 16 byte alignment there or do we also need b) just to be on
the safe side?

Unfortunately I have not managed to reproduce the issue.

Benjamin

> 
> Cheers,
> -- David
> 
> >  arch/um/Makefile                        |   3 +-
> >  arch/um/include/shared/skas/stub-data.h |  11 ++
> >  arch/um/kernel/skas/.gitignore          |   2 +
> >  arch/um/kernel/skas/Makefile            |  33 ++++-
> >  arch/um/kernel/skas/stub_exe.c          |  88 ++++++++++++
> >  arch/um/kernel/skas/stub_exe_embed.S    |  11 ++
> >  arch/um/os-Linux/mem.c                  |   2 +-
> >  arch/um/os-Linux/skas/process.c         | 181 ++++++++++++++++----
> > ----
> >  8 files changed, 271 insertions(+), 60 deletions(-)
> >  create mode 100644 arch/um/kernel/skas/.gitignore
> >  create mode 100644 arch/um/kernel/skas/stub_exe.c
> >  create mode 100644 arch/um/kernel/skas/stub_exe_embed.S
> > 
> > diff --git a/arch/um/Makefile b/arch/um/Makefile
> > index 00b63bac5eff..31e367e8ab4d 100644
> > --- a/arch/um/Makefile
> > +++ b/arch/um/Makefile
> > @@ -61,7 +61,8 @@ KBUILD_CFLAGS += $(CFLAGS) $(CFLAGS-y) -
> > D__arch_um__ \
> >         $(ARCH_INCLUDE) $(MODE_INCLUDE) -Dvmap=kernel_vmap      \
> >         -Dlongjmp=kernel_longjmp -Dsetjmp=kernel_setjmp \
> >         -Din6addr_loopback=kernel_in6addr_loopback \
> > -       -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr
> > +       -Din6addr_any=kernel_in6addr_any -Dstrrchr=kernel_strrchr \
> > +       -D__close_range=kernel__close_range
> > 
> >  KBUILD_RUSTFLAGS += -Crelocation-model=pie
> > 
> > diff --git a/arch/um/include/shared/skas/stub-data.h
> > b/arch/um/include/shared/skas/stub-data.h
> > index 2b6b44759dfa..3fbdda727373 100644
> > --- a/arch/um/include/shared/skas/stub-data.h
> > +++ b/arch/um/include/shared/skas/stub-data.h
> > @@ -12,6 +12,17 @@
> >  #include <as-layout.h>
> >  #include <sysdep/tls.h>
> > 
> > +struct stub_init_data {
> > +       unsigned long stub_start;
> > +
> > +       int stub_code_fd;
> > +       unsigned long stub_code_offset;
> > +       int stub_data_fd;
> > +       unsigned long stub_data_offset;
> > +
> > +       unsigned long segv_handler;
> > +};
> > +
> >  #define STUB_NEXT_SYSCALL(s) \
> >         ((struct stub_syscall *) (((unsigned long) s) + (s)-
> > >cmd_len))
> > 
> > diff --git a/arch/um/kernel/skas/.gitignore
> > b/arch/um/kernel/skas/.gitignore
> > new file mode 100644
> > index 000000000000..c3409ced0f38
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/.gitignore
> > @@ -0,0 +1,2 @@
> > +stub_exe
> > +stub_exe.dbg
> > diff --git a/arch/um/kernel/skas/Makefile
> > b/arch/um/kernel/skas/Makefile
> > index 6f86d53e3d69..fbb61968055f 100644
> > --- a/arch/um/kernel/skas/Makefile
> > +++ b/arch/um/kernel/skas/Makefile
> > @@ -3,14 +3,43 @@
> >  # Copyright (C) 2002 - 2007 Jeff Dike
> > (jdike@{addtoit,linux.intel}.com)
> >  #
> > 
> > -obj-y := stub.o mmu.o process.o syscall.o uaccess.o
> > +obj-y := stub.o mmu.o process.o syscall.o uaccess.o \
> > +        stub_exe_embed.o
> > +
> > +# Stub executable
> > +
> > +stub_exe_objs-y := stub_exe.o
> > +
> > +stub_exe_objs := $(foreach F,$(stub_exe_objs-y),$(obj)/$F)
> > +
> > +# Object file containing the ELF executable
> > +$(obj)/stub_exe_embed.o: $(src)/stub_exe_embed.S $(obj)/stub_exe
> > +
> > +$(obj)/stub_exe.dbg: $(stub_exe_objs) FORCE
> > +       $(call if_changed,stub_exe)
> > +
> > +$(obj)/stub_exe: OBJCOPYFLAGS := -S
> > +$(obj)/stub_exe: $(obj)/stub_exe.dbg FORCE
> > +       $(call if_changed,objcopy)
> > +
> > +quiet_cmd_stub_exe = STUB_EXE $@
> > +      cmd_stub_exe = $(CC) -nostdlib -o $@ \
> > +                          $(KBUILD_CFLAGS) $(STUB_EXE_LDFLAGS) \
> > +                          $(filter %.o,$^)
> > +
> > +STUB_EXE_LDFLAGS = -n -static
> > +
> > +targets += stub_exe.dbg stub_exe $(stub_exe_objs-y)
> > +
> > +# end
> > 
> >  # stub.o is in the stub, so it can't be built with profiling
> >  # GCC hardened also auto-enables -fpic, but we need %ebx so it
> > can't work ->
> >  # disable it
> > 
> >  CFLAGS_stub.o := $(CFLAGS_NO_HARDENING)
> > -UNPROFILE_OBJS := stub.o
> > +CFLAGS_stub_exe.o := $(CFLAGS_NO_HARDENING)
> > +UNPROFILE_OBJS := stub.o stub_exe.o
> >  KCOV_INSTRUMENT := n
> > 
> >  include $(srctree)/arch/um/scripts/Makefile.rules
> > diff --git a/arch/um/kernel/skas/stub_exe.c
> > b/arch/um/kernel/skas/stub_exe.c
> > new file mode 100644
> > index 000000000000..bc6ba2e4d805
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/stub_exe.c
> > @@ -0,0 +1,88 @@
> > +#include <sys/ptrace.h>
> > +#include <sys/prctl.h>
> > +#include <asm/unistd.h>
> > +#include <sysdep/stub.h>
> > +#include <stub-data.h>
> > +
> > +void _start(void);
> > +
> > +noinline static void real_init(void)
> > +{
> > +       struct stub_init_data init_data;
> > +       unsigned long res;
> > +       struct {
> > +               void  *ss_sp;
> > +               int    ss_flags;
> > +               size_t ss_size;
> > +       } stack = {
> > +               .ss_size = STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
> > +       };
> > +       struct {
> > +               void *sa_handler_;
> > +               unsigned long sa_flags;
> > +               void *sa_restorer;
> > +               unsigned long long sa_mask;
> > +       } sa = {
> > +               /* Need to set SA_RESTORER (but the handler never
> > returns) */
> > +               .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO |
> > 0x04000000,
> > +               /* no need to mask any signals */
> > +               .sa_mask = 0,
> > +       };
> > +
> > +       /* set a nice name */
> > +       stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-
> > userspace");
> > +
> > +       /* read information from STDIN and close it */
> > +       res = stub_syscall3(__NR_read, 0,
> > +                           (unsigned long)&init_data,
> > sizeof(init_data));
> > +       if (res != sizeof(init_data))
> > +               stub_syscall1(__NR_exit, 10);
> > +
> > +       stub_syscall1(__NR_close, 0);
> > +
> > +       /* map stub code + data */
> > +       res = stub_syscall6(STUB_MMAP_NR,
> > +                           init_data.stub_start,
> > UM_KERN_PAGE_SIZE,
> > +                           PROT_READ | PROT_EXEC, MAP_FIXED |
> > MAP_SHARED,
> > +                           init_data.stub_code_fd,
> > init_data.stub_code_offset);
> > +       if (res != init_data.stub_start)
> > +               stub_syscall1(__NR_exit, 11);
> > +
> > +       res = stub_syscall6(STUB_MMAP_NR,
> > +                           init_data.stub_start +
> > UM_KERN_PAGE_SIZE,
> > +                           STUB_DATA_PAGES * UM_KERN_PAGE_SIZE,
> > +                           PROT_READ | PROT_WRITE, MAP_FIXED |
> > MAP_SHARED,
> > +                           init_data.stub_data_fd,
> > init_data.stub_data_offset);
> > +       if (res != init_data.stub_start + UM_KERN_PAGE_SIZE)
> > +               stub_syscall1(__NR_exit, 12);
> > +
> > +       /* setup signal stack inside stub data */
> > +       stack.ss_sp = (void *)init_data.stub_start +
> > UM_KERN_PAGE_SIZE;
> > +       stub_syscall2(__NR_sigaltstack, (unsigned long)&stack, 0);
> > +
> > +       /* register SIGSEGV handler */
> > +       sa.sa_handler_ = (void *) init_data.segv_handler;
> > +       res = stub_syscall4(__NR_rt_sigaction, SIGSEGV, (unsigned
> > long)&sa, 0,
> > +                           sizeof(sa.sa_mask));
> > +       if (res != 0)
> > +               stub_syscall1(__NR_exit, 13);
> > +
> > +       stub_syscall4(__NR_ptrace, PTRACE_TRACEME, 0, 0, 0);
> > +
> > +       stub_syscall2(__NR_kill, stub_syscall0(__NR_getpid),
> > SIGSTOP);
> > +
> > +       stub_syscall1(__NR_exit, 14);
> > +
> > +       __builtin_unreachable();
> > +}
> > +
> > +void _start(void)
> > +{
> > +       char *alloc;
> > +
> > +       /* Make enough space for the stub (including space for
> > alignment) */
> > +       alloc = __builtin_alloca((1 + 2 * STUB_DATA_PAGES - 1) *
> > UM_KERN_PAGE_SIZE);
> > +       asm volatile("" : "+r,m"(alloc) : : "memory");
> > +
> > +       real_init();
> > +}
> > diff --git a/arch/um/kernel/skas/stub_exe_embed.S
> > b/arch/um/kernel/skas/stub_exe_embed.S
> > new file mode 100644
> > index 000000000000..6d8914fbe8f1
> > --- /dev/null
> > +++ b/arch/um/kernel/skas/stub_exe_embed.S
> > @@ -0,0 +1,11 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#include <linux/init.h>
> > +#include <linux/linkage.h>
> > +
> > +__INITDATA
> > +
> > +SYM_DATA_START(stub_exe_start)
> > +       .incbin "arch/um/kernel/skas/stub_exe"
> > +SYM_DATA_END_LABEL(stub_exe_start, SYM_L_GLOBAL, stub_exe_end)
> > +
> > +__FINIT
> > diff --git a/arch/um/os-Linux/mem.c b/arch/um/os-Linux/mem.c
> > index cf44d386f23c..857e3deab293 100644
> > --- a/arch/um/os-Linux/mem.c
> > +++ b/arch/um/os-Linux/mem.c
> > @@ -42,7 +42,7 @@ void kasan_map_memory(void *start, size_t len)
> >  }
> > 
> >  /* Set by make_tempfile() during early boot. */
> > -static char *tempdir = NULL;
> > +char *tempdir = NULL;
> > 
> >  /* Check if dir is on tmpfs. Return 0 if yes, -1 if no or error.
> > */
> >  static int __init check_tmpfs(const char *dir)
> > diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
> > Linux/skas/process.c
> > index b6f656bcffb1..2caaa9e85752 100644
> > --- a/arch/um/os-Linux/skas/process.c
> > +++ b/arch/um/os-Linux/skas/process.c
> > @@ -10,8 +10,11 @@
> >  #include <sched.h>
> >  #include <errno.h>
> >  #include <string.h>
> > +#include <fcntl.h>
> > +#include <mem_user.h>
> >  #include <sys/mman.h>
> >  #include <sys/wait.h>
> > +#include <sys/stat.h>
> >  #include <asm/unistd.h>
> >  #include <as-layout.h>
> >  #include <init.h>
> > @@ -189,69 +192,135 @@ static void handle_trap(int pid, struct
> > uml_pt_regs *regs)
> > 
> >  extern char __syscall_stub_start[];
> > 
> > -/**
> > - * userspace_tramp() - userspace trampoline
> > - * @stack:     pointer to the new userspace stack page
> > - *
> > - * The userspace trampoline is used to setup a new userspace
> > process in start_userspace() after it was clone()'ed.
> > - * This function will run on a temporary stack page.
> > - * It ptrace()'es itself, then
> > - * Two pages are mapped into the userspace address space:
> > - * - STUB_CODE (with EXEC), which contains the skas stub code
> > - * - STUB_DATA (with R/W), which contains a data page that is used
> > to transfer certain data between the UML userspace process and the
> > UML kernel.
> > - * Also for the userspace process a SIGSEGV handler is installed
> > to catch pagefaults in the userspace process.
> > - * And last the process stops itself to give control to the UML
> > kernel for this userspace process.
> > - *
> > - * Return: Always zero, otherwise the current userspace process is
> > ended with non null exit() call
> > - */
> > +static int stub_exe_fd;
> > +
> >  static int userspace_tramp(void *stack)
> >  {
> > -       struct sigaction sa;
> > -       void *addr;
> > -       int fd;
> > +       char *const argv[] = { "uml-userspace", NULL };
> > +       int pipe_fds[2];
> >         unsigned long long offset;
> > -       unsigned long segv_handler = STUB_CODE +
> > -                                    (unsigned long)
> > stub_segv_handler -
> > -                                    (unsigned long)
> > __syscall_stub_start;
> > -
> > -       ptrace(PTRACE_TRACEME, 0, 0, 0);
> > -
> > -       signal(SIGTERM, SIG_DFL);
> > -       signal(SIGWINCH, SIG_IGN);
> > -
> > -       fd = phys_mapping(uml_to_phys(__syscall_stub_start),
> > &offset);
> > -       addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
> > -                     PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd,
> > offset);
> > -       if (addr == MAP_FAILED) {
> > -               os_info("mapping mmap stub at 0x%lx failed, errno =
> > %d\n",
> > -                       STUB_CODE, errno);
> > -               exit(1);
> > +       struct stub_init_data init_data = {
> > +               .stub_start = STUB_START,
> > +               .segv_handler = STUB_CODE +
> > +                               (unsigned long) stub_segv_handler -
> > +                               (unsigned long)
> > __syscall_stub_start,
> > +       };
> > +       struct iomem_region *iomem;
> > +       int ret;
> > +
> > +       init_data.stub_code_fd =
> > phys_mapping(uml_to_phys(__syscall_stub_start),
> > +                                             &offset);
> > +       init_data.stub_code_offset = MMAP_OFFSET(offset);
> > +
> > +       init_data.stub_data_fd = phys_mapping(uml_to_phys(stack),
> > &offset);
> > +       init_data.stub_data_offset = MMAP_OFFSET(offset);
> > +
> > +       /* Set CLOEXEC on all FDs and then unset on all memory
> > related FDs */
> > +       close_range(0, ~0U, CLOSE_RANGE_CLOEXEC);
> > +
> > +       fcntl(init_data.stub_data_fd, F_SETFD, 0);
> > +       for (iomem = iomem_regions; iomem; iomem = iomem->next)
> > +               fcntl(iomem->fd, F_SETFD, 0);
> > +
> > +       /* Create a pipe for init_data (no CLOEXEC) and dup2 to
> > STDIN */
> > +       if (pipe2(pipe_fds, 0))
> > +               exit(2);
> > +
> > +       close(0);
> > +       if (dup2(pipe_fds[0], 0) < 0) {
> > +               close(pipe_fds[0]);
> > +               close(pipe_fds[1]);
> > +               exit(3);
> >         }
> > +       close(pipe_fds[0]);
> > +
> > +       /* Write init_data and close write side */
> > +       ret = write(pipe_fds[1], &init_data, sizeof(init_data));
> > +       close(pipe_fds[1]);
> > +
> > +       if (ret != sizeof(init_data))
> > +               exit(4);
> > +
> > +       execveat(stub_exe_fd, "", argv, NULL, AT_EMPTY_PATH);
> > 
> > -       fd = phys_mapping(uml_to_phys(stack), &offset);
> > -       addr = mmap((void *) STUB_DATA,
> > -                   STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ
> > | PROT_WRITE,
> > -                   MAP_FIXED | MAP_SHARED, fd, offset);
> > -       if (addr == MAP_FAILED) {
> > -               os_info("mapping segfault stack at 0x%lx failed,
> > errno = %d\n",
> > -                       STUB_DATA, errno);
> > -               exit(1);
> > +       exit(5);
> > +}
> > +
> > +extern char stub_exe_start[];
> > +extern char stub_exe_end[];
> > +
> > +extern char *tempdir;
> > +
> > +#define STUB_EXE_NAME_TEMPLATE "/uml-userspace-XXXXXX"
> > +
> > +#ifndef MFD_EXEC
> > +#define MFD_EXEC 0x0010U
> > +#endif
> > +
> > +static int __init init_stub_exe_fd(void)
> > +{
> > +       size_t written = 0;
> > +       char *tmpfile = NULL;
> > +
> > +       stub_exe_fd = memfd_create("uml-userspace",
> > +                                  MFD_EXEC | MFD_CLOEXEC |
> > MFD_ALLOW_SEALING);
> > +
> > +       if (stub_exe_fd < 0) {
> > +               printk(UM_KERN_INFO "Could not create executable
> > memfd, using temporary file!");
> > +
> > +               tmpfile = malloc(strlen(tempdir) +
> > +                                 strlen(STUB_EXE_NAME_TEMPLATE) +
> > 1);
> > +               if (tmpfile == NULL)
> > +                       panic("Failed to allocate memory for stub
> > binary name");
> > +
> > +               strcpy(tmpfile, tempdir);
> > +               strcat(tmpfile, STUB_EXE_NAME_TEMPLATE);
> > +
> > +               stub_exe_fd = mkstemp(tmpfile);
> > +               if (stub_exe_fd < 0)
> > +                       panic("Could not create temporary file for
> > stub binary: %d",
> > +                             -errno);
> >         }
> > 
> > -       set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES *
> > UM_KERN_PAGE_SIZE);
> > -       sigemptyset(&sa.sa_mask);
> > -       sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
> > -       sa.sa_sigaction = (void *) segv_handler;
> > -       sa.sa_restorer = NULL;
> > -       if (sigaction(SIGSEGV, &sa, NULL) < 0) {
> > -               os_info("%s - setting SIGSEGV handler failed -
> > errno = %d\n",
> > -                       __func__, errno);
> > -               exit(1);
> > +       while (written < stub_exe_end - stub_exe_start) {
> > +               ssize_t res = write(stub_exe_fd, stub_exe_start +
> > written,
> > +                                   stub_exe_end - stub_exe_start -
> > written);
> > +               if (res < 0) {
> > +                       if (errno == EINTR)
> > +                               continue;
> > +
> > +                       if (tmpfile)
> > +                               unlink(tmpfile);
> > +                       panic("Failed write stub binary: %d", -
> > errno);
> > +               }
> > +
> > +               written += res;
> > +       }
> > +
> > +       if (!tmpfile) {
> > +               fcntl(stub_exe_fd, F_ADD_SEALS,
> > +                     F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_GROW |
> > F_SEAL_SEAL);
> > +       } else {
> > +               if (fchmod(stub_exe_fd, 00500) < 0) {
> > +                       unlink(tmpfile);
> > +                       panic("Could not make stub binary
> > executable: %d",
> > +                             -errno);
> > +               }
> > +
> > +               close(stub_exe_fd);
> > +               stub_exe_fd = open(tmpfile, O_RDONLY | O_CLOEXEC |
> > O_NOFOLLOW);
> > +               if (stub_exe_fd < 0) {
> > +                       unlink(tmpfile);
> > +                       panic("Could not reopen stub binary: %d", -
> > errno);
> > +               }
> > +
> > +               unlink(tmpfile);
> > +               free(tmpfile);
> >         }
> > 
> > -       kill(os_getpid(), SIGSTOP);
> >         return 0;
> >  }
> > +__initcall(init_stub_exe_fd);
> > 
> >  int userspace_pid[NR_CPUS];
> > 
> > @@ -270,7 +339,7 @@ int start_userspace(unsigned long stub_stack)
> >  {
> >         void *stack;
> >         unsigned long sp;
> > -       int pid, status, n, flags, err;
> > +       int pid, status, n, err;
> > 
> >         /* setup a temporary stack page */
> >         stack = mmap(NULL, UM_KERN_PAGE_SIZE,
> > @@ -286,10 +355,10 @@ int start_userspace(unsigned long stub_stack)
> >         /* set stack pointer to the end of the stack page, so it
> > can grow downwards */
> >         sp = (unsigned long)stack + UM_KERN_PAGE_SIZE;
> > 
> > -       flags = CLONE_FILES | SIGCHLD;
> > -
> >         /* clone into new userspace process */
> > -       pid = clone(userspace_tramp, (void *) sp, flags, (void *)
> > stub_stack);
> > +       pid = clone(userspace_tramp, (void *) sp,
> > +                   CLONE_VFORK | CLONE_VM | SIGCHLD,
> > +                   (void *)stub_stack);
> >         if (pid < 0) {
> >                 err = -errno;
> >                 printk(UM_KERN_ERR "%s : clone failed, errno =
> > %d\n",
> > --
> > 2.46.0
> > 
> > 



More information about the linux-um mailing list