[PATCH v5 3/7] um: use execveat on memfd to create userspace MMs
Tiwei Bie
tiwei.btw at antgroup.com
Thu Jun 20 00:20:08 PDT 2024
On 6/19/24 11:34 PM, Benjamin Berg wrote:
> From: Benjamin Berg <benjamin.berg at intel.com>
>
> Using clone will not undo features that have been enabled by libc. An
> example of this already happening is rseq, which could cause the kernel
> to read/write memory of the userspace process. In the future the
> standard library might also use mseal by default to protect itself,
> which would also thwart our attempts at unmapping everything.
>
> Solve all this by taking a step back and doing an execve into a tiny
> static binary that sets up the minimal environment required for the
> stub without using any standard library. That way we have a clean
> execution environment that is fully under the control of UML.
>
> Note that this changes things a bit as the FDs are not anymore shared
> with the kernel. Instead, we explicitly share the FDs for the physical
> memory and all existing iomem regions. Doing this is fine, as iomem
> regions cannot be added at runtime.
This is pretty cool! :)
>
> Signed-off-by: Benjamin Berg <benjamin.berg at intel.com>
> ---
> arch/um/include/shared/skas/stub-data.h | 11 ++
> arch/um/os-Linux/skas/process.c | 145 +++++++++++++++---------
> arch/x86/um/.gitignore | 2 +
> arch/x86/um/Makefile | 32 +++++-
> arch/x86/um/stub_elf.c | 84 ++++++++++++++
> arch/x86/um/stub_elf_embed.S | 11 ++
> 6 files changed, 227 insertions(+), 58 deletions(-)
> create mode 100644 arch/x86/um/.gitignore
> create mode 100644 arch/x86/um/stub_elf.c
> create mode 100644 arch/x86/um/stub_elf_embed.S
>
> diff --git a/arch/um/include/shared/skas/stub-data.h b/arch/um/include/shared/skas/stub-data.h
> index 5e3ade3fb38b..83d210f59956 100644
> --- a/arch/um/include/shared/skas/stub-data.h
> +++ b/arch/um/include/shared/skas/stub-data.h
> @@ -8,6 +8,17 @@
> #ifndef __STUB_DATA_H
> #define __STUB_DATA_H
>
> +struct stub_init_data {
> + unsigned long stub_start;
> +
> + int stub_code_fd;
> + unsigned long stub_code_offset;
> + int stub_data_fd;
> + unsigned long stub_data_offset;
> +
> + unsigned long segv_handler;
> +};
> +
> struct stub_data {
> unsigned long offset;
> int fd;
> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
> index 41a288dcfc34..6cd82ca7a43d 100644
> --- a/arch/um/os-Linux/skas/process.c
> +++ b/arch/um/os-Linux/skas/process.c
> @@ -23,6 +23,8 @@
> #include <skas.h>
> #include <sysdep/stub.h>
> #include <linux/threads.h>
> +#include <fcntl.h>
> +#include <mem_user.h>
> #include "../internal.h"
>
> int is_skas_winch(int pid, int fd, void *data)
> @@ -188,69 +190,100 @@ static void handle_trap(int pid, struct uml_pt_regs *regs)
>
> extern char __syscall_stub_start[];
>
> -/**
> - * userspace_tramp() - userspace trampoline
> - * @stack: pointer to the new userspace stack page
> - *
> - * The userspace trampoline is used to setup a new userspace process in start_userspace() after it was clone()'ed.
> - * This function will run on a temporary stack page.
> - * It ptrace()'es itself, then
> - * Two pages are mapped into the userspace address space:
> - * - STUB_CODE (with EXEC), which contains the skas stub code
> - * - STUB_DATA (with R/W), which contains a data page that is used to transfer certain data between the UML userspace process and the UML kernel.
> - * Also for the userspace process a SIGSEGV handler is installed to catch pagefaults in the userspace process.
> - * And last the process stops itself to give control to the UML kernel for this userspace process.
> - *
> - * Return: Always zero, otherwise the current userspace process is ended with non null exit() call
> - */
> +int userspace_pid[NR_CPUS];
Nit: userspace_pid is already defined in this file.
> +
> +extern char stub_elf_start[];
> +extern char stub_elf_end[];
> +
> +int stub_exec_fd;
stub_exec_fd isn't used outside this file. Might be better to make it static.
> +
> static int userspace_tramp(void *stack)
> {
> - struct sigaction sa;
> - void *addr;
> - int fd;
> + char *const argv[] = { "uml-userspace", NULL };
> + int pipe_fds[2];
> unsigned long long offset;
> - unsigned long segv_handler = STUB_CODE +
> - (unsigned long) stub_segv_handler -
> - (unsigned long) __syscall_stub_start;
> -
> - ptrace(PTRACE_TRACEME, 0, 0, 0);
> -
> - signal(SIGTERM, SIG_DFL);
> - signal(SIGWINCH, SIG_IGN);
> -
> - fd = phys_mapping(uml_to_phys(__syscall_stub_start), &offset);
> - addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
> - PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
> - if (addr == MAP_FAILED) {
> - os_info("mapping mmap stub at 0x%lx failed, errno = %d\n",
> - STUB_CODE, errno);
> - exit(1);
> + struct stub_init_data init_data = {
> + .stub_start = STUB_START,
> + .segv_handler = STUB_CODE +
> + (unsigned long) stub_segv_handler -
> + (unsigned long) __syscall_stub_start,
> + };
> + struct iomem_region *iomem = iomem_regions;
> + int ret;
> +
> + init_data.stub_code_fd = phys_mapping(uml_to_phys(__syscall_stub_start),
> + &offset);
> + init_data.stub_code_offset = MMAP_OFFSET(offset);
> +
> + init_data.stub_data_fd = phys_mapping(uml_to_phys(stack), &offset);
> + init_data.stub_data_offset = MMAP_OFFSET(offset);
> +
> + /* Set CLOEXEC on all FDs and then unset on all memory related FDs */
> + close_range(0, ~0U, CLOSE_RANGE_CLOEXEC);
> +
> + fcntl(init_data.stub_data_fd, F_SETFD, 0);
> + while (iomem) {
> + fcntl(init_data.stub_data_fd, F_SETFD, 0);
Typo: this is supposed to be iomem->fd.
> + iomem = iomem->next;
> }
>
> - fd = phys_mapping(uml_to_phys(stack), &offset);
> - addr = mmap((void *) STUB_DATA,
> - STUB_DATA_PAGES * UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
> - MAP_FIXED | MAP_SHARED, fd, offset);
> - if (addr == MAP_FAILED) {
> - os_info("mapping segfault stack at 0x%lx failed, errno = %d\n",
> - STUB_DATA, errno);
> - exit(1);
> + /* Create a pipe for init_data (no CLOEXEC) and dup2 to STDIN */
> + if (pipe2(pipe_fds, 0))
> + exit(2);
> +
> + close(0);
> + if (dup2(pipe_fds[0], 0) < 0) {
> + close(pipe_fds[0]);
> + close(pipe_fds[1]);
> + exit(3);
> }
> + close(pipe_fds[0]);
> +
> + /* Write init_data and close write side */
> + ret = write(pipe_fds[1], &init_data, sizeof(init_data));
> + close(pipe_fds[1]);
> +
> + if (ret != sizeof(init_data))
> + exit(4);
> +
> + execveat(stub_exec_fd, "", argv, NULL, AT_EMPTY_PATH);
>
> - set_sigstack((void *) STUB_DATA, STUB_DATA_PAGES * UM_KERN_PAGE_SIZE);
> - sigemptyset(&sa.sa_mask);
> - sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
> - sa.sa_sigaction = (void *) segv_handler;
> - sa.sa_restorer = NULL;
> - if (sigaction(SIGSEGV, &sa, NULL) < 0) {
> - os_info("%s - setting SIGSEGV handler failed - errno = %d\n",
> - __func__, errno);
> - exit(1);
> + close(0);
> +
> + exit(4);
Might be better to bump the exit code.
> +}
> +
[...]
> diff --git a/arch/x86/um/stub_elf.c b/arch/x86/um/stub_elf.c
> new file mode 100644
> index 000000000000..b634740a6a15
> --- /dev/null
> +++ b/arch/x86/um/stub_elf.c
> @@ -0,0 +1,84 @@
> +#include <sys/ptrace.h>
> +#include <sys/prctl.h>
> +#include <asm/unistd.h>
> +#include <sysdep/stub.h>
> +#include <stub-data.h>
> +
> +void _start(void);
> +
> +static void real_init(void)
> +{
> + struct stub_init_data init_data;
> + unsigned long res;
> + struct {
> + void *ss_sp;
> + int ss_flags;
> + size_t ss_size;
> + } stack;
> + struct {
> + void *sa_handler_;
> + unsigned long sa_flags;
> + void *sa_restorer;
> + unsigned long sa_mask;
> + } sa = {};
> +
> + /* set a nice name */
> + stub_syscall2(__NR_prctl, PR_SET_NAME, (unsigned long)"uml-userspace");
> +
> + /* read information from STDIN and close it */
> + stub_syscall3(__NR_read, 0,
> + (unsigned long)&init_data, sizeof(init_data));
Might be better to check the number of bytes read.
Regards,
Tiwei
More information about the linux-um
mailing list