[PATCH] um: clean up mm creation

Benjamin Berg benjamin at sipsolutions.net
Fri Sep 22 06:49:08 PDT 2023


On Fri, 2023-09-22 at 13:16 +0200, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg at intel.com>
> 
> While enabling PREEMPT on UML, we found that the call to
> force_flush_all() cannot be done where it is, it sleeps
> while atomic.
> 
> Further investigation shows that all this seems at least
> a bit roundabout and possibly wrong in the first
> place - we copy from the 'current' process and then flush
> when it starts running.
> 
> What we really want is to start fresh with an empty mm
> process, then have it all set up by the kernel (copying
> the original mm contents as needed), and then sync it
> in arch_dup_mmap().
> 
> We should do the same for the LDT, so need to split that
> to be able to do this.
> 
> Note that this fixes what seems like an issue - we look
> at current->mm when we copy, but that doesn't seem right
> in the case of clone() without copying the MM. This is
> probably saved by the forced flush later right now.

Well, not clone(), but rather any execve(). The execve() happens in a
new MM (do_execveat_common -> alloc_bprm -> bprm_mm_init -> mm_alloc)
which is then swapped into the current task by exec_mmap.

And, that explains why the flush was needed. Without it, our userspace
had the parent's memory available unless it was un-/remapped (possibly
even RW with CLONE_VM|CLONE_VFORK). This is obviously wrong and
pointless as we discarded the information anyway.

So, doing a flush in arch_dup_mmap seems much more reasonable to me
(not entirely sure if it is needed). And, I doubt there is a benefit in
optimizing the flush away by somehow cloning the userspace process.

Benjamin


> Signed-off-by: Johannes Berg <johannes.berg at intel.com>
> ---
>  arch/um/include/asm/Kbuild        |   1 -
>  arch/um/include/asm/mm_hooks.h    |  22 ++++++
>  arch/um/include/asm/mmu.h         |   3 +-
>  arch/um/include/asm/mmu_context.h |   2 +-
>  arch/um/include/shared/os.h       |   1 -
>  arch/um/kernel/process.c          |   3 -
>  arch/um/kernel/skas/mmu.c         |  35 ++++++----
>  arch/um/kernel/tlb.c              |   5 +-
>  arch/um/os-Linux/skas/process.c   | 107 ----------------------------
> --
>  arch/x86/um/ldt.c                 |  53 +++++++--------
>  10 files changed, 76 insertions(+), 156 deletions(-)
>  create mode 100644 arch/um/include/asm/mm_hooks.h
> 
> diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
> index b2d834a29f3a..de8d82a6fd7b 100644
> --- a/arch/um/include/asm/Kbuild
> +++ b/arch/um/include/asm/Kbuild
> @@ -26,5 +26,4 @@ generic-y += switch_to.h
>  generic-y += topology.h
>  generic-y += trace_clock.h
>  generic-y += kprobes.h
> -generic-y += mm_hooks.h
>  generic-y += vga.h
> diff --git a/arch/um/include/asm/mm_hooks.h
> b/arch/um/include/asm/mm_hooks.h
> new file mode 100644
> index 000000000000..b1016520c5b8
> --- /dev/null
> +++ b/arch/um/include/asm/mm_hooks.h
> @@ -0,0 +1,22 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_UM_MM_HOOKS_H
> +#define _ASM_UM_MM_HOOKS_H
> +
> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
> +
> +static inline void arch_exit_mmap(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void arch_unmap(struct mm_struct *mm,
> +                       unsigned long start, unsigned long end)
> +{
> +}
> +
> +static inline bool arch_vma_access_permitted(struct vm_area_struct
> *vma,
> +               bool write, bool execute, bool foreign)
> +{
> +       /* by default, allow everything */
> +       return true;
> +}
> +#endif /* _ASM_UM_MM_HOOKS_H */
> diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
> index 5b072aba5b65..68a710d23b5d 100644
> --- a/arch/um/include/asm/mmu.h
> +++ b/arch/um/include/asm/mmu.h
> @@ -18,7 +18,8 @@ typedef struct mm_context {
>  extern void __switch_mm(struct mm_id * mm_idp);
>  
>  /* Avoid tangled inclusion with asm/ldt.h */
> -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context
> *from_mm);
> +extern int init_new_ldt(struct mm_context *to_mm);
> +extern int copy_ldt(struct mm_context *to_mm, struct mm_context
> *from_mm);
>  extern void free_ldt(struct mm_context *mm);
>  
>  #endif
> diff --git a/arch/um/include/asm/mmu_context.h
> b/arch/um/include/asm/mmu_context.h
> index 68e2eb9cfb47..8668861d4a85 100644
> --- a/arch/um/include/asm/mmu_context.h
> +++ b/arch/um/include/asm/mmu_context.h
> @@ -13,7 +13,7 @@
>  #include <asm/mm_hooks.h>
>  #include <asm/mmu.h>
>  
> -extern void force_flush_all(void);
> +void force_flush_all(struct mm_struct *mm);
>  
>  #define activate_mm activate_mm
>  static inline void activate_mm(struct mm_struct *old, struct
> mm_struct *new)
> diff --git a/arch/um/include/shared/os.h
> b/arch/um/include/shared/os.h
> index 1a82c6548dd5..c9acc28fe47c 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp,
> unsigned long addr,
>  /* skas/process.c */
>  extern int is_skas_winch(int pid, int fd, void *data);
>  extern int start_userspace(unsigned long stub_stack);
> -extern int copy_context_skas0(unsigned long stack, int pid);
>  extern void userspace(struct uml_pt_regs *regs, unsigned long
> *aux_fp_regs);
>  extern void new_thread(void *stack, jmp_buf *buf, void
> (*handler)(void));
>  extern void switch_threads(jmp_buf *me, jmp_buf *you);
> diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
> index 6daffb9d8a8d..a024acd6d85c 100644
> --- a/arch/um/kernel/process.c
> +++ b/arch/um/kernel/process.c
> @@ -25,7 +25,6 @@
>  #include <linux/threads.h>
>  #include <linux/resume_user_mode.h>
>  #include <asm/current.h>
> -#include <asm/mmu_context.h>
>  #include <linux/uaccess.h>
>  #include <as-layout.h>
>  #include <kern_util.h>
> @@ -139,8 +138,6 @@ void new_thread_handler(void)
>  /* Called magically, see new_thread_handler above */
>  void fork_handler(void)
>  {
> -       force_flush_all();
> -
>         schedule_tail(current->thread.prev_sched);
>  
>         /*
> diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
> index 656fe16c9b63..ac4ca203ac24 100644
> --- a/arch/um/kernel/skas/mmu.c
> +++ b/arch/um/kernel/skas/mmu.c
> @@ -10,13 +10,13 @@
>  
>  #include <asm/pgalloc.h>
>  #include <asm/sections.h>
> +#include <asm/mmu_context.h>
>  #include <as-layout.h>
>  #include <os.h>
>  #include <skas.h>
>  
>  int init_new_context(struct task_struct *task, struct mm_struct *mm)
>  {
> -       struct mm_context *from_mm = NULL;
>         struct mm_context *to_mm = &mm->context;
>         unsigned long stack = 0;
>         int ret = -ENOMEM;
> @@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
>                 goto out;
>  
>         to_mm->id.stack = stack;
> -       if (current->mm != NULL && current->mm != &init_mm)
> -               from_mm = &current->mm->context;
>  
> +       /*
> +        * Allocate a completely fresh mm. We'll sync the mappings once
> +        * the rest of the kernel is done, in arch_dup_mmap().
> +        */
>         block_signals_trace();
> -       if (from_mm)
> -               to_mm->id.u.pid = copy_context_skas0(stack,
> -                                                    from_mm-
> >id.u.pid);
> -       else to_mm->id.u.pid = start_userspace(stack);
> +       to_mm->id.u.pid = start_userspace(stack);
>         unblock_signals_trace();
>  
>         if (to_mm->id.u.pid < 0) {
> @@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
>                 goto out_free;
>         }
>  
> -       ret = init_new_ldt(to_mm, from_mm);
> -       if (ret < 0) {
> -               printk(KERN_ERR "init_new_context_skas - init_ldt"
> -                      " failed, errno = %d\n", ret);
> +       ret = init_new_ldt(to_mm);
> +       if (ret)
>                 goto out_free;
> -       }
>  
>         return 0;
>  
> @@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
>         return ret;
>  }
>  
> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> +       int ret = copy_ldt(&mm->context, &oldmm->context);
> +
> +       if (ret < 0) {
> +               printk(KERN_ERR "%s - copy_ldt failed, errno = %d\n",
> +                      __func__, ret);
> +               return ret;
> +       }
> +
> +       force_flush_all(mm);
> +       return 0;
> +}
> +
> +
>  void destroy_context(struct mm_struct *mm)
>  {
>         struct mm_context *mmu = &mm->context;
> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
> index 34ec8e677fb9..7c0161321fd9 100644
> --- a/arch/um/kernel/tlb.c
> +++ b/arch/um/kernel/tlb.c
> @@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm)
>                 fix_range(mm, vma->vm_start, vma->vm_end, 0);
>  }
>  
> -void force_flush_all(void)
> +void force_flush_all(struct mm_struct *mm)
>  {
> -       struct mm_struct *mm = current->mm;
>         struct vm_area_struct *vma;
>         VMA_ITERATOR(vmi, mm, 0);
>  
> -       mmap_read_lock(mm);
>         for_each_vma(vmi, vma)
>                 fix_range(mm, vma->vm_start, vma->vm_end, 1);
> -       mmap_read_unlock(mm);
>  }
> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
> Linux/skas/process.c
> index f92129bbf981..403f4c6082b6 100644
> --- a/arch/um/os-Linux/skas/process.c
> +++ b/arch/um/os-Linux/skas/process.c
> @@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs,
> unsigned long *aux_fp_regs)
>         }
>  }
>  
> -static unsigned long thread_regs[MAX_REG_NR];
> -static unsigned long thread_fp_regs[FP_SIZE];
> -
> -static int __init init_thread_regs(void)
> -{
> -       get_safe_registers(thread_regs, thread_fp_regs);
> -       /* Set parent's instruction pointer to start of clone-stub */
> -       thread_regs[REGS_IP_INDEX] = STUB_CODE +
> -                               (unsigned long) stub_clone_handler -
> -                               (unsigned long) __syscall_stub_start;
> -       thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES *
> UM_KERN_PAGE_SIZE -
> -               sizeof(void *);
> -#ifdef __SIGNAL_FRAMESIZE
> -       thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
> -#endif
> -       return 0;
> -}
> -
> -__initcall(init_thread_regs);
> -
> -int copy_context_skas0(unsigned long new_stack, int pid)
> -{
> -       int err;
> -       unsigned long current_stack = current_stub_stack();
> -       struct stub_data *data = (struct stub_data *) current_stack;
> -       struct stub_data *child_data = (struct stub_data *)
> new_stack;
> -       unsigned long long new_offset;
> -       int new_fd = phys_mapping(uml_to_phys((void *)new_stack),
> &new_offset);
> -
> -       /*
> -        * prepare offset and fd of child's stack as argument for
> parent's
> -        * and child's mmap2 calls
> -        */
> -       *data = ((struct stub_data) {
> -               .offset = MMAP_OFFSET(new_offset),
> -               .fd     = new_fd,
> -               .parent_err = -ESRCH,
> -               .child_err = 0,
> -       });
> -
> -       *child_data = ((struct stub_data) {
> -               .child_err = -ESRCH,
> -       });
> -
> -       err = ptrace_setregs(pid, thread_regs);
> -       if (err < 0) {
> -               err = -errno;
> -               printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid =
> %d, errno = %d\n",
> -                     __func__, pid, -err);
> -               return err;
> -       }
> -
> -       err = put_fp_registers(pid, thread_fp_regs);
> -       if (err < 0) {
> -               printk(UM_KERN_ERR "%s : put_fp_registers failed, pid
> = %d, err = %d\n",
> -                      __func__, pid, err);
> -               return err;
> -       }
> -
> -       /*
> -        * Wait, until parent has finished its work: read child's pid
> from
> -        * parent's stack, and check, if bad result.
> -        */
> -       err = ptrace(PTRACE_CONT, pid, 0, 0);
> -       if (err) {
> -               err = -errno;
> -               printk(UM_KERN_ERR "Failed to continue new process,
> pid = %d, errno = %d\n",
> -                      pid, errno);
> -               return err;
> -       }
> -
> -       wait_stub_done(pid);
> -
> -       pid = data->parent_err;
> -       if (pid < 0) {
> -               printk(UM_KERN_ERR "%s - stub-parent reports error
> %d\n",
> -                     __func__, -pid);
> -               return pid;
> -       }
> -
> -       /*
> -        * Wait, until child has finished too: read child's result
> from
> -        * child's stack and check it.
> -        */
> -       wait_stub_done(pid);
> -       if (child_data->child_err != STUB_DATA) {
> -               printk(UM_KERN_ERR "%s - stub-child %d reports error
> %ld\n",
> -                      __func__, pid, data->child_err);
> -               err = data->child_err;
> -               goto out_kill;
> -       }
> -
> -       if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
> -                  (void *)PTRACE_O_TRACESYSGOOD) < 0) {
> -               err = -errno;
> -               printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed,
> errno = %d\n",
> -                      __func__, errno);
> -               goto out_kill;
> -       }
> -
> -       return pid;
> -
> - out_kill:
> -       os_kill_ptraced_process(pid, 1);
> -       return err;
> -}
> -
>  void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
>  {
>         (*buf)[0].JB_IP = (unsigned long) handler;
> diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
> index 255a44dd415a..609feaeff23b 100644
> --- a/arch/x86/um/ldt.c
> +++ b/arch/x86/um/ldt.c
> @@ -297,36 +297,37 @@ static void ldt_get_host_info(void)
>         free_pages((unsigned long)ldt, order);
>  }
>  
> -long init_new_ldt(struct mm_context *new_mm, struct mm_context
> *from_mm)
> +int init_new_ldt(struct mm_context *new_mm)
>  {
> -       struct user_desc desc;
> +       struct user_desc desc = {};
>         short * num_p;
> -       int i;
> -       long page, err=0;
> +       int err = 0;
>         void *addr = NULL;
>  
> -
>         mutex_init(&new_mm->arch.ldt.lock);
>  
> -       if (!from_mm) {
> -               memset(&desc, 0, sizeof(desc));
> -               /*
> -                * Now we try to retrieve info about the ldt, we
> -                * inherited from the host. All ldt-entries found
> -                * will be reset in the following loop
> -                */
> -               ldt_get_host_info();
> -               for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
> -                       desc.entry_number = *num_p;
> -                       err = write_ldt_entry(&new_mm->id, 1, &desc,
> -                                             &addr, *(num_p + 1) ==
> -1);
> -                       if (err)
> -                               break;
> -               }
> -               new_mm->arch.ldt.entry_count = 0;
> -
> -               goto out;
> +       memset(&desc, 0, sizeof(desc));
> +       /*
> +        * Now we try to retrieve info about the ldt, we
> +        * inherited from the host. All ldt-entries found
> +        * will be reset in the following loop
> +        */
> +       ldt_get_host_info();
> +       for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
> +               desc.entry_number = *num_p;
> +               err = write_ldt_entry(&new_mm->id, 1, &desc,
> +                                     &addr, *(num_p + 1) == -1);
> +               if (err)
> +                       break;
>         }
> +       new_mm->arch.ldt.entry_count = 0;
> +
> +       return err;
> +}
> +
> +int copy_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
> +{
> +       int err = 0;
>  
>         /*
>          * Our local LDT is used to supply the data for
> @@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm,
> struct mm_context *from_mm)
>                 memcpy(new_mm->arch.ldt.u.entries, from_mm-
> >arch.ldt.u.entries,
>                        sizeof(new_mm->arch.ldt.u.entries));
>         else {
> -               i = from_mm->arch.ldt.entry_count /
> LDT_ENTRIES_PER_PAGE;
> +               int i = from_mm->arch.ldt.entry_count /
> LDT_ENTRIES_PER_PAGE;
> +               unsigned long page;
> +
>                 while (i-->0) {
>                         page =
> __get_free_page(GFP_KERNEL|__GFP_ZERO);
>                         if (!page) {
> @@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm,
> struct mm_context *from_mm)
>         new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
>         mutex_unlock(&from_mm->arch.ldt.lock);
>  
> -    out:
>         return err;
>  }
>  
> -
>  void free_ldt(struct mm_context *mm)
>  {
>         int i;



More information about the linux-um mailing list