[PATCH] um: clean up mm creation

Anton Ivanov anton.ivanov at cambridgegreys.com
Fri Sep 22 08:49:12 PDT 2023


On 22/09/2023 16:31, Benjamin Berg wrote:
> Hi,
> 
> On Fri, 2023-09-22 at 14:41 +0100, Anton Ivanov wrote:
>>
>> On 22/09/2023 12:16, Johannes Berg wrote:
>>> From: Johannes Berg <johannes.berg at intel.com>
>>>
>>> While enabling PREEMPT on UML, we found that the call to
>>> force_flush_all() cannot be done where it is, it sleeps
>>> while atomic.
>>>
>>> Further investigation shows that all this seems at least
>>> a bit roundabout and possibly wrong in the first
>>> place - we copy from the 'current' process and then flush
>>> when it starts running.
>>>
>>> What we really want is to start fresh with an empty mm
>>> process, then have it all set up by the kernel (copying
>>> the original mm contents as needed), and then sync it
>>> in arch_dup_mmap().
>>
>> Is there a way we can come up with COW here?
> 
> COW for what? Flushing the page tables once shouldn't be that expensive
> (and we do it already).
> 
>>> We should do the same for the LDT, so need to split that
>>> to be able to do this.
>>>
>>> Note that this fixes what seems like an issue - we look
>>> at current->mm when we copy, but that doesn't seem right
>>> in the case of clone() without copying the MM. This is
>>> probably saved by the forced flush later right now.
>>
>> We will need to work on this.
>>
>> It is nearly twice as slow as the current approach on a find /usr -
>> type f -exec cat {} > /dev/null \;
> 
> Hmm, now that is interesting. Could it be that we incorrectly avoid
> minor faults in the old code?
> 
> i.e. something like:
>   * fork()
>   * new MM is created (copy_context_skas0)
>   * new process runs:
>     - MM is flushed out
>     - execve() happens in userspace
>   * new MM for task is created (copy_context_skas0)
>   * VMAs for libraries are created, but maybe not the TLB entries
>     (i.e. kernel relies on minor faults to do that later)

I am going to try to trace that next week.

>   * some old (mostly read-only) mappings remain visible
>   * new executable runs:
>     - MM is NOT flushed
>     - code runs
> 
> If this happens, then what you might be seeing is the memory layout of
> the new process being very similar and the process not hitting minor
> faults because it manages to read the parent's memory.

That is one possibility.

My gut feeling is that there is something else. End of the day, /bin/cat 
and /bin/find are tiny and what they pull from libc is minimal as well.

To have such a gigantic difference in performance we have to be copying 
a large chunk of data which we should not be copying on every invocation.

> 
> Benjamin
> 
> 
>>>
>>> Signed-off-by: Johannes Berg <johannes.berg at intel.com>
>>> ---
>>>    arch/um/include/asm/Kbuild        |   1 -
>>>    arch/um/include/asm/mm_hooks.h    |  22 ++++++
>>>    arch/um/include/asm/mmu.h         |   3 +-
>>>    arch/um/include/asm/mmu_context.h |   2 +-
>>>    arch/um/include/shared/os.h       |   1 -
>>>    arch/um/kernel/process.c          |   3 -
>>>    arch/um/kernel/skas/mmu.c         |  35 ++++++----
>>>    arch/um/kernel/tlb.c              |   5 +-
>>>    arch/um/os-Linux/skas/process.c   | 107 -------------------------
>>> -----
>>>    arch/x86/um/ldt.c                 |  53 +++++++--------
>>>    10 files changed, 76 insertions(+), 156 deletions(-)
>>>    create mode 100644 arch/um/include/asm/mm_hooks.h
>>>
>>> diff --git a/arch/um/include/asm/Kbuild
>>> b/arch/um/include/asm/Kbuild
>>> index b2d834a29f3a..de8d82a6fd7b 100644
>>> --- a/arch/um/include/asm/Kbuild
>>> +++ b/arch/um/include/asm/Kbuild
>>> @@ -26,5 +26,4 @@ generic-y += switch_to.h
>>>    generic-y += topology.h
>>>    generic-y += trace_clock.h
>>>    generic-y += kprobes.h
>>> -generic-y += mm_hooks.h
>>>    generic-y += vga.h
>>> diff --git a/arch/um/include/asm/mm_hooks.h
>>> b/arch/um/include/asm/mm_hooks.h
>>> new file mode 100644
>>> index 000000000000..b1016520c5b8
>>> --- /dev/null
>>> +++ b/arch/um/include/asm/mm_hooks.h
>>> @@ -0,0 +1,22 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#ifndef _ASM_UM_MM_HOOKS_H
>>> +#define _ASM_UM_MM_HOOKS_H
>>> +
>>> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
>>> +
>>> +static inline void arch_exit_mmap(struct mm_struct *mm)
>>> +{
>>> +}
>>> +
>>> +static inline void arch_unmap(struct mm_struct *mm,
>>> +                       unsigned long start, unsigned long end)
>>> +{
>>> +}
>>> +
>>> +static inline bool arch_vma_access_permitted(struct vm_area_struct
>>> *vma,
>>> +               bool write, bool execute, bool foreign)
>>> +{
>>> +       /* by default, allow everything */
>>> +       return true;
>>> +}
>>> +#endif /* _ASM_UM_MM_HOOKS_H */
>>> diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
>>> index 5b072aba5b65..68a710d23b5d 100644
>>> --- a/arch/um/include/asm/mmu.h
>>> +++ b/arch/um/include/asm/mmu.h
>>> @@ -18,7 +18,8 @@ typedef struct mm_context {
>>>    extern void __switch_mm(struct mm_id * mm_idp);
>>>    
>>>    /* Avoid tangled inclusion with asm/ldt.h */
>>> -extern long init_new_ldt(struct mm_context *to_mm, struct
>>> mm_context *from_mm);
>>> +extern int init_new_ldt(struct mm_context *to_mm);
>>> +extern int copy_ldt(struct mm_context *to_mm, struct mm_context
>>> *from_mm);
>>>    extern void free_ldt(struct mm_context *mm);
>>>    
>>>    #endif
>>> diff --git a/arch/um/include/asm/mmu_context.h
>>> b/arch/um/include/asm/mmu_context.h
>>> index 68e2eb9cfb47..8668861d4a85 100644
>>> --- a/arch/um/include/asm/mmu_context.h
>>> +++ b/arch/um/include/asm/mmu_context.h
>>> @@ -13,7 +13,7 @@
>>>    #include <asm/mm_hooks.h>
>>>    #include <asm/mmu.h>
>>>    
>>> -extern void force_flush_all(void);
>>> +void force_flush_all(struct mm_struct *mm);
>>>    
>>>    #define activate_mm activate_mm
>>>    static inline void activate_mm(struct mm_struct *old, struct
>>> mm_struct *new)
>>> diff --git a/arch/um/include/shared/os.h
>>> b/arch/um/include/shared/os.h
>>> index 1a82c6548dd5..c9acc28fe47c 100644
>>> --- a/arch/um/include/shared/os.h
>>> +++ b/arch/um/include/shared/os.h
>>> @@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp,
>>> unsigned long addr,
>>>    /* skas/process.c */
>>>    extern int is_skas_winch(int pid, int fd, void *data);
>>>    extern int start_userspace(unsigned long stub_stack);
>>> -extern int copy_context_skas0(unsigned long stack, int pid);
>>>    extern void userspace(struct uml_pt_regs *regs, unsigned long
>>> *aux_fp_regs);
>>>    extern void new_thread(void *stack, jmp_buf *buf, void
>>> (*handler)(void));
>>>    extern void switch_threads(jmp_buf *me, jmp_buf *you);
>>> diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
>>> index 6daffb9d8a8d..a024acd6d85c 100644
>>> --- a/arch/um/kernel/process.c
>>> +++ b/arch/um/kernel/process.c
>>> @@ -25,7 +25,6 @@
>>>    #include <linux/threads.h>
>>>    #include <linux/resume_user_mode.h>
>>>    #include <asm/current.h>
>>> -#include <asm/mmu_context.h>
>>>    #include <linux/uaccess.h>
>>>    #include <as-layout.h>
>>>    #include <kern_util.h>
>>> @@ -139,8 +138,6 @@ void new_thread_handler(void)
>>>    /* Called magically, see new_thread_handler above */
>>>    void fork_handler(void)
>>>    {
>>> -       force_flush_all();
>>> -
>>>          schedule_tail(current->thread.prev_sched);
>>>    
>>>          /*
>>> diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
>>> index 656fe16c9b63..ac4ca203ac24 100644
>>> --- a/arch/um/kernel/skas/mmu.c
>>> +++ b/arch/um/kernel/skas/mmu.c
>>> @@ -10,13 +10,13 @@
>>>    
>>>    #include <asm/pgalloc.h>
>>>    #include <asm/sections.h>
>>> +#include <asm/mmu_context.h>
>>>    #include <as-layout.h>
>>>    #include <os.h>
>>>    #include <skas.h>
>>>    
>>>    int init_new_context(struct task_struct *task, struct mm_struct
>>> *mm)
>>>    {
>>> -       struct mm_context *from_mm = NULL;
>>>          struct mm_context *to_mm = &mm->context;
>>>          unsigned long stack = 0;
>>>          int ret = -ENOMEM;
>>> @@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>>                  goto out;
>>>    
>>>          to_mm->id.stack = stack;
>>> -       if (current->mm != NULL && current->mm != &init_mm)
>>> -               from_mm = &current->mm->context;
>>>    
>>> +       /*
>>> +        * Allocate a completely fresh mm. We'll sync the mappings
>>> once
>>> +        * the rest of the kernel is done, in arch_dup_mmap().
>>> +        */
>>>          block_signals_trace();
>>> -       if (from_mm)
>>> -               to_mm->id.u.pid = copy_context_skas0(stack,
>>> -                                                    from_mm-
>>>> id.u.pid);
>>> -       else to_mm->id.u.pid = start_userspace(stack);
>>> +       to_mm->id.u.pid = start_userspace(stack);
>>>          unblock_signals_trace();
>>>    
>>>          if (to_mm->id.u.pid < 0) {
>>> @@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>>                  goto out_free;
>>>          }
>>>    
>>> -       ret = init_new_ldt(to_mm, from_mm);
>>> -       if (ret < 0) {
>>> -               printk(KERN_ERR "init_new_context_skas - init_ldt"
>>> -                      " failed, errno = %d\n", ret);
>>> +       ret = init_new_ldt(to_mm);
>>> +       if (ret)
>>>                  goto out_free;
>>> -       }
>>>    
>>>          return 0;
>>>    
>>> @@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>>          return ret;
>>>    }
>>>    
>>> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
>>> +{
>>> +       int ret = copy_ldt(&mm->context, &oldmm->context);
>>> +
>>> +       if (ret < 0) {
>>> +               printk(KERN_ERR "%s - copy_ldt failed, errno =
>>> %d\n",
>>> +                      __func__, ret);
>>> +               return ret;
>>> +       }
>>> +
>>> +       force_flush_all(mm);
>>> +       return 0;
>>> +}
>>> +
>>> +
>>>    void destroy_context(struct mm_struct *mm)
>>>    {
>>>          struct mm_context *mmu = &mm->context;
>>> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
>>> index 34ec8e677fb9..7c0161321fd9 100644
>>> --- a/arch/um/kernel/tlb.c
>>> +++ b/arch/um/kernel/tlb.c
>>> @@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm)
>>>                  fix_range(mm, vma->vm_start, vma->vm_end, 0);
>>>    }
>>>    
>>> -void force_flush_all(void)
>>> +void force_flush_all(struct mm_struct *mm)
>>>    {
>>> -       struct mm_struct *mm = current->mm;
>>>          struct vm_area_struct *vma;
>>>          VMA_ITERATOR(vmi, mm, 0);
>>>    
>>> -       mmap_read_lock(mm);
>>>          for_each_vma(vmi, vma)
>>>                  fix_range(mm, vma->vm_start, vma->vm_end, 1);
>>> -       mmap_read_unlock(mm);
>>>    }
>>> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
>>> Linux/skas/process.c
>>> index f92129bbf981..403f4c6082b6 100644
>>> --- a/arch/um/os-Linux/skas/process.c
>>> +++ b/arch/um/os-Linux/skas/process.c
>>> @@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs,
>>> unsigned long *aux_fp_regs)
>>>          }
>>>    }
>>>    
>>> -static unsigned long thread_regs[MAX_REG_NR];
>>> -static unsigned long thread_fp_regs[FP_SIZE];
>>> -
>>> -static int __init init_thread_regs(void)
>>> -{
>>> -       get_safe_registers(thread_regs, thread_fp_regs);
>>> -       /* Set parent's instruction pointer to start of clone-stub
>>> */
>>> -       thread_regs[REGS_IP_INDEX] = STUB_CODE +
>>> -                               (unsigned long) stub_clone_handler
>>> -
>>> -                               (unsigned long)
>>> __syscall_stub_start;
>>> -       thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES *
>>> UM_KERN_PAGE_SIZE -
>>> -               sizeof(void *);
>>> -#ifdef __SIGNAL_FRAMESIZE
>>> -       thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
>>> -#endif
>>> -       return 0;
>>> -}
>>> -
>>> -__initcall(init_thread_regs);
>>> -
>>> -int copy_context_skas0(unsigned long new_stack, int pid)
>>> -{
>>> -       int err;
>>> -       unsigned long current_stack = current_stub_stack();
>>> -       struct stub_data *data = (struct stub_data *)
>>> current_stack;
>>> -       struct stub_data *child_data = (struct stub_data *)
>>> new_stack;
>>> -       unsigned long long new_offset;
>>> -       int new_fd = phys_mapping(uml_to_phys((void *)new_stack),
>>> &new_offset);
>>> -
>>> -       /*
>>> -        * prepare offset and fd of child's stack as argument for
>>> parent's
>>> -        * and child's mmap2 calls
>>> -        */
>>> -       *data = ((struct stub_data) {
>>> -               .offset = MMAP_OFFSET(new_offset),
>>> -               .fd     = new_fd,
>>> -               .parent_err = -ESRCH,
>>> -               .child_err = 0,
>>> -       });
>>> -
>>> -       *child_data = ((struct stub_data) {
>>> -               .child_err = -ESRCH,
>>> -       });
>>> -
>>> -       err = ptrace_setregs(pid, thread_regs);
>>> -       if (err < 0) {
>>> -               err = -errno;
>>> -               printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid
>>> = %d, errno = %d\n",
>>> -                     __func__, pid, -err);
>>> -               return err;
>>> -       }
>>> -
>>> -       err = put_fp_registers(pid, thread_fp_regs);
>>> -       if (err < 0) {
>>> -               printk(UM_KERN_ERR "%s : put_fp_registers failed,
>>> pid = %d, err = %d\n",
>>> -                      __func__, pid, err);
>>> -               return err;
>>> -       }
>>> -
>>> -       /*
>>> -        * Wait, until parent has finished its work: read child's
>>> pid from
>>> -        * parent's stack, and check, if bad result.
>>> -        */
>>> -       err = ptrace(PTRACE_CONT, pid, 0, 0);
>>> -       if (err) {
>>> -               err = -errno;
>>> -               printk(UM_KERN_ERR "Failed to continue new process,
>>> pid = %d, errno = %d\n",
>>> -                      pid, errno);
>>> -               return err;
>>> -       }
>>> -
>>> -       wait_stub_done(pid);
>>> -
>>> -       pid = data->parent_err;
>>> -       if (pid < 0) {
>>> -               printk(UM_KERN_ERR "%s - stub-parent reports error
>>> %d\n",
>>> -                     __func__, -pid);
>>> -               return pid;
>>> -       }
>>> -
>>> -       /*
>>> -        * Wait, until child has finished too: read child's result
>>> from
>>> -        * child's stack and check it.
>>> -        */
>>> -       wait_stub_done(pid);
>>> -       if (child_data->child_err != STUB_DATA) {
>>> -               printk(UM_KERN_ERR "%s - stub-child %d reports
>>> error %ld\n",
>>> -                      __func__, pid, data->child_err);
>>> -               err = data->child_err;
>>> -               goto out_kill;
>>> -       }
>>> -
>>> -       if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
>>> -                  (void *)PTRACE_O_TRACESYSGOOD) < 0) {
>>> -               err = -errno;
>>> -               printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS
>>> failed, errno = %d\n",
>>> -                      __func__, errno);
>>> -               goto out_kill;
>>> -       }
>>> -
>>> -       return pid;
>>> -
>>> - out_kill:
>>> -       os_kill_ptraced_process(pid, 1);
>>> -       return err;
>>> -}
>>> -
>>>    void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
>>>    {
>>>          (*buf)[0].JB_IP = (unsigned long) handler;
>>> diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
>>> index 255a44dd415a..609feaeff23b 100644
>>> --- a/arch/x86/um/ldt.c
>>> +++ b/arch/x86/um/ldt.c
>>> @@ -297,36 +297,37 @@ static void ldt_get_host_info(void)
>>>          free_pages((unsigned long)ldt, order);
>>>    }
>>>    
>>> -long init_new_ldt(struct mm_context *new_mm, struct mm_context
>>> *from_mm)
>>> +int init_new_ldt(struct mm_context *new_mm)
>>>    {
>>> -       struct user_desc desc;
>>> +       struct user_desc desc = {};
>>>          short * num_p;
>>> -       int i;
>>> -       long page, err=0;
>>> +       int err = 0;
>>>          void *addr = NULL;
>>>    
>>> -
>>>          mutex_init(&new_mm->arch.ldt.lock);
>>>    
>>> -       if (!from_mm) {
>>> -               memset(&desc, 0, sizeof(desc));
>>> -               /*
>>> -                * Now we try to retrieve info about the ldt, we
>>> -                * inherited from the host. All ldt-entries found
>>> -                * will be reset in the following loop
>>> -                */
>>> -               ldt_get_host_info();
>>> -               for (num_p=host_ldt_entries; *num_p != -1; num_p++)
>>> {
>>> -                       desc.entry_number = *num_p;
>>> -                       err = write_ldt_entry(&new_mm->id, 1,
>>> &desc,
>>> -                                             &addr, *(num_p + 1)
>>> == -1);
>>> -                       if (err)
>>> -                               break;
>>> -               }
>>> -               new_mm->arch.ldt.entry_count = 0;
>>> -
>>> -               goto out;
>>> +       memset(&desc, 0, sizeof(desc));
>>> +       /*
>>> +        * Now we try to retrieve info about the ldt, we
>>> +        * inherited from the host. All ldt-entries found
>>> +        * will be reset in the following loop
>>> +        */
>>> +       ldt_get_host_info();
>>> +       for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
>>> +               desc.entry_number = *num_p;
>>> +               err = write_ldt_entry(&new_mm->id, 1, &desc,
>>> +                                     &addr, *(num_p + 1) == -1);
>>> +               if (err)
>>> +                       break;
>>>          }
>>> +       new_mm->arch.ldt.entry_count = 0;
>>> +
>>> +       return err;
>>> +}
>>> +
>>> +int copy_ldt(struct mm_context *new_mm, struct mm_context
>>> *from_mm)
>>> +{
>>> +       int err = 0;
>>>    
>>>          /*
>>>           * Our local LDT is used to supply the data for
>>> @@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm,
>>> struct mm_context *from_mm)
>>>                  memcpy(new_mm->arch.ldt.u.entries, from_mm-
>>>> arch.ldt.u.entries,
>>>                         sizeof(new_mm->arch.ldt.u.entries));
>>>          else {
>>> -               i = from_mm->arch.ldt.entry_count /
>>> LDT_ENTRIES_PER_PAGE;
>>> +               int i = from_mm->arch.ldt.entry_count /
>>> LDT_ENTRIES_PER_PAGE;
>>> +               unsigned long page;
>>> +
>>>                  while (i-->0) {
>>>                          page =
>>> __get_free_page(GFP_KERNEL|__GFP_ZERO);
>>>                          if (!page) {
>>> @@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm,
>>> struct mm_context *from_mm)
>>>          new_mm->arch.ldt.entry_count = from_mm-
>>>> arch.ldt.entry_count;
>>>          mutex_unlock(&from_mm->arch.ldt.lock);
>>>    
>>> -    out:
>>>          return err;
>>>    }
>>>    
>>> -
>>>    void free_ldt(struct mm_context *mm)
>>>    {
>>>          int i;
>>
> 

-- 
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661
https://www.cambridgegreys.com/




More information about the linux-um mailing list