[PATCH] um: clean up mm creation
Anton Ivanov
anton.ivanov at cambridgegreys.com
Fri Sep 22 08:49:12 PDT 2023
On 22/09/2023 16:31, Benjamin Berg wrote:
> Hi,
>
> On Fri, 2023-09-22 at 14:41 +0100, Anton Ivanov wrote:
>>
>> On 22/09/2023 12:16, Johannes Berg wrote:
>>> From: Johannes Berg <johannes.berg at intel.com>
>>>
>>> While enabling PREEMPT on UML, we found that the call to
>>> force_flush_all() cannot be done where it is, it sleeps
>>> while atomic.
>>>
>>> Further investigation shows that all this seems at least
>>> a bit roundabout and possibly wrong in the first
>>> place - we copy from the 'current' process and then flush
>>> when it starts running.
>>>
>>> What we really want is to start fresh with an empty mm
>>> process, then have it all set up by the kernel (copying
>>> the original mm contents as needed), and then sync it
>>> in arch_dup_mmap().
>>
>> Is there a way we can come up with COW here?
>
> COW for what? Flushing the page tables once shouldn't be that expensive
> (and we do it already).
>
>>> We should do the same for the LDT, so need to split that
>>> to be able to do this.
>>>
>>> Note that this fixes what seems like an issue - we look
>>> at current->mm when we copy, but that doesn't seem right
>>> in the case of clone() without copying the MM. This is
>>> probably saved by the forced flush later right now.
>>
>> We will need to work on this.
>>
>> It is nearly twice as slow as the current approach on a find /usr -
>> type f -exec cat {} > /dev/null \;
>
> Hmm, now that is interesting. Could it be that we incorrectly avoid
> minor faults in the old code?
>
> i.e. something like:
> * fork()
> * new MM is created (copy_context_skas0)
> * new process runs:
> - MM is flushed out
> - execve() happens in userspace
> * new MM for task is created (copy_context_skas0)
> * VMAs for libraries are created, but maybe not the TLB entries
> (i.e. kernel relies on minor faults to do that later)
I am going to try to trace that next week.
> * some old (mostly read-only) mappings remain visible
> * new executable runs:
> - MM is NOT flushed
> - code runs
>
> If this happens, then what you might be seeing is the memory layout of
> the new process being very similar and the process not hitting minor
> faults because it manages to read the parents memory.
That is one possibility.
My gut feeling is that there is something else. End of the day, /bin/cat
and /bin/find are tiny and what they pull from libc is minimal as well.
To have such a gigantic difference in performance we have to be copying
a large chunk of data which we should not be copying on every invocation.
>
> Benjamin
>
>
>>>
>>> Signed-off-by: Johannes Berg <johannes.berg at intel.com>
>>> ---
>>> arch/um/include/asm/Kbuild | 1 -
>>> arch/um/include/asm/mm_hooks.h | 22 ++++++
>>> arch/um/include/asm/mmu.h | 3 +-
>>> arch/um/include/asm/mmu_context.h | 2 +-
>>> arch/um/include/shared/os.h | 1 -
>>> arch/um/kernel/process.c | 3 -
>>> arch/um/kernel/skas/mmu.c | 35 ++++++----
>>> arch/um/kernel/tlb.c | 5 +-
>>> arch/um/os-Linux/skas/process.c | 107 -------------------------
>>> -----
>>> arch/x86/um/ldt.c | 53 +++++++--------
>>> 10 files changed, 76 insertions(+), 156 deletions(-)
>>> create mode 100644 arch/um/include/asm/mm_hooks.h
>>>
>>> diff --git a/arch/um/include/asm/Kbuild
>>> b/arch/um/include/asm/Kbuild
>>> index b2d834a29f3a..de8d82a6fd7b 100644
>>> --- a/arch/um/include/asm/Kbuild
>>> +++ b/arch/um/include/asm/Kbuild
>>> @@ -26,5 +26,4 @@ generic-y += switch_to.h
>>> generic-y += topology.h
>>> generic-y += trace_clock.h
>>> generic-y += kprobes.h
>>> -generic-y += mm_hooks.h
>>> generic-y += vga.h
>>> diff --git a/arch/um/include/asm/mm_hooks.h
>>> b/arch/um/include/asm/mm_hooks.h
>>> new file mode 100644
>>> index 000000000000..b1016520c5b8
>>> --- /dev/null
>>> +++ b/arch/um/include/asm/mm_hooks.h
>>> @@ -0,0 +1,22 @@
>>> +/* SPDX-License-Identifier: GPL-2.0 */
>>> +#ifndef _ASM_UM_MM_HOOKS_H
>>> +#define _ASM_UM_MM_HOOKS_H
>>> +
>>> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
>>> +
>>> +static inline void arch_exit_mmap(struct mm_struct *mm)
>>> +{
>>> +}
>>> +
>>> +static inline void arch_unmap(struct mm_struct *mm,
>>> + unsigned long start, unsigned long end)
>>> +{
>>> +}
>>> +
>>> +static inline bool arch_vma_access_permitted(struct vm_area_struct
>>> *vma,
>>> + bool write, bool execute, bool foreign)
>>> +{
>>> + /* by default, allow everything */
>>> + return true;
>>> +}
>>> +#endif /* _ASM_UM_MM_HOOKS_H */
>>> diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
>>> index 5b072aba5b65..68a710d23b5d 100644
>>> --- a/arch/um/include/asm/mmu.h
>>> +++ b/arch/um/include/asm/mmu.h
>>> @@ -18,7 +18,8 @@ typedef struct mm_context {
>>> extern void __switch_mm(struct mm_id * mm_idp);
>>>
>>> /* Avoid tangled inclusion with asm/ldt.h */
>>> -extern long init_new_ldt(struct mm_context *to_mm, struct
>>> mm_context *from_mm);
>>> +extern int init_new_ldt(struct mm_context *to_mm);
>>> +extern int copy_ldt(struct mm_context *to_mm, struct mm_context
>>> *from_mm);
>>> extern void free_ldt(struct mm_context *mm);
>>>
>>> #endif
>>> diff --git a/arch/um/include/asm/mmu_context.h
>>> b/arch/um/include/asm/mmu_context.h
>>> index 68e2eb9cfb47..8668861d4a85 100644
>>> --- a/arch/um/include/asm/mmu_context.h
>>> +++ b/arch/um/include/asm/mmu_context.h
>>> @@ -13,7 +13,7 @@
>>> #include <asm/mm_hooks.h>
>>> #include <asm/mmu.h>
>>>
>>> -extern void force_flush_all(void);
>>> +void force_flush_all(struct mm_struct *mm);
>>>
>>> #define activate_mm activate_mm
>>> static inline void activate_mm(struct mm_struct *old, struct
>>> mm_struct *new)
>>> diff --git a/arch/um/include/shared/os.h
>>> b/arch/um/include/shared/os.h
>>> index 1a82c6548dd5..c9acc28fe47c 100644
>>> --- a/arch/um/include/shared/os.h
>>> +++ b/arch/um/include/shared/os.h
>>> @@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp,
>>> unsigned long addr,
>>> /* skas/process.c */
>>> extern int is_skas_winch(int pid, int fd, void *data);
>>> extern int start_userspace(unsigned long stub_stack);
>>> -extern int copy_context_skas0(unsigned long stack, int pid);
>>> extern void userspace(struct uml_pt_regs *regs, unsigned long
>>> *aux_fp_regs);
>>> extern void new_thread(void *stack, jmp_buf *buf, void
>>> (*handler)(void));
>>> extern void switch_threads(jmp_buf *me, jmp_buf *you);
>>> diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
>>> index 6daffb9d8a8d..a024acd6d85c 100644
>>> --- a/arch/um/kernel/process.c
>>> +++ b/arch/um/kernel/process.c
>>> @@ -25,7 +25,6 @@
>>> #include <linux/threads.h>
>>> #include <linux/resume_user_mode.h>
>>> #include <asm/current.h>
>>> -#include <asm/mmu_context.h>
>>> #include <linux/uaccess.h>
>>> #include <as-layout.h>
>>> #include <kern_util.h>
>>> @@ -139,8 +138,6 @@ void new_thread_handler(void)
>>> /* Called magically, see new_thread_handler above */
>>> void fork_handler(void)
>>> {
>>> - force_flush_all();
>>> -
>>> schedule_tail(current->thread.prev_sched);
>>>
>>> /*
>>> diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
>>> index 656fe16c9b63..ac4ca203ac24 100644
>>> --- a/arch/um/kernel/skas/mmu.c
>>> +++ b/arch/um/kernel/skas/mmu.c
>>> @@ -10,13 +10,13 @@
>>>
>>> #include <asm/pgalloc.h>
>>> #include <asm/sections.h>
>>> +#include <asm/mmu_context.h>
>>> #include <as-layout.h>
>>> #include <os.h>
>>> #include <skas.h>
>>>
>>> int init_new_context(struct task_struct *task, struct mm_struct
>>> *mm)
>>> {
>>> - struct mm_context *from_mm = NULL;
>>> struct mm_context *to_mm = &mm->context;
>>> unsigned long stack = 0;
>>> int ret = -ENOMEM;
>>> @@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>> goto out;
>>>
>>> to_mm->id.stack = stack;
>>> - if (current->mm != NULL && current->mm != &init_mm)
>>> - from_mm = &current->mm->context;
>>>
>>> + /*
>>> + * Allocate a completely fresh mm. We'll sync the mappings
>>> once
>>> + * the rest of the kernel is done, in arch_copy_mm().
>>> + */
>>> block_signals_trace();
>>> - if (from_mm)
>>> - to_mm->id.u.pid = copy_context_skas0(stack,
>>> - from_mm-
>>>> id.u.pid);
>>> - else to_mm->id.u.pid = start_userspace(stack);
>>> + to_mm->id.u.pid = start_userspace(stack);
>>> unblock_signals_trace();
>>>
>>> if (to_mm->id.u.pid < 0) {
>>> @@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>> goto out_free;
>>> }
>>>
>>> - ret = init_new_ldt(to_mm, from_mm);
>>> - if (ret < 0) {
>>> - printk(KERN_ERR "init_new_context_skas - init_ldt"
>>> - " failed, errno = %d\n", ret);
>>> + ret = init_new_ldt(to_mm);
>>> + if (ret)
>>> goto out_free;
>>> - }
>>>
>>> return 0;
>>>
>>> @@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task,
>>> struct mm_struct *mm)
>>> return ret;
>>> }
>>>
>>> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
>>> +{
>>> + int ret = copy_ldt(&mm->context, &oldmm->context);
>>> +
>>> + if (ret < 0) {
>>> + printk(KERN_ERR "%s - copy_ldt failed, errno =
>>> %d\n",
>>> + __func__, ret);
>>> + return ret;
>>> + }
>>> +
>>> + force_flush_all(mm);
>>> + return 0;
>>> +}
>>> +
>>> +
>>> void destroy_context(struct mm_struct *mm)
>>> {
>>> struct mm_context *mmu = &mm->context;
>>> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
>>> index 34ec8e677fb9..7c0161321fd9 100644
>>> --- a/arch/um/kernel/tlb.c
>>> +++ b/arch/um/kernel/tlb.c
>>> @@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm)
>>> fix_range(mm, vma->vm_start, vma->vm_end, 0);
>>> }
>>>
>>> -void force_flush_all(void)
>>> +void force_flush_all(struct mm_struct *mm)
>>> {
>>> - struct mm_struct *mm = current->mm;
>>> struct vm_area_struct *vma;
>>> VMA_ITERATOR(vmi, mm, 0);
>>>
>>> - mmap_read_lock(mm);
>>> for_each_vma(vmi, vma)
>>> fix_range(mm, vma->vm_start, vma->vm_end, 1);
>>> - mmap_read_unlock(mm);
>>> }
>>> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
>>> Linux/skas/process.c
>>> index f92129bbf981..403f4c6082b6 100644
>>> --- a/arch/um/os-Linux/skas/process.c
>>> +++ b/arch/um/os-Linux/skas/process.c
>>> @@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs,
>>> unsigned long *aux_fp_regs)
>>> }
>>> }
>>>
>>> -static unsigned long thread_regs[MAX_REG_NR];
>>> -static unsigned long thread_fp_regs[FP_SIZE];
>>> -
>>> -static int __init init_thread_regs(void)
>>> -{
>>> - get_safe_registers(thread_regs, thread_fp_regs);
>>> - /* Set parent's instruction pointer to start of clone-stub
>>> */
>>> - thread_regs[REGS_IP_INDEX] = STUB_CODE +
>>> - (unsigned long) stub_clone_handler
>>> -
>>> - (unsigned long)
>>> __syscall_stub_start;
>>> - thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES *
>>> UM_KERN_PAGE_SIZE -
>>> - sizeof(void *);
>>> -#ifdef __SIGNAL_FRAMESIZE
>>> - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
>>> -#endif
>>> - return 0;
>>> -}
>>> -
>>> -__initcall(init_thread_regs);
>>> -
>>> -int copy_context_skas0(unsigned long new_stack, int pid)
>>> -{
>>> - int err;
>>> - unsigned long current_stack = current_stub_stack();
>>> - struct stub_data *data = (struct stub_data *)
>>> current_stack;
>>> - struct stub_data *child_data = (struct stub_data *)
>>> new_stack;
>>> - unsigned long long new_offset;
>>> - int new_fd = phys_mapping(uml_to_phys((void *)new_stack),
>>> &new_offset);
>>> -
>>> - /*
>>> - * prepare offset and fd of child's stack as argument for
>>> parent's
>>> - * and child's mmap2 calls
>>> - */
>>> - *data = ((struct stub_data) {
>>> - .offset = MMAP_OFFSET(new_offset),
>>> - .fd = new_fd,
>>> - .parent_err = -ESRCH,
>>> - .child_err = 0,
>>> - });
>>> -
>>> - *child_data = ((struct stub_data) {
>>> - .child_err = -ESRCH,
>>> - });
>>> -
>>> - err = ptrace_setregs(pid, thread_regs);
>>> - if (err < 0) {
>>> - err = -errno;
>>> - printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid
>>> = %d, errno = %d\n",
>>> - __func__, pid, -err);
>>> - return err;
>>> - }
>>> -
>>> - err = put_fp_registers(pid, thread_fp_regs);
>>> - if (err < 0) {
>>> - printk(UM_KERN_ERR "%s : put_fp_registers failed,
>>> pid = %d, err = %d\n",
>>> - __func__, pid, err);
>>> - return err;
>>> - }
>>> -
>>> - /*
>>> - * Wait, until parent has finished its work: read child's
>>> pid from
>>> - * parent's stack, and check, if bad result.
>>> - */
>>> - err = ptrace(PTRACE_CONT, pid, 0, 0);
>>> - if (err) {
>>> - err = -errno;
>>> - printk(UM_KERN_ERR "Failed to continue new process,
>>> pid = %d, errno = %d\n",
>>> - pid, errno);
>>> - return err;
>>> - }
>>> -
>>> - wait_stub_done(pid);
>>> -
>>> - pid = data->parent_err;
>>> - if (pid < 0) {
>>> - printk(UM_KERN_ERR "%s - stub-parent reports error
>>> %d\n",
>>> - __func__, -pid);
>>> - return pid;
>>> - }
>>> -
>>> - /*
>>> - * Wait, until child has finished too: read child's result
>>> from
>>> - * child's stack and check it.
>>> - */
>>> - wait_stub_done(pid);
>>> - if (child_data->child_err != STUB_DATA) {
>>> - printk(UM_KERN_ERR "%s - stub-child %d reports
>>> error %ld\n",
>>> - __func__, pid, data->child_err);
>>> - err = data->child_err;
>>> - goto out_kill;
>>> - }
>>> -
>>> - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
>>> - (void *)PTRACE_O_TRACESYSGOOD) < 0) {
>>> - err = -errno;
>>> - printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS
>>> failed, errno = %d\n",
>>> - __func__, errno);
>>> - goto out_kill;
>>> - }
>>> -
>>> - return pid;
>>> -
>>> - out_kill:
>>> - os_kill_ptraced_process(pid, 1);
>>> - return err;
>>> -}
>>> -
>>> void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
>>> {
>>> (*buf)[0].JB_IP = (unsigned long) handler;
>>> diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
>>> index 255a44dd415a..609feaeff23b 100644
>>> --- a/arch/x86/um/ldt.c
>>> +++ b/arch/x86/um/ldt.c
>>> @@ -297,36 +297,37 @@ static void ldt_get_host_info(void)
>>> free_pages((unsigned long)ldt, order);
>>> }
>>>
>>> -long init_new_ldt(struct mm_context *new_mm, struct mm_context
>>> *from_mm)
>>> +int init_new_ldt(struct mm_context *new_mm)
>>> {
>>> - struct user_desc desc;
>>> + struct user_desc desc = {};
>>> short * num_p;
>>> - int i;
>>> - long page, err=0;
>>> + int err = 0;
>>> void *addr = NULL;
>>>
>>> -
>>> mutex_init(&new_mm->arch.ldt.lock);
>>>
>>> - if (!from_mm) {
>>> - memset(&desc, 0, sizeof(desc));
>>> - /*
>>> - * Now we try to retrieve info about the ldt, we
>>> - * inherited from the host. All ldt-entries found
>>> - * will be reset in the following loop
>>> - */
>>> - ldt_get_host_info();
>>> - for (num_p=host_ldt_entries; *num_p != -1; num_p++)
>>> {
>>> - desc.entry_number = *num_p;
>>> - err = write_ldt_entry(&new_mm->id, 1,
>>> &desc,
>>> - &addr, *(num_p + 1)
>>> == -1);
>>> - if (err)
>>> - break;
>>> - }
>>> - new_mm->arch.ldt.entry_count = 0;
>>> -
>>> - goto out;
>>> + memset(&desc, 0, sizeof(desc));
>>> + /*
>>> + * Now we try to retrieve info about the ldt, we
>>> + * inherited from the host. All ldt-entries found
>>> + * will be reset in the following loop
>>> + */
>>> + ldt_get_host_info();
>>> + for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
>>> + desc.entry_number = *num_p;
>>> + err = write_ldt_entry(&new_mm->id, 1, &desc,
>>> + &addr, *(num_p + 1) == -1);
>>> + if (err)
>>> + break;
>>> }
>>> + new_mm->arch.ldt.entry_count = 0;
>>> +
>>> + return err;
>>> +}
>>> +
>>> +int copy_ldt(struct mm_context *new_mm, struct mm_context
>>> *from_mm)
>>> +{
>>> + int err = 0;
>>>
>>> /*
>>> * Our local LDT is used to supply the data for
>>> @@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm,
>>> struct mm_context *from_mm)
>>> memcpy(new_mm->arch.ldt.u.entries, from_mm-
>>>> arch.ldt.u.entries,
>>> sizeof(new_mm->arch.ldt.u.entries));
>>> else {
>>> - i = from_mm->arch.ldt.entry_count /
>>> LDT_ENTRIES_PER_PAGE;
>>> + int i = from_mm->arch.ldt.entry_count /
>>> LDT_ENTRIES_PER_PAGE;
>>> + unsigned long page;
>>> +
>>> while (i-->0) {
>>> page =
>>> __get_free_page(GFP_KERNEL|__GFP_ZERO);
>>> if (!page) {
>>> @@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm,
>>> struct mm_context *from_mm)
>>> new_mm->arch.ldt.entry_count = from_mm-
>>>> arch.ldt.entry_count;
>>> mutex_unlock(&from_mm->arch.ldt.lock);
>>>
>>> - out:
>>> return err;
>>> }
>>>
>>> -
>>> void free_ldt(struct mm_context *mm)
>>> {
>>> int i;
>>
>
--
Anton R. Ivanov
Cambridgegreys Limited. Registered in England. Company Number 10273661
https://www.cambridgegreys.com/
More information about the linux-um
mailing list