[PATCH] um: clean up mm creation
Benjamin Berg
benjamin at sipsolutions.net
Fri Sep 22 06:49:08 PDT 2023
On Fri, 2023-09-22 at 13:16 +0200, Johannes Berg wrote:
> From: Johannes Berg <johannes.berg at intel.com>
>
> While enabling PREEMPT on UML, we found that the call to
> force_flush_all() cannot be done where it is, it sleeps
> while atomic.
>
> Further investigation shows that all this seems at least
> a bit roundabout and possibly wrong in the first
> place - we copy from the 'current' process and then flush
> when it starts running.
>
> What we really want is to start fresh with an empty mm
> process, then have it all set up by the kernel (copying
> the original mm contents as needed), and then sync it
> in arch_dup_mmap().
>
> We should do the same for the LDT, so need to split that
> to be able to do this.
>
> Note that this fixes what seems like an issue - we look
> at current->mm when we copy, but that doesn't seem right
> in the case of clone() without copying the MM. This is
> probably saved by the forced flush later right now.
Well, not clone(), but rather any execve(). The execve() happens in a
new MM (do_execveat_common -> alloc_bprm -> bprm_mm_init -> mm_alloc)
which is then swapped into the current task by exec_mmap.
And, that explains why the flush was needed. Without it, our userspace
had the parent's memory available unless it was un-/remapped (possibly
even RW with CLONE_VM|CLONE_VFORK). This is obviously wrong and
pointless as we discarded the information anyway.
So, doing a flush in arch_dup_mmap seems much more reasonable to me
(not entirely sure if it is needed). And, I doubt there is a benefit in
optimizing the flush away by somehow cloning the userspace process.
Benjamin
> Signed-off-by: Johannes Berg <johannes.berg at intel.com>
> ---
> arch/um/include/asm/Kbuild | 1 -
> arch/um/include/asm/mm_hooks.h | 22 ++++++
> arch/um/include/asm/mmu.h | 3 +-
> arch/um/include/asm/mmu_context.h | 2 +-
> arch/um/include/shared/os.h | 1 -
> arch/um/kernel/process.c | 3 -
> arch/um/kernel/skas/mmu.c | 35 ++++++----
> arch/um/kernel/tlb.c | 5 +-
> arch/um/os-Linux/skas/process.c | 107 ------------------------------
> arch/x86/um/ldt.c | 53 +++++++--------
> 10 files changed, 76 insertions(+), 156 deletions(-)
> create mode 100644 arch/um/include/asm/mm_hooks.h
>
> diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
> index b2d834a29f3a..de8d82a6fd7b 100644
> --- a/arch/um/include/asm/Kbuild
> +++ b/arch/um/include/asm/Kbuild
> @@ -26,5 +26,4 @@ generic-y += switch_to.h
> generic-y += topology.h
> generic-y += trace_clock.h
> generic-y += kprobes.h
> -generic-y += mm_hooks.h
> generic-y += vga.h
> diff --git a/arch/um/include/asm/mm_hooks.h
> b/arch/um/include/asm/mm_hooks.h
> new file mode 100644
> index 000000000000..b1016520c5b8
> --- /dev/null
> +++ b/arch/um/include/asm/mm_hooks.h
> @@ -0,0 +1,22 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_UM_MM_HOOKS_H
> +#define _ASM_UM_MM_HOOKS_H
> +
> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
> +
> +static inline void arch_exit_mmap(struct mm_struct *mm)
> +{
> +}
> +
> +static inline void arch_unmap(struct mm_struct *mm,
> + unsigned long start, unsigned long end)
> +{
> +}
> +
> +static inline bool arch_vma_access_permitted(struct vm_area_struct
> *vma,
> + bool write, bool execute, bool foreign)
> +{
> + /* by default, allow everything */
> + return true;
> +}
> +#endif /* _ASM_UM_MM_HOOKS_H */
> diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
> index 5b072aba5b65..68a710d23b5d 100644
> --- a/arch/um/include/asm/mmu.h
> +++ b/arch/um/include/asm/mmu.h
> @@ -18,7 +18,8 @@ typedef struct mm_context {
> extern void __switch_mm(struct mm_id * mm_idp);
>
> /* Avoid tangled inclusion with asm/ldt.h */
> -extern long init_new_ldt(struct mm_context *to_mm, struct mm_context
> *from_mm);
> +extern int init_new_ldt(struct mm_context *to_mm);
> +extern int copy_ldt(struct mm_context *to_mm, struct mm_context
> *from_mm);
> extern void free_ldt(struct mm_context *mm);
>
> #endif
> diff --git a/arch/um/include/asm/mmu_context.h
> b/arch/um/include/asm/mmu_context.h
> index 68e2eb9cfb47..8668861d4a85 100644
> --- a/arch/um/include/asm/mmu_context.h
> +++ b/arch/um/include/asm/mmu_context.h
> @@ -13,7 +13,7 @@
> #include <asm/mm_hooks.h>
> #include <asm/mmu.h>
>
> -extern void force_flush_all(void);
> +void force_flush_all(struct mm_struct *mm);
>
> #define activate_mm activate_mm
> static inline void activate_mm(struct mm_struct *old, struct
> mm_struct *new)
> diff --git a/arch/um/include/shared/os.h
> b/arch/um/include/shared/os.h
> index 1a82c6548dd5..c9acc28fe47c 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp,
> unsigned long addr,
> /* skas/process.c */
> extern int is_skas_winch(int pid, int fd, void *data);
> extern int start_userspace(unsigned long stub_stack);
> -extern int copy_context_skas0(unsigned long stack, int pid);
> extern void userspace(struct uml_pt_regs *regs, unsigned long
> *aux_fp_regs);
> extern void new_thread(void *stack, jmp_buf *buf, void
> (*handler)(void));
> extern void switch_threads(jmp_buf *me, jmp_buf *you);
> diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
> index 6daffb9d8a8d..a024acd6d85c 100644
> --- a/arch/um/kernel/process.c
> +++ b/arch/um/kernel/process.c
> @@ -25,7 +25,6 @@
> #include <linux/threads.h>
> #include <linux/resume_user_mode.h>
> #include <asm/current.h>
> -#include <asm/mmu_context.h>
> #include <linux/uaccess.h>
> #include <as-layout.h>
> #include <kern_util.h>
> @@ -139,8 +138,6 @@ void new_thread_handler(void)
> /* Called magically, see new_thread_handler above */
> void fork_handler(void)
> {
> - force_flush_all();
> -
> schedule_tail(current->thread.prev_sched);
>
> /*
> diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
> index 656fe16c9b63..ac4ca203ac24 100644
> --- a/arch/um/kernel/skas/mmu.c
> +++ b/arch/um/kernel/skas/mmu.c
> @@ -10,13 +10,13 @@
>
> #include <asm/pgalloc.h>
> #include <asm/sections.h>
> +#include <asm/mmu_context.h>
> #include <as-layout.h>
> #include <os.h>
> #include <skas.h>
>
> int init_new_context(struct task_struct *task, struct mm_struct *mm)
> {
> - struct mm_context *from_mm = NULL;
> struct mm_context *to_mm = &mm->context;
> unsigned long stack = 0;
> int ret = -ENOMEM;
> @@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
> goto out;
>
> to_mm->id.stack = stack;
> - if (current->mm != NULL && current->mm != &init_mm)
> - from_mm = &current->mm->context;
>
> + /*
> + * Allocate a completely fresh mm. We'll sync the mappings
> once
> + * the rest of the kernel is done, in arch_copy_mm().
> + */
> block_signals_trace();
> - if (from_mm)
> - to_mm->id.u.pid = copy_context_skas0(stack,
> - from_mm->id.u.pid);
> - else to_mm->id.u.pid = start_userspace(stack);
> + to_mm->id.u.pid = start_userspace(stack);
> unblock_signals_trace();
>
> if (to_mm->id.u.pid < 0) {
> @@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
> goto out_free;
> }
>
> - ret = init_new_ldt(to_mm, from_mm);
> - if (ret < 0) {
> - printk(KERN_ERR "init_new_context_skas - init_ldt"
> - " failed, errno = %d\n", ret);
> + ret = init_new_ldt(to_mm);
> + if (ret)
> goto out_free;
> - }
>
> return 0;
>
> @@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task,
> struct mm_struct *mm)
> return ret;
> }
>
> +int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> + int ret = copy_ldt(&mm->context, &oldmm->context);
> +
> + if (ret < 0) {
> + printk(KERN_ERR "%s - copy_ldt failed, errno = %d\n",
> + __func__, ret);
> + return ret;
> + }
> +
> + force_flush_all(mm);
> + return 0;
> +}
> +
> +
> void destroy_context(struct mm_struct *mm)
> {
> struct mm_context *mmu = &mm->context;
> diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
> index 34ec8e677fb9..7c0161321fd9 100644
> --- a/arch/um/kernel/tlb.c
> +++ b/arch/um/kernel/tlb.c
> @@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm)
> fix_range(mm, vma->vm_start, vma->vm_end, 0);
> }
>
> -void force_flush_all(void)
> +void force_flush_all(struct mm_struct *mm)
> {
> - struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma;
> VMA_ITERATOR(vmi, mm, 0);
>
> - mmap_read_lock(mm);
> for_each_vma(vmi, vma)
> fix_range(mm, vma->vm_start, vma->vm_end, 1);
> - mmap_read_unlock(mm);
> }
> diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-
> Linux/skas/process.c
> index f92129bbf981..403f4c6082b6 100644
> --- a/arch/um/os-Linux/skas/process.c
> +++ b/arch/um/os-Linux/skas/process.c
> @@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs,
> unsigned long *aux_fp_regs)
> }
> }
>
> -static unsigned long thread_regs[MAX_REG_NR];
> -static unsigned long thread_fp_regs[FP_SIZE];
> -
> -static int __init init_thread_regs(void)
> -{
> - get_safe_registers(thread_regs, thread_fp_regs);
> - /* Set parent's instruction pointer to start of clone-stub */
> - thread_regs[REGS_IP_INDEX] = STUB_CODE +
> - (unsigned long) stub_clone_handler -
> - (unsigned long) __syscall_stub_start;
> - thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES *
> UM_KERN_PAGE_SIZE -
> - sizeof(void *);
> -#ifdef __SIGNAL_FRAMESIZE
> - thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
> -#endif
> - return 0;
> -}
> -
> -__initcall(init_thread_regs);
> -
> -int copy_context_skas0(unsigned long new_stack, int pid)
> -{
> - int err;
> - unsigned long current_stack = current_stub_stack();
> - struct stub_data *data = (struct stub_data *) current_stack;
> - struct stub_data *child_data = (struct stub_data *)
> new_stack;
> - unsigned long long new_offset;
> - int new_fd = phys_mapping(uml_to_phys((void *)new_stack),
> &new_offset);
> -
> - /*
> - * prepare offset and fd of child's stack as argument for
> parent's
> - * and child's mmap2 calls
> - */
> - *data = ((struct stub_data) {
> - .offset = MMAP_OFFSET(new_offset),
> - .fd = new_fd,
> - .parent_err = -ESRCH,
> - .child_err = 0,
> - });
> -
> - *child_data = ((struct stub_data) {
> - .child_err = -ESRCH,
> - });
> -
> - err = ptrace_setregs(pid, thread_regs);
> - if (err < 0) {
> - err = -errno;
> - printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid =
> %d, errno = %d\n",
> - __func__, pid, -err);
> - return err;
> - }
> -
> - err = put_fp_registers(pid, thread_fp_regs);
> - if (err < 0) {
> - printk(UM_KERN_ERR "%s : put_fp_registers failed, pid
> = %d, err = %d\n",
> - __func__, pid, err);
> - return err;
> - }
> -
> - /*
> - * Wait, until parent has finished its work: read child's pid
> from
> - * parent's stack, and check, if bad result.
> - */
> - err = ptrace(PTRACE_CONT, pid, 0, 0);
> - if (err) {
> - err = -errno;
> - printk(UM_KERN_ERR "Failed to continue new process,
> pid = %d, errno = %d\n",
> - pid, errno);
> - return err;
> - }
> -
> - wait_stub_done(pid);
> -
> - pid = data->parent_err;
> - if (pid < 0) {
> - printk(UM_KERN_ERR "%s - stub-parent reports error
> %d\n",
> - __func__, -pid);
> - return pid;
> - }
> -
> - /*
> - * Wait, until child has finished too: read child's result
> from
> - * child's stack and check it.
> - */
> - wait_stub_done(pid);
> - if (child_data->child_err != STUB_DATA) {
> - printk(UM_KERN_ERR "%s - stub-child %d reports error
> %ld\n",
> - __func__, pid, data->child_err);
> - err = data->child_err;
> - goto out_kill;
> - }
> -
> - if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
> - (void *)PTRACE_O_TRACESYSGOOD) < 0) {
> - err = -errno;
> - printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed,
> errno = %d\n",
> - __func__, errno);
> - goto out_kill;
> - }
> -
> - return pid;
> -
> - out_kill:
> - os_kill_ptraced_process(pid, 1);
> - return err;
> -}
> -
> void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
> {
> (*buf)[0].JB_IP = (unsigned long) handler;
> diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
> index 255a44dd415a..609feaeff23b 100644
> --- a/arch/x86/um/ldt.c
> +++ b/arch/x86/um/ldt.c
> @@ -297,36 +297,37 @@ static void ldt_get_host_info(void)
> free_pages((unsigned long)ldt, order);
> }
>
> -long init_new_ldt(struct mm_context *new_mm, struct mm_context
> *from_mm)
> +int init_new_ldt(struct mm_context *new_mm)
> {
> - struct user_desc desc;
> + struct user_desc desc = {};
> short * num_p;
> - int i;
> - long page, err=0;
> + int err = 0;
> void *addr = NULL;
>
> -
> mutex_init(&new_mm->arch.ldt.lock);
>
> - if (!from_mm) {
> - memset(&desc, 0, sizeof(desc));
> - /*
> - * Now we try to retrieve info about the ldt, we
> - * inherited from the host. All ldt-entries found
> - * will be reset in the following loop
> - */
> - ldt_get_host_info();
> - for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
> - desc.entry_number = *num_p;
> - err = write_ldt_entry(&new_mm->id, 1, &desc,
> - &addr, *(num_p + 1) ==
> -1);
> - if (err)
> - break;
> - }
> - new_mm->arch.ldt.entry_count = 0;
> -
> - goto out;
> + memset(&desc, 0, sizeof(desc));
> + /*
> + * Now we try to retrieve info about the ldt, we
> + * inherited from the host. All ldt-entries found
> + * will be reset in the following loop
> + */
> + ldt_get_host_info();
> + for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
> + desc.entry_number = *num_p;
> + err = write_ldt_entry(&new_mm->id, 1, &desc,
> + &addr, *(num_p + 1) == -1);
> + if (err)
> + break;
> }
> + new_mm->arch.ldt.entry_count = 0;
> +
> + return err;
> +}
> +
> +int copy_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
> +{
> + int err = 0;
>
> /*
> * Our local LDT is used to supply the data for
> @@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm,
> struct mm_context *from_mm)
> memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
> sizeof(new_mm->arch.ldt.u.entries));
> else {
> - i = from_mm->arch.ldt.entry_count /
> LDT_ENTRIES_PER_PAGE;
> + int i = from_mm->arch.ldt.entry_count /
> LDT_ENTRIES_PER_PAGE;
> + unsigned long page;
> +
> while (i-->0) {
> page =
> __get_free_page(GFP_KERNEL|__GFP_ZERO);
> if (!page) {
> @@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm,
> struct mm_context *from_mm)
> new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
> mutex_unlock(&from_mm->arch.ldt.lock);
>
> - out:
> return err;
> }
>
> -
> void free_ldt(struct mm_context *mm)
> {
> int i;
More information about the linux-um
mailing list