[PATCH] um: clean up mm creation

Fri Sep 22 04:16:39 PDT 2023

From: Johannes Berg <johannes.berg at intel.com>

While enabling PREEMPT on UML, we found that the call to
force_flush_all() cannot be done where it is, it sleeps
while atomic.

Further investigation shows that all this seems at least
a bit roundabout and possibly wrong wrong in the first
place - we copy from the 'current' process and then flush
when it starts running.

What we really want is to start fresh with an empty mm
process, then have it all set up by the kernel (copying
the original mm contents as needed), and then sync it
in arch_dup_mmap().

We should do the same for the LDT, so need to split that
to be able to do this.

Note that this fixes what seems like an issue - we look
at current->mm when we copy, but that doesn't seem right
in the case of clone() without copying the MM. This is
probably saved by the forced flush later right now.

Signed-off-by: Johannes Berg <johannes.berg at intel.com>
---
 arch/um/include/asm/Kbuild        |   1 -
 arch/um/include/asm/mm_hooks.h    |  22 ++++++
 arch/um/include/asm/mmu.h         |   3 +-
 arch/um/include/asm/mmu_context.h |   2 +-
 arch/um/include/shared/os.h       |   1 -
 arch/um/kernel/process.c          |   3 -
 arch/um/kernel/skas/mmu.c         |  35 ++++++----
 arch/um/kernel/tlb.c              |   5 +-
 arch/um/os-Linux/skas/process.c   | 107 ------------------------------
 arch/x86/um/ldt.c                 |  53 +++++++--------
 10 files changed, 76 insertions(+), 156 deletions(-)
 create mode 100644 arch/um/include/asm/mm_hooks.h

diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index b2d834a29f3a..de8d82a6fd7b 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -26,5 +26,4 @@ generic-y += switch_to.h
 generic-y += topology.h
 generic-y += trace_clock.h
 generic-y += kprobes.h
-generic-y += mm_hooks.h
 generic-y += vga.h
diff --git a/arch/um/include/asm/mm_hooks.h b/arch/um/include/asm/mm_hooks.h
new file mode 100644
index 000000000000..b1016520c5b8
--- /dev/null
+++ b/arch/um/include/asm/mm_hooks.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_UM_MM_HOOKS_H
+#define _ASM_UM_MM_HOOKS_H
+
+int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+
+static inline void arch_exit_mmap(struct mm_struct *mm)
+{
+}
+
+static inline void arch_unmap(struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+}
+
+static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
+		bool write, bool execute, bool foreign)
+{
+	/* by default, allow everything */
+	return true;
+}
+#endif	/* _ASM_UM_MM_HOOKS_H */
diff --git a/arch/um/include/asm/mmu.h b/arch/um/include/asm/mmu.h
index 5b072aba5b65..68a710d23b5d 100644
--- a/arch/um/include/asm/mmu.h
+++ b/arch/um/include/asm/mmu.h
@@ -18,7 +18,8 @@ typedef struct mm_context {
 extern void __switch_mm(struct mm_id * mm_idp);
 
 /* Avoid tangled inclusion with asm/ldt.h */
-extern long init_new_ldt(struct mm_context *to_mm, struct mm_context *from_mm);
+extern int init_new_ldt(struct mm_context *to_mm);
+extern int copy_ldt(struct mm_context *to_mm, struct mm_context *from_mm);
 extern void free_ldt(struct mm_context *mm);
 
 #endif
diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h
index 68e2eb9cfb47..8668861d4a85 100644
--- a/arch/um/include/asm/mmu_context.h
+++ b/arch/um/include/asm/mmu_context.h
@@ -13,7 +13,7 @@
 #include <asm/mm_hooks.h>
 #include <asm/mmu.h>
 
-extern void force_flush_all(void);
+void force_flush_all(struct mm_struct *mm);
 
 #define activate_mm activate_mm
 static inline void activate_mm(struct mm_struct *old, struct mm_struct *new)
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 1a82c6548dd5..c9acc28fe47c 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -289,7 +289,6 @@ extern int protect(struct mm_id * mm_idp, unsigned long addr,
 /* skas/process.c */
 extern int is_skas_winch(int pid, int fd, void *data);
 extern int start_userspace(unsigned long stub_stack);
-extern int copy_context_skas0(unsigned long stack, int pid);
 extern void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs);
 extern void new_thread(void *stack, jmp_buf *buf, void (*handler)(void));
 extern void switch_threads(jmp_buf *me, jmp_buf *you);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 6daffb9d8a8d..a024acd6d85c 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -25,7 +25,6 @@
 #include <linux/threads.h>
 #include <linux/resume_user_mode.h>
 #include <asm/current.h>
-#include <asm/mmu_context.h>
 #include <linux/uaccess.h>
 #include <as-layout.h>
 #include <kern_util.h>
@@ -139,8 +138,6 @@ void new_thread_handler(void)
 /* Called magically, see new_thread_handler above */
 void fork_handler(void)
 {
-	force_flush_all();
-
 	schedule_tail(current->thread.prev_sched);
 
 	/*
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 656fe16c9b63..ac4ca203ac24 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -10,13 +10,13 @@
 
 #include <asm/pgalloc.h>
 #include <asm/sections.h>
+#include <asm/mmu_context.h>
 #include <as-layout.h>
 #include <os.h>
 #include <skas.h>
 
 int init_new_context(struct task_struct *task, struct mm_struct *mm)
 {
- 	struct mm_context *from_mm = NULL;
 	struct mm_context *to_mm = &mm->context;
 	unsigned long stack = 0;
 	int ret = -ENOMEM;
@@ -26,14 +26,13 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 		goto out;
 
 	to_mm->id.stack = stack;
-	if (current->mm != NULL && current->mm != &init_mm)
-		from_mm = &current->mm->context;
 
+	/*
+	 * Allocate a completely fresh mm. We'll sync the mappings once
+	 * the rest of the kernel is done, in arch_copy_mm().
+	 */
 	block_signals_trace();
-	if (from_mm)
-		to_mm->id.u.pid = copy_context_skas0(stack,
-						     from_mm->id.u.pid);
-	else to_mm->id.u.pid = start_userspace(stack);
+	to_mm->id.u.pid = start_userspace(stack);
 	unblock_signals_trace();
 
 	if (to_mm->id.u.pid < 0) {
@@ -41,12 +40,9 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 		goto out_free;
 	}
 
-	ret = init_new_ldt(to_mm, from_mm);
-	if (ret < 0) {
-		printk(KERN_ERR "init_new_context_skas - init_ldt"
-		       " failed, errno = %d\n", ret);
+	ret = init_new_ldt(to_mm);
+	if (ret)
 		goto out_free;
-	}
 
 	return 0;
 
@@ -57,6 +53,21 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 	return ret;
 }
 
+int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+	int ret = copy_ldt(&mm->context, &oldmm->context);
+
+	if (ret < 0) {
+		printk(KERN_ERR "%s - copy_ldt failed, errno = %d\n",
+		       __func__, ret);
+		return ret;
+	}
+
+	force_flush_all(mm);
+	return 0;
+}
+
+
 void destroy_context(struct mm_struct *mm)
 {
 	struct mm_context *mmu = &mm->context;
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index 34ec8e677fb9..7c0161321fd9 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -600,14 +600,11 @@ void flush_tlb_mm(struct mm_struct *mm)
 		fix_range(mm, vma->vm_start, vma->vm_end, 0);
 }
 
-void force_flush_all(void)
+void force_flush_all(struct mm_struct *mm)
 {
-	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	VMA_ITERATOR(vmi, mm, 0);
 
-	mmap_read_lock(mm);
 	for_each_vma(vmi, vma)
 		fix_range(mm, vma->vm_start, vma->vm_end, 1);
-	mmap_read_unlock(mm);
 }
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index f92129bbf981..403f4c6082b6 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -508,113 +508,6 @@ void userspace(struct uml_pt_regs *regs, unsigned long *aux_fp_regs)
 	}
 }
 
-static unsigned long thread_regs[MAX_REG_NR];
-static unsigned long thread_fp_regs[FP_SIZE];
-
-static int __init init_thread_regs(void)
-{
-	get_safe_registers(thread_regs, thread_fp_regs);
-	/* Set parent's instruction pointer to start of clone-stub */
-	thread_regs[REGS_IP_INDEX] = STUB_CODE +
-				(unsigned long) stub_clone_handler -
-				(unsigned long) __syscall_stub_start;
-	thread_regs[REGS_SP_INDEX] = STUB_DATA + STUB_DATA_PAGES * UM_KERN_PAGE_SIZE -
-		sizeof(void *);
-#ifdef __SIGNAL_FRAMESIZE
-	thread_regs[REGS_SP_INDEX] -= __SIGNAL_FRAMESIZE;
-#endif
-	return 0;
-}
-
-__initcall(init_thread_regs);
-
-int copy_context_skas0(unsigned long new_stack, int pid)
-{
-	int err;
-	unsigned long current_stack = current_stub_stack();
-	struct stub_data *data = (struct stub_data *) current_stack;
-	struct stub_data *child_data = (struct stub_data *) new_stack;
-	unsigned long long new_offset;
-	int new_fd = phys_mapping(uml_to_phys((void *)new_stack), &new_offset);
-
-	/*
-	 * prepare offset and fd of child's stack as argument for parent's
-	 * and child's mmap2 calls
-	 */
-	*data = ((struct stub_data) {
-		.offset	= MMAP_OFFSET(new_offset),
-		.fd     = new_fd,
-		.parent_err = -ESRCH,
-		.child_err = 0,
-	});
-
-	*child_data = ((struct stub_data) {
-		.child_err = -ESRCH,
-	});
-
-	err = ptrace_setregs(pid, thread_regs);
-	if (err < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "%s : PTRACE_SETREGS failed, pid = %d, errno = %d\n",
-		      __func__, pid, -err);
-		return err;
-	}
-
-	err = put_fp_registers(pid, thread_fp_regs);
-	if (err < 0) {
-		printk(UM_KERN_ERR "%s : put_fp_registers failed, pid = %d, err = %d\n",
-		       __func__, pid, err);
-		return err;
-	}
-
-	/*
-	 * Wait, until parent has finished its work: read child's pid from
-	 * parent's stack, and check, if bad result.
-	 */
-	err = ptrace(PTRACE_CONT, pid, 0, 0);
-	if (err) {
-		err = -errno;
-		printk(UM_KERN_ERR "Failed to continue new process, pid = %d, errno = %d\n",
-		       pid, errno);
-		return err;
-	}
-
-	wait_stub_done(pid);
-
-	pid = data->parent_err;
-	if (pid < 0) {
-		printk(UM_KERN_ERR "%s - stub-parent reports error %d\n",
-		      __func__, -pid);
-		return pid;
-	}
-
-	/*
-	 * Wait, until child has finished too: read child's result from
-	 * child's stack and check it.
-	 */
-	wait_stub_done(pid);
-	if (child_data->child_err != STUB_DATA) {
-		printk(UM_KERN_ERR "%s - stub-child %d reports error %ld\n",
-		       __func__, pid, data->child_err);
-		err = data->child_err;
-		goto out_kill;
-	}
-
-	if (ptrace(PTRACE_OLDSETOPTIONS, pid, NULL,
-		   (void *)PTRACE_O_TRACESYSGOOD) < 0) {
-		err = -errno;
-		printk(UM_KERN_ERR "%s : PTRACE_OLDSETOPTIONS failed, errno = %d\n",
-		       __func__, errno);
-		goto out_kill;
-	}
-
-	return pid;
-
- out_kill:
-	os_kill_ptraced_process(pid, 1);
-	return err;
-}
-
 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
 {
 	(*buf)[0].JB_IP = (unsigned long) handler;
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 255a44dd415a..609feaeff23b 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -297,36 +297,37 @@ static void ldt_get_host_info(void)
 	free_pages((unsigned long)ldt, order);
 }
 
-long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
+int init_new_ldt(struct mm_context *new_mm)
 {
-	struct user_desc desc;
+	struct user_desc desc = {};
 	short * num_p;
-	int i;
-	long page, err=0;
+	int err = 0;
 	void *addr = NULL;
 
-
 	mutex_init(&new_mm->arch.ldt.lock);
 
-	if (!from_mm) {
-		memset(&desc, 0, sizeof(desc));
-		/*
-		 * Now we try to retrieve info about the ldt, we
-		 * inherited from the host. All ldt-entries found
-		 * will be reset in the following loop
-		 */
-		ldt_get_host_info();
-		for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
-			desc.entry_number = *num_p;
-			err = write_ldt_entry(&new_mm->id, 1, &desc,
-					      &addr, *(num_p + 1) == -1);
-			if (err)
-				break;
-		}
-		new_mm->arch.ldt.entry_count = 0;
-
-		goto out;
+	memset(&desc, 0, sizeof(desc));
+	/*
+	 * Now we try to retrieve info about the ldt, we
+	 * inherited from the host. All ldt-entries found
+	 * will be reset in the following loop
+	 */
+	ldt_get_host_info();
+	for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
+		desc.entry_number = *num_p;
+		err = write_ldt_entry(&new_mm->id, 1, &desc,
+				      &addr, *(num_p + 1) == -1);
+		if (err)
+			break;
 	}
+	new_mm->arch.ldt.entry_count = 0;
+
+	return err;
+}
+
+int copy_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
+{
+	int err = 0;
 
 	/*
 	 * Our local LDT is used to supply the data for
@@ -339,7 +340,9 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 		memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
 		       sizeof(new_mm->arch.ldt.u.entries));
 	else {
-		i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+		int i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+		unsigned long page;
+
 		while (i-->0) {
 			page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
 			if (!page) {
@@ -355,11 +358,9 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 	new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
 	mutex_unlock(&from_mm->arch.ldt.lock);
 
-    out:
 	return err;
 }
 
-
 void free_ldt(struct mm_context *mm)
 {
 	int i;
-- 
2.41.0