[RFC PATCH v2 07/13] x86/um: nommu: process/thread handling
Hajime Tazaki
thehajime at gmail.com
Sun Nov 10 22:27:07 PST 2024
Since the ptrace facility isn't used under !MMU UML, there is a different
code path to invoke processes/threads: on entry to the syscall
interface, the stack pointer has to be manipulated to handle the vfork(2)
return address; no external process is used; and some registers (the fs
segment register for TLS, etc.) need to be properly configured on every
context switch.
Signals aren't delivered on non-ptrace syscall entry/exit, so we also
need to handle pending signals ourselves.
Signed-off-by: Hajime Tazaki <thehajime at gmail.com>
Signed-off-by: Ricardo Koller <ricarkol at google.com>
---
arch/um/kernel/process.c | 33 +++++++++++++++++++++++++++++-
arch/um/os-Linux/process.c | 6 ++++++
arch/um/os-Linux/skas/process.c | 4 ++++
arch/x86/um/asm/processor.h | 12 +++++++++++
arch/x86/um/do_syscall_64.c | 36 +++++++++++++++++++++++++++++++++
arch/x86/um/entry_64.S | 21 +++++++++++++++++++
arch/x86/um/syscalls_64.c | 12 +++++++++++
7 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 56e7e525fc91..b3708dceb731 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -116,13 +116,17 @@ void new_thread_handler(void)
* callback returns only if the kernel thread execs a process
*/
fn(arg);
+#ifndef CONFIG_MMU
+ arch_switch_to(current);
+#endif
userspace(&current->thread.regs.regs);
}
/* Called magically, see new_thread_handler above */
static void fork_handler(void)
{
- schedule_tail(current->thread.prev_sched);
+ if (current->thread.prev_sched)
+ schedule_tail(current->thread.prev_sched);
/*
* XXX: if interrupt_end() calls schedule, this call to
@@ -133,6 +137,33 @@ static void fork_handler(void)
current->thread.prev_sched = NULL;
+#ifndef CONFIG_MMU
+ /*
+ * child of vfork(2) comes here.
+ * clone(2) also enters here but doesn't need to advance the %rsp.
+ *
+ * This fork can only come from libc's vfork, which
+ * does this:
+ * popq %%rdx;
+ * call *%rax; // zpoline => __kernel_vsyscall
+ * pushq %%rdx;
+ * %rcx stores the return address which is stored
+ * at pt_regs[HOST_IP] at the moment. As child returns
+ * via userspace() with a jmp instruction (while parent
+ * does via ret instruction in __kernel_vsyscall), we
+ * need to pop (advance) the pushed address by "call"
+ * though, so this is what this next line does.
+ *
+ * When the child returns from vfork, it overwrites the stack
+ * contents (via the pushq in vfork), which would confuse the
+ * parent once the child has returned.
+ *
+ * Thus the contents must be restored before the parent returns
+ * from vfork; this is done in do_syscall_64().
+ */
+ if (current->thread.regs.regs.gp[HOST_ORIG_AX] == __NR_vfork)
+ current->thread.regs.regs.gp[REGS_SP_INDEX] += 8;
+#endif
userspace(&current->thread.regs.regs);
}
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index ed3d99301dc8..5acf6d41a4c2 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -25,7 +25,10 @@
+/*
+ * Send SIGALRM to the host process @pid.
+ * When CONFIG_MMU is not defined the kill() call is compiled out and this
+ * function does nothing.
+ */
void os_alarm_process(int pid)
{
+/* !CONFIG_MMU doesn't send alarm signal to other processes */
+#ifdef CONFIG_MMU
kill(pid, SIGALRM);
+#endif
}
void os_kill_process(int pid, int reap_child)
@@ -42,11 +45,14 @@ void os_kill_process(int pid, int reap_child)
+/*
+ * Kill the ptraced host process @pid: send SIGKILL, then issue PTRACE_KILL
+ * and PTRACE_CONT on it. If @reap_child is non-zero, also waitpid() for it
+ * with __WALL (wrapped in CATCH_EINTR, presumably to retry on EINTR).
+ * When CONFIG_MMU is not defined the whole body is compiled out and this
+ * function does nothing.
+ */
void os_kill_ptraced_process(int pid, int reap_child)
{
+/* !CONFIG_MMU doesn't have ptraced process */
+#ifdef CONFIG_MMU
kill(pid, SIGKILL);
ptrace(PTRACE_KILL, pid);
ptrace(PTRACE_CONT, pid);
if (reap_child)
CATCH_EINTR(waitpid(pid, NULL, __WALL));
+#endif
}
/* Don't use the glibc version, which caches the result in TLS. It misses some
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index f683cfc9e51a..291136008431 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -144,6 +144,7 @@ void wait_stub_done(int pid)
extern unsigned long current_stub_stack(void);
+#ifdef CONFIG_MMU
static void get_skas_faultinfo(int pid, struct faultinfo *fi)
{
int err;
@@ -176,6 +177,7 @@ static void handle_trap(int pid, struct uml_pt_regs *regs)
handle_syscall(regs);
}
+#endif
extern char __syscall_stub_start[];
@@ -389,6 +391,7 @@ int start_userspace(unsigned long stub_stack)
}
int unscheduled_userspace_iterations;
+#ifdef CONFIG_MMU
extern unsigned long tt_extra_sched_jiffies;
void userspace(struct uml_pt_regs *regs)
@@ -550,6 +553,7 @@ void userspace(struct uml_pt_regs *regs)
}
}
}
+#endif /* CONFIG_MMU */
void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
{
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 478710384b34..d88d7d9d5c18 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -38,6 +38,18 @@ static __always_inline void cpu_relax(void)
#define task_pt_regs(t) (&(t)->thread.regs)
+#ifndef CONFIG_MMU
+/*
+ * Address one past the end of @task's kernel stack area, i.e.
+ * (unsigned long)task->stack + THREAD_SIZE.
+ * The argument is parenthesized so that any pointer-valued expression
+ * (not just a plain identifier) can be passed safely.
+ */
+#define task_top_of_stack(task) \
+({ \
+	unsigned long __ptr = (unsigned long)(task)->stack; \
+	__ptr += THREAD_SIZE; \
+	__ptr; \
+})
+
+extern long current_top_of_stack;
+extern long current_ptregs;
+#endif
+
#include <asm/processor-generic.h>
#endif
diff --git a/arch/x86/um/do_syscall_64.c b/arch/x86/um/do_syscall_64.c
index a1189ddb2b50..203bacc4cb3c 100644
--- a/arch/x86/um/do_syscall_64.c
+++ b/arch/x86/um/do_syscall_64.c
@@ -1,14 +1,43 @@
// SPDX-License-Identifier: GPL-2.0
+//#define DEBUG 1
#include <linux/kernel.h>
#include <linux/ptrace.h>
#include <kern_util.h>
#include <sysdep/syscalls.h>
#include <os.h>
+/*
+ * save/restore the return address stored in the stack, as the child overwrites
+ * the contents after returning to userspace (i.e., by push %rdx).
+ *
+ * see the detail in fork_handler().
+ *
+ * vfork_save_stack() duplicates the 8 bytes found at the address held in
+ * current's gp[HOST_SP] slot into a newly allocated kernel buffer.
+ *
+ * Returns the copy, or NULL if the allocation failed. The caller owns the
+ * returned buffer.
+ */
+static void *vfork_save_stack(void)
+{
+	/* kmemdup() allocates and copies in one step; no zeroing is needed */
+	return kmemdup((void *)current->thread.regs.regs.gp[HOST_SP],
+		       8, GFP_KERNEL);
+}
+
+/*
+ * Write the 8 bytes saved in @stack_copy back to the address held in
+ * current's gp[HOST_SP] slot, then free @stack_copy.
+ *
+ * @stack_copy: buffer returned by vfork_save_stack(); may be NULL if that
+ *              allocation failed, in which case a one-time warning is
+ *              emitted and nothing is written.
+ *
+ * The buffer must not be used by the caller after this returns.
+ */
+static void vfork_restore_stack(void *stack_copy)
+{
+	/* never memcpy() from a NULL source: warn once and bail out */
+	if (WARN_ON_ONCE(!stack_copy))
+		return;
+
+	memcpy((void *)current->thread.regs.regs.gp[HOST_SP],
+	       stack_copy, 8);
+
+	/* the saved copy is single-use; release it to avoid a leak per vfork */
+	kfree(stack_copy);
+}
+
__visible void do_syscall_64(struct pt_regs *regs)
{
int syscall;
+ unsigned char *stack_copy = NULL;
syscall = PT_SYSCALL_NR(regs->regs.gp);
UPT_SYSCALL_NR(&regs->regs) = syscall;
@@ -17,6 +46,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
syscall, (unsigned long)current,
(unsigned long)sys_call_table[syscall]);
+ if (syscall == __NR_vfork)
+ stack_copy = vfork_save_stack();
+
if (likely(syscall < NR_syscalls)) {
PT_REGS_SET_SYSCALL_RETURN(regs,
EXECUTE_SYSCALL(syscall, regs));
@@ -34,4 +66,8 @@ __visible void do_syscall_64(struct pt_regs *regs)
/* execve succeeded */
if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
userspace(&current->thread.regs.regs);
+
+ /* only parents of vfork restores the contents of stack */
+ if (syscall == __NR_vfork && regs->regs.gp[HOST_AX] > 0)
+ vfork_restore_stack(stack_copy);
}
diff --git a/arch/x86/um/entry_64.S b/arch/x86/um/entry_64.S
index 022a8122690b..32f5002e2eb0 100644
--- a/arch/x86/um/entry_64.S
+++ b/arch/x86/um/entry_64.S
@@ -85,3 +85,24 @@ ENTRY(__kernel_vsyscall)
ret
END(__kernel_vsyscall)
+
+// void userspace(struct uml_pt_regs *regs)
+//
+// Switch to the register frame whose address is stored in current_ptregs
+// and jump to the instruction pointer saved in it. Control leaves via an
+// indirect jmp, so this routine does not return to its caller.
+// NOTE(review): the regs argument is not referenced by the instructions
+// visible here; the frame address is read from current_ptregs instead.
+ENTRY(userspace)
+ /* align the stack for x86_64 ABI */
+ and $-0x10, %rsp
+ /* Handle any immediate reschedules or signals */
+ call interrupt_end
+
+ /* point %rsp at the saved register frame so it can be popped */
+ movq current_ptregs, %rsp
+
+ /* POP_REGS is defined elsewhere; presumably it pops the saved general-purpose registers */
+ POP_REGS
+
+ /* walk over the remaining frame slots; only ip and sp are consumed */
+ addq $8, %rsp /* skip orig_ax */
+ popq %r11 /* pt_regs->ip */
+ addq $8, %rsp /* skip cs */
+ addq $8, %rsp /* skip flags */
+ /* load the saved stack pointer; %rsp no longer points into the frame */
+ popq %rsp
+
+ /* %r11 holds the saved ip popped above, so its previous value is lost */
+ jmp *%r11
+
+END(userspace)
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index 6a00a28c9cca..edb17fc73e07 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -51,6 +51,18 @@ void arch_switch_to(struct task_struct *to)
* Nothing needs to be done on x86_64.
* The FS_BASE/GS_BASE registers are saved in the ptrace register set.
*/
+#ifndef CONFIG_MMU
+ current_top_of_stack = task_top_of_stack(to);
+ current_ptregs = (long)task_pt_regs(to);
+
+ if ((to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] == 0) ||
+ (to->mm == NULL))
+ return;
+
+ /* this changes the FS on every context switch */
+ arch_prctl(to, ARCH_SET_FS,
+ (void __user *) to->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)]);
+#endif
}
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
--
2.43.0
More information about the linux-um
mailing list