[PATCH v3 08/13] um: nommu: configure fs register on host syscall invocation

Hajime Tazaki thehajime at gmail.com
Mon Dec 2 20:23:07 PST 2024


Userspace on UML/!MMU also needs to configure the %fs register while it
is running in order to access its thread structure correctly, so host
syscalls implemented in the os-Linux drivers may get confused when they
are called.  Thus the %fs register has to be configured via
arch_prctl(ARCH_SET_FS) on every host syscall.

Signed-off-by: Hajime Tazaki <thehajime at gmail.com>
Signed-off-by: Ricardo Koller <ricarkol at google.com>
---
 arch/um/include/shared/os.h |  3 ++
 arch/um/os-Linux/main.c     |  5 ++++
 arch/um/os-Linux/process.c  |  8 ++++++
 arch/um/os-Linux/start_up.c | 20 +++++++++++++
 arch/x86/um/do_syscall_64.c | 36 +++++++++++++++++++++++
 arch/x86/um/syscalls_64.c   | 57 +++++++++++++++++++++++++++++++++++++
 6 files changed, 129 insertions(+)

diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index c979a8b15434..f7f4da322906 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -190,6 +190,7 @@ extern void check_host_supports_tls(int *supports_tls, int *tls_min);
 extern void get_host_cpu_features(
 	void (*flags_helper_func)(char *line),
 	void (*cache_helper_func)(char *line));
+extern int host_has_fsgsbase;
 
 /* mem.c */
 extern int create_mem_file(unsigned long long len);
@@ -221,6 +222,8 @@ extern int os_drop_memory(void *addr, int length);
 extern int can_drop_memory(void);
 extern int os_mincore(void *addr, unsigned long len);
 #ifndef CONFIG_MMU
+extern long long host_fs;
+extern int os_arch_prctl(int pid, int option, unsigned long *arg);
 extern int os_setup_seccomp(void);
 #endif
 
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index 0afcdeb8995b..aecf63d3db79 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -17,6 +17,7 @@
 #include <kern_util.h>
 #include <os.h>
 #include <um_malloc.h>
+#include <asm/prctl.h> /* XXX This should get the constants from libc */
 #include "internal.h"
 
 #define PGD_BOUND (4 * 1024 * 1024)
@@ -158,6 +159,10 @@ int __init main(int argc, char **argv, char **envp)
 	change_sig(SIGPIPE, 0);
 	ret = linux_main(argc, argv, envp);
 
+#ifndef CONFIG_MMU
+	os_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
+#endif
+
 	/*
 	 * Disable SIGPROF - I have no idea why libc doesn't do this or turn
 	 * off the profiling time, but UML dies with a SIGPROF just before
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 51473b834497..346d297e89fe 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -221,6 +221,14 @@ void os_set_pdeathsig(void)
 }
 
 #ifndef CONFIG_MMU
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
+{
+	return syscall(SYS_arch_prctl, option, arg2);	/* pid is unused */
+}
+
 int os_setup_seccomp(void)
 {
 	int err;
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 93fc82c01aba..dbab091892b3 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -19,6 +19,8 @@
 #include <sys/resource.h>
 #include <asm/ldt.h>
 #include <asm/unistd.h>
+#include <sys/auxv.h>
+#include <asm/hwcap2.h>
 #include <init.h>
 #include <os.h>
 #include <kern_util.h>
@@ -28,6 +30,8 @@
 #include <skas.h>
 #include "internal.h"
 
+int host_has_fsgsbase;
+
 static void ptrace_child(void)
 {
 	int ret;
@@ -278,6 +282,19 @@ void  __init get_host_cpu_features(
 	}
 }
 
+/* Probe AT_HWCAP2 for FSGSBASE and record it in host_has_fsgsbase. */
+static void __init check_fsgsbase(void)
+{
+	unsigned long hwcap2 = getauxval(AT_HWCAP2);
+
+	os_info("Checking FSGSBASE instructions...");
+	host_has_fsgsbase = (hwcap2 & HWCAP2_FSGSBASE) ? 1 : 0;
+
+	if (host_has_fsgsbase)
+		os_info("OK\n");
+	else
+		os_info("disabled\n");
+}
 
 void __init os_early_checks(void)
 {
@@ -293,6 +310,9 @@ void __init os_early_checks(void)
 	 */
 	check_tmpexec();
 
+	/* probe fsgsbase instruction */
+	check_fsgsbase();
+
 	pid = start_ptraced_child();
 	if (init_pid_registers(pid))
 		fatal("Failed to initialize default registers");
diff --git a/arch/x86/um/do_syscall_64.c b/arch/x86/um/do_syscall_64.c
index ca468caff729..c7e48c74c7a5 100644
--- a/arch/x86/um/do_syscall_64.c
+++ b/arch/x86/um/do_syscall_64.c
@@ -3,6 +3,8 @@
 //#define DEBUG 1
 #include <linux/kernel.h>
 #include <linux/ptrace.h>
+#include <asm/fsgsbase.h>
+#include <asm/prctl.h>
 #include <kern_util.h>
 #include <sysdep/syscalls.h>
 #include <os.h>
@@ -34,6 +36,31 @@ static void vfork_restore_stack(void *stack_copy)
 	       stack_copy, 8);
 }
 
+/*
+ * Set/get FS/GS base (directly when the host has FSGSBASE). As in
+ * arch_prctl(2), arg2 is the value for SET_* and a pointer for GET_*.
+ */
+static int os_x86_arch_prctl(int pid, int option, unsigned long *arg2)
+{
+	if (!host_has_fsgsbase)
+		return os_arch_prctl(pid, option, arg2);
+	switch (option) {
+	case ARCH_SET_FS:
+		wrfsbase((unsigned long)arg2);
+		break;
+	case ARCH_SET_GS:
+		wrgsbase((unsigned long)arg2);
+		break;
+	case ARCH_GET_FS:
+		*arg2 = rdfsbase();
+		break;
+	case ARCH_GET_GS:
+		*arg2 = rdgsbase();
+		break;
+	}
+	return 0;
+}
+
 __visible void do_syscall_64(struct pt_regs *regs)
 {
 	int syscall;
@@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
 	if (syscall == __NR_vfork)
 		stack_copy = vfork_save_stack();
 
+	/* set fs register to the original host one */
+	os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
+
 	if (likely(syscall < NR_syscalls)) {
 		PT_REGS_SET_SYSCALL_RETURN(regs,
 				EXECUTE_SYSCALL(syscall, regs));
@@ -70,4 +100,10 @@ __visible void do_syscall_64(struct pt_regs *regs)
 	/* force do_signal() --> is_syscall() */
 	set_thread_flag(TIF_SIGPENDING);
 	interrupt_end();
+
+	/* restore the fs register to the one configured by userspace */
+	os_x86_arch_prctl(0, ARCH_SET_FS,
+		      (void *)(current->thread.regs.regs.gp[FS_BASE
+						     / sizeof(unsigned long)]));
+
 }
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index edb17fc73e07..d56df936a2d7 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -12,11 +12,26 @@
 #include <asm/prctl.h> /* XXX This should get the constants from libc */
 #include <registers.h>
 #include <os.h>
+#include <asm/thread_info.h>
+#include <asm/mman.h>
+
+#ifndef CONFIG_MMU
+/*
+ * The guest libc can change FS, which confuses the host libc.
+ * In fact, changing FS directly is not supported (check
+ * man arch_prctl). So, whenever we make a host syscall,
+ * we should be changing FS to the original FS (not the
+ * one set by the guest libc). This original FS is stored
+ * in host_fs.
+ */
+long long host_fs = -1;
+#endif
 
 long arch_prctl(struct task_struct *task, int option,
 		unsigned long __user *arg2)
 {
 	long ret = -EINVAL;
+#ifdef CONFIG_MMU
 
 	switch (option) {
 	case ARCH_SET_FS:
@@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option,
 	}
 
 	return ret;
+#else
+
+	unsigned long *ptr = arg2, tmp;
+
+	switch (option) {
+	case ARCH_SET_FS:
+		if (host_fs == -1)
+			os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs);
+		ret = 0;
+		break;
+	case ARCH_SET_GS:
+		ret = 0;
+		break;
+	case ARCH_GET_FS:
+	case ARCH_GET_GS:
+		ptr = &tmp;
+		break;
+	}
+
+	ret = os_arch_prctl(0, option, ptr);
+	if (ret)
+		return ret;
+
+	switch (option) {
+	case ARCH_SET_FS:
+		current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
+			(unsigned long) arg2;
+		break;
+	case ARCH_SET_GS:
+		current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
+			(unsigned long) arg2;
+		break;
+	case ARCH_GET_FS:
+		ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
+		break;
+	case ARCH_GET_GS:
+		ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
+		break;
+	}
+
+	return ret;
+#endif
 }
 
 SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
-- 
2.43.0




More information about the linux-um mailing list