[RFC PATCH v2 08/13] um: nommu: configure fs register on host syscall invocation
Hajime Tazaki
thehajime at gmail.com
Sun Nov 10 22:27:08 PST 2024
As userspace on UML/!MMU also need to configure %fs register when it is
running to correctly access thread structure, host syscalls implemented
in os-Linux drivers may be puzzled when they are called. Thus it has to
configure %fs register via arch_prctl(SET_FS) on every host syscalls.
Signed-off-by: Hajime Tazaki <thehajime at gmail.com>
Signed-off-by: Ricardo Koller <ricarkol at google.com>
---
arch/um/include/shared/os.h | 5 ++++
arch/um/os-Linux/Makefile | 4 +--
arch/um/os-Linux/cpu.c | 50 ++++++++++++++++++++++++++++++++
arch/um/os-Linux/internal.h | 5 ++++
arch/um/os-Linux/main.c | 5 ++++
arch/um/os-Linux/process.c | 8 ++++++
arch/um/os-Linux/start_up.c | 3 ++
arch/x86/um/do_syscall_64.c | 35 +++++++++++++++++++++++
arch/x86/um/syscalls_64.c | 57 +++++++++++++++++++++++++++++++++++++
9 files changed, 170 insertions(+), 2 deletions(-)
create mode 100644 arch/um/os-Linux/cpu.c
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 5a6722f254d5..69a7854f5f87 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -136,6 +136,9 @@ static inline struct openflags of_cloexec(struct openflags flags)
return flags;
}
+/* cpu.c */
+extern int host_has_fsgsbase;
+
/* file.c */
extern int os_stat_file(const char *file_name, struct uml_stat *buf);
extern int os_stat_fd(const int fd, struct uml_stat *buf);
@@ -221,6 +224,8 @@ extern int os_drop_memory(void *addr, int length);
extern int can_drop_memory(void);
extern int os_mincore(void *addr, unsigned long len);
#ifndef CONFIG_MMU
+extern long long host_fs;
+extern int os_arch_prctl(int pid, int option, unsigned long *arg);
extern int os_setup_seccomp(void);
#endif
diff --git a/arch/um/os-Linux/Makefile b/arch/um/os-Linux/Makefile
index 20ff8d5971db..af7c5f4373bc 100644
--- a/arch/um/os-Linux/Makefile
+++ b/arch/um/os-Linux/Makefile
@@ -8,7 +8,7 @@ KCOV_INSTRUMENT := n
obj-y = execvp.o file.o helper.o irq.o main.o mem.o process.o \
registers.o sigio.o signal.o start_up.o time.o tty.o \
- umid.o user_syms.o util.o drivers/ skas/
+ umid.o user_syms.o util.o cpu.o drivers/ skas/
CFLAGS_signal.o += -Wframe-larger-than=4096
@@ -18,7 +18,7 @@ obj-$(CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA) += elf_aux.o
USER_OBJS := $(user-objs-y) elf_aux.o execvp.o file.o helper.o irq.o \
main.o mem.o process.o registers.o sigio.o signal.o start_up.o time.o \
- tty.o umid.o util.o
+ tty.o umid.o util.o cpu.o
include $(srctree)/arch/um/scripts/Makefile.rules
CFLAGS_process.o=-g -O0
diff --git a/arch/um/os-Linux/cpu.c b/arch/um/os-Linux/cpu.c
new file mode 100644
index 000000000000..49b6d8b4d65d
--- /dev/null
+++ b/arch/um/os-Linux/cpu.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <string.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <init.h>
+#include "internal.h"
+
+int host_has_fsgsbase;
+/* those definitions can be pulled from os.h but if we include this
+ * it shows conflicts of jmp_buf definitions in longjmp.h (UM) and
+ * host one. thus we declared here instead.
+ */
+void os_info(const char *fmt, ...);
+void os_warn(const char *fmt, ...);
+
+/**
+ * get_host_cpu_features() return true with X86_FEATURE_FSGSBASE even
+ * if the kernel is older and disabled using fsgsbase instruction.
+ * thus detection is based on whether SIGILL is raised or not.
+ */
+static jmp_buf jmpbuf;
+static void sigill(int sig, siginfo_t *si, void *ctx_void)
+{
+ siglongjmp(jmpbuf, 1);
+}
+
+void __init check_fsgsbase(void)
+{
+ unsigned long fsbase;
+ struct sigaction sa;
+
+ /* Probe FSGSBASE */
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = sigill;
+ sa.sa_flags = SA_SIGINFO | SA_RESETHAND;
+ sigemptyset(&sa.sa_mask);
+ if (sigaction(SIGILL, &sa, 0))
+ os_warn("sigaction");
+
+ os_info("Checking FSGSBASE instructions...");
+ if (sigsetjmp(jmpbuf, 0) == 0) {
+ asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+ host_has_fsgsbase = 1;
+ os_info("OK\n");
+ } else {
+ host_has_fsgsbase = 0;
+ os_info("disabled\n");
+ }
+}
diff --git a/arch/um/os-Linux/internal.h b/arch/um/os-Linux/internal.h
index 317fca190c2b..60220b8b8843 100644
--- a/arch/um/os-Linux/internal.h
+++ b/arch/um/os-Linux/internal.h
@@ -2,6 +2,11 @@
#ifndef __UM_OS_LINUX_INTERNAL_H
#define __UM_OS_LINUX_INTERNAL_H
+/*
+ * cpu.c
+ */
+void check_fsgsbase(void);
+
/*
* elf_aux.c
*/
diff --git a/arch/um/os-Linux/main.c b/arch/um/os-Linux/main.c
index 0afcdeb8995b..aecf63d3db79 100644
--- a/arch/um/os-Linux/main.c
+++ b/arch/um/os-Linux/main.c
@@ -17,6 +17,7 @@
#include <kern_util.h>
#include <os.h>
#include <um_malloc.h>
+#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include "internal.h"
#define PGD_BOUND (4 * 1024 * 1024)
@@ -158,6 +159,10 @@ int __init main(int argc, char **argv, char **envp)
change_sig(SIGPIPE, 0);
ret = linux_main(argc, argv, envp);
+#ifndef CONFIG_MMU
+ os_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
+#endif
+
/*
* Disable SIGPROF - I have no idea why libc doesn't do this or turn
* off the profiling time, but UML dies with a SIGPROF just before
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 5acf6d41a4c2..5a3b09096f92 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -221,6 +221,14 @@ void os_set_pdeathsig(void)
}
#ifndef CONFIG_MMU
+#include <unistd.h>
+#include <sys/syscall.h> /* For SYS_xxx definitions */
+
+int os_arch_prctl(int pid, int option, unsigned long *arg2)
+{
+ return syscall(SYS_arch_prctl, option, arg2);
+}
+
int os_setup_seccomp(void)
{
int err;
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 93fc82c01aba..88164893cbec 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -293,6 +293,9 @@ void __init os_early_checks(void)
*/
check_tmpexec();
+ /* probe fsgsbase instruction */
+ check_fsgsbase();
+
pid = start_ptraced_child();
if (init_pid_registers(pid))
fatal("Failed to initialize default registers");
diff --git a/arch/x86/um/do_syscall_64.c b/arch/x86/um/do_syscall_64.c
index 203bacc4cb3c..75326acc931b 100644
--- a/arch/x86/um/do_syscall_64.c
+++ b/arch/x86/um/do_syscall_64.c
@@ -3,6 +3,8 @@
//#define DEBUG 1
#include <linux/kernel.h>
#include <linux/ptrace.h>
+#include <asm/fsgsbase.h>
+#include <asm/prctl.h>
#include <kern_util.h>
#include <sysdep/syscalls.h>
#include <os.h>
@@ -34,6 +36,31 @@ static void vfork_restore_stack(void *stack_copy)
stack_copy, 8);
}
+static int os_x86_arch_prctl(int pid, int option, unsigned long *arg2)
+{
+ if (host_has_fsgsbase) {
+ switch (option) {
+ case ARCH_SET_FS:
+ wrfsbase(*arg2);
+ break;
+ case ARCH_SET_GS:
+ wrgsbase(*arg2);
+ break;
+ case ARCH_GET_FS:
+ *arg2 = rdfsbase();
+ break;
+ case ARCH_GET_GS:
+ *arg2 = rdgsbase();
+ break;
+ }
+ return 0;
+ } else {
+ return os_arch_prctl(pid, option, arg2);
+ }
+
+ return 0;
+}
+
__visible void do_syscall_64(struct pt_regs *regs)
{
int syscall;
@@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
if (syscall == __NR_vfork)
stack_copy = vfork_save_stack();
+ /* set fs register to the original host one */
+ os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
+
if (likely(syscall < NR_syscalls)) {
PT_REGS_SET_SYSCALL_RETURN(regs,
EXECUTE_SYSCALL(syscall, regs));
@@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs)
set_thread_flag(TIF_SIGPENDING);
interrupt_end();
+ /* restore back fs register to userspace configured one */
+ os_x86_arch_prctl(0, ARCH_SET_FS,
+ (void *)(current->thread.regs.regs.gp[FS_BASE
+ / sizeof(unsigned long)]));
+
/* execve succeeded */
if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
userspace(¤t->thread.regs.regs);
diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
index edb17fc73e07..d56df936a2d7 100644
--- a/arch/x86/um/syscalls_64.c
+++ b/arch/x86/um/syscalls_64.c
@@ -12,11 +12,26 @@
#include <asm/prctl.h> /* XXX This should get the constants from libc */
#include <registers.h>
#include <os.h>
+#include <asm/thread_info.h>
+#include <asm/mman.h>
+
+#ifndef CONFIG_MMU
+/*
+ * The guest libc can change FS, which confuses the host libc.
+ * In fact, changing FS directly is not supported (check
+ * man arch_prctl). So, whenever we make a host syscall,
+ * we should be changing FS to the original FS (not the
+ * one set by the guest libc). This original FS is stored
+ * in host_fs.
+ */
+long long host_fs = -1;
+#endif
long arch_prctl(struct task_struct *task, int option,
unsigned long __user *arg2)
{
long ret = -EINVAL;
+#ifdef CONFIG_MMU
switch (option) {
case ARCH_SET_FS:
@@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option,
}
return ret;
+#else
+
+ unsigned long *ptr = arg2, tmp;
+
+ switch (option) {
+ case ARCH_SET_FS:
+ if (host_fs == -1)
+ os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs);
+ ret = 0;
+ break;
+ case ARCH_SET_GS:
+ ret = 0;
+ break;
+ case ARCH_GET_FS:
+ case ARCH_GET_GS:
+ ptr = &tmp;
+ break;
+ }
+
+ ret = os_arch_prctl(0, option, ptr);
+ if (ret)
+ return ret;
+
+ switch (option) {
+ case ARCH_SET_FS:
+ current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ break;
+ case ARCH_SET_GS:
+ current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
+ (unsigned long) arg2;
+ break;
+ case ARCH_GET_FS:
+ ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
+ break;
+ case ARCH_GET_GS:
+ ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
+ break;
+ }
+
+ return ret;
+#endif
}
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
--
2.43.0
More information about the linux-um
mailing list