[RFC PATCH v2 08/13] um: nommu: configure fs register on host syscall invocation

Benjamin Berg benjamin at sipsolutions.net
Wed Nov 27 02:00:11 PST 2024


Hi,

On Mon, 2024-11-11 at 15:27 +0900, Hajime Tazaki wrote:
> As userspace on UML/!MMU also needs to configure the %fs register when it
> is running in order to correctly access the thread structure, host syscalls
> implemented in os-Linux drivers may be confused when they are called.  Thus
> the %fs register has to be configured via arch_prctl(SET_FS) on every host
> syscall.
> 
> [SNIP]
> +
> +/**
> + * get_host_cpu_features() returns true for X86_FEATURE_FSGSBASE even
> + * if the kernel is older and has disabled use of the fsgsbase instructions.
> + * Thus detection is based on whether SIGILL is raised or not.
> + */
> +static jmp_buf jmpbuf;
> +static void sigill(int sig, siginfo_t *si, void *ctx_void)
> +{
> +	siglongjmp(jmpbuf, 1);
> +}
> +
> +void __init check_fsgsbase(void)
> +{
> +	unsigned long fsbase;
> +	struct sigaction sa;
> +
> +	/* Probe FSGSBASE */
> +	memset(&sa, 0, sizeof(sa));
> +	sa.sa_sigaction = sigill;
> +	sa.sa_flags = SA_SIGINFO | SA_RESETHAND;
> +	sigemptyset(&sa.sa_mask);
> +	if (sigaction(SIGILL, &sa, 0))
> +		os_warn("sigaction");
> +
> +	os_info("Checking FSGSBASE instructions...");
> +	if (sigsetjmp(jmpbuf, 0) == 0) {
> +		asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
> +		host_has_fsgsbase = 1;
> +		os_info("OK\n");
> +	} else {
> +		host_has_fsgsbase = 0;
> +		os_info("disabled\n");
> +	}
> +}

According to Documentation/arch/x86/x86_64/fsgs.rst, it looks like this
can also be checked by testing the HWCAP2_FSGSBASE bit in AT_HWCAP2
(e.g. getauxval(AT_HWCAP2) & HWCAP2_FSGSBASE).

Maybe that is a bit simpler?

> [SNIP]
> 
>  __visible void do_syscall_64(struct pt_regs *regs)
>  {
>  	int syscall;
> @@ -49,6 +76,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
>  	if (syscall == __NR_vfork)
>  		stack_copy = vfork_save_stack();
>  
> +	/* set fs register to the original host one */
> +	os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
> +
>  	if (likely(syscall < NR_syscalls)) {
>  		PT_REGS_SET_SYSCALL_RETURN(regs,
>  				EXECUTE_SYSCALL(syscall, regs));
> @@ -63,6 +93,11 @@ __visible void do_syscall_64(struct pt_regs *regs)
>  	set_thread_flag(TIF_SIGPENDING);
>  	interrupt_end();
>  
> +	/* restore back fs register to userspace configured one */
> +	os_x86_arch_prctl(0, ARCH_SET_FS,
> +		      (void *)(current->thread.regs.regs.gp[FS_BASE
> +						     / sizeof(unsigned long)]));
> +
>  	/* execve succeeded */
>  	if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
>  		userspace(&current->thread.regs.regs);
> diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c
> index edb17fc73e07..d56df936a2d7 100644
> --- a/arch/x86/um/syscalls_64.c
> +++ b/arch/x86/um/syscalls_64.c
> @@ -12,11 +12,26 @@
>  #include <asm/prctl.h> /* XXX This should get the constants from libc */
>  #include <registers.h>
>  #include <os.h>
> +#include <asm/thread_info.h>
> +#include <asm/mman.h>
> +
> +#ifndef CONFIG_MMU
> +/*
> + * The guest libc can change FS, which confuses the host libc.
> + * In fact, changing FS directly is not supported (check
> + * man arch_prctl). So, whenever we make a host syscall,
> + * we should be changing FS to the original FS (not the
> + * one set by the guest libc). This original FS is stored
> + * in host_fs.
> + */
> +long long host_fs = -1;

Right, the host libc already uses the FS register for its own thread-local
storage. That is a bit annoying, as UML doesn't need threading in that sense.

Note that similar handling needs to happen on every userspace-to-kernel
switch. I guess the only other location is the signal handler.

Benjamin

> +#endif
>  
>  long arch_prctl(struct task_struct *task, int option,
>  		unsigned long __user *arg2)
>  {
>  	long ret = -EINVAL;
> +#ifdef CONFIG_MMU
>  
>  	switch (option) {
>  	case ARCH_SET_FS:
> @@ -38,6 +53,48 @@ long arch_prctl(struct task_struct *task, int option,
>  	}
>  
>  	return ret;
> +#else
> +
> +	unsigned long *ptr = arg2, tmp;
> +
> +	switch (option) {
> +	case ARCH_SET_FS:
> +		if (host_fs == -1)
> +			os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs);
> +		ret = 0;
> +		break;
> +	case ARCH_SET_GS:
> +		ret = 0;
> +		break;
> +	case ARCH_GET_FS:
> +	case ARCH_GET_GS:
> +		ptr = &tmp;
> +		break;
> +	}
> +
> +	ret = os_arch_prctl(0, option, ptr);
> +	if (ret)
> +		return ret;
> +
> +	switch (option) {
> +	case ARCH_SET_FS:
> +		current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
> +			(unsigned long) arg2;
> +		break;
> +	case ARCH_SET_GS:
> +		current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
> +			(unsigned long) arg2;
> +		break;
> +	case ARCH_GET_FS:
> +		ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
> +		break;
> +	case ARCH_GET_GS:
> +		ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
> +		break;
> +	}
> +
> +	return ret;
> +#endif
>  }
>  
>  SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)




More information about the linux-um mailing list