[PATCH -next] riscv/vdso: Add support for time namespaces
tongtiangen
tongtiangen at huawei.com
Mon Sep 27 19:35:42 PDT 2021
kindly ping.
On 2021/9/1 11:20, Tong Tiangen wrote:
> Implement generic vdso time namespace support which also enables time
> namespaces for riscv. This is quite similar to what arm64 does.
>
> selftest/timens test result:
> 1..10
> ok 1 Passed for CLOCK_BOOTTIME (syscall)
> ok 2 Passed for CLOCK_BOOTTIME (vdso)
> ok 3 # SKIP CLOCK_BOOTTIME_ALARM isn't supported
> ok 4 # SKIP CLOCK_BOOTTIME_ALARM isn't supported
> ok 5 Passed for CLOCK_MONOTONIC (syscall)
> ok 6 Passed for CLOCK_MONOTONIC (vdso)
> ok 7 Passed for CLOCK_MONOTONIC_COARSE (syscall)
> ok 8 Passed for CLOCK_MONOTONIC_COARSE (vdso)
> ok 9 Passed for CLOCK_MONOTONIC_RAW (syscall)
> ok 10 Passed for CLOCK_MONOTONIC_RAW (vdso)
> # Totals: pass:8 fail:0 xfail:0 xpass:0 skip:2 error:0
>
> Signed-off-by: Tong Tiangen <tongtiangen at huawei.com>
> ---
> This patch is based on patchset:
> https://lore.kernel.org/lkml/20210901024621.2528797-1-tongtiangen@huawei.com/
>
> arch/riscv/Kconfig | 1 +
> arch/riscv/include/asm/page.h | 2 +
> arch/riscv/include/asm/vdso.h | 2 +-
> arch/riscv/include/asm/vdso/gettimeofday.h | 7 +
> arch/riscv/kernel/vdso.c | 250 ++++++++++++++++-----
> arch/riscv/kernel/vdso/vdso.lds.S | 3 +
> 6 files changed, 211 insertions(+), 54 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index aac669a6c3d8..7b0eff8a7eef 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -61,6 +61,7 @@ config RISCV
> select GENERIC_SCHED_CLOCK
> select GENERIC_SMP_IDLE_THREAD
> select GENERIC_TIME_VSYSCALL if MMU && 64BIT
> + select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO
> select HANDLE_DOMAIN_IRQ
> select HAVE_ARCH_AUDITSYSCALL
> select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 109c97e991a6..b3e5ff0125fe 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -157,6 +157,8 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x);
> #define page_to_bus(page) (page_to_phys(page))
> #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr)))
>
> +#define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x))
> +
> #ifdef CONFIG_FLATMEM
> #define pfn_valid(pfn) \
> (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr))
> diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h
> index 34210b22ba91..bee9514104f7 100644
> --- a/arch/riscv/include/asm/vdso.h
> +++ b/arch/riscv/include/asm/vdso.h
> @@ -14,7 +14,7 @@
> */
> #ifdef CONFIG_MMU
>
> -#define __VVAR_PAGES 1
> +#define __VVAR_PAGES 2
>
> #ifndef __ASSEMBLY__
>
> diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h
> index f839f16e0d2a..77d9c2f721c4 100644
> --- a/arch/riscv/include/asm/vdso/gettimeofday.h
> +++ b/arch/riscv/include/asm/vdso/gettimeofday.h
> @@ -76,6 +76,13 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
> return _vdso_data;
> }
>
> +#ifdef CONFIG_TIME_NS
> +static __always_inline
> +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd)
> +{
> + return _timens_data;
> +}
> +#endif
> #endif /* !__ASSEMBLY__ */
>
> #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
> diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c
> index b70956d80408..a9436a65161a 100644
> --- a/arch/riscv/kernel/vdso.c
> +++ b/arch/riscv/kernel/vdso.c
> @@ -13,6 +13,7 @@
> #include <linux/err.h>
> #include <asm/page.h>
> #include <asm/vdso.h>
> +#include <linux/time_namespace.h>
>
> #ifdef CONFIG_GENERIC_TIME_VSYSCALL
> #include <vdso/datapage.h>
> @@ -25,14 +26,12 @@ extern char vdso_start[], vdso_end[];
>
> enum vvar_pages {
> VVAR_DATA_PAGE_OFFSET,
> + VVAR_TIMENS_PAGE_OFFSET,
> VVAR_NR_PAGES,
> };
>
> #define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT)
>
> -static unsigned int vdso_pages __ro_after_init;
> -static struct page **vdso_pagelist __ro_after_init;
> -
> /*
> * The vDSO data page.
> */
> @@ -42,83 +41,228 @@ static union {
> } vdso_data_store __page_aligned_data;
> struct vdso_data *vdso_data = &vdso_data_store.data;
>
> -static int __init vdso_init(void)
> +struct __vdso_info {
> + const char *name;
> + const char *vdso_code_start;
> + const char *vdso_code_end;
> + unsigned long vdso_pages;
> + /* Data Mapping */
> + struct vm_special_mapping *dm;
> + /* Code Mapping */
> + struct vm_special_mapping *cm;
> +};
> +
> +static struct __vdso_info vdso_info __ro_after_init = {
> + .name = "vdso",
> + .vdso_code_start = vdso_start,
> + .vdso_code_end = vdso_end,
> +};
> +
> +static int vdso_mremap(const struct vm_special_mapping *sm,
> + struct vm_area_struct *new_vma)
> +{
> + current->mm->context.vdso = (void *)new_vma->vm_start;
> +
> + return 0;
> +}
> +
> +static int __init __vdso_init(void)
> {
> unsigned int i;
> + struct page **vdso_pagelist;
> + unsigned long pfn;
>
> - vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT;
> - vdso_pagelist =
> - kcalloc(vdso_pages + VVAR_NR_PAGES, sizeof(struct page *), GFP_KERNEL);
> - if (unlikely(vdso_pagelist == NULL)) {
> - pr_err("vdso: pagelist allocation failed\n");
> - return -ENOMEM;
> + if (memcmp(vdso_info.vdso_code_start, "\177ELF", 4)) {
> + pr_err("vDSO is not a valid ELF object!\n");
> + return -EINVAL;
> }
>
> - for (i = 0; i < vdso_pages; i++) {
> - struct page *pg;
> + vdso_info.vdso_pages = (
> + vdso_info.vdso_code_end -
> + vdso_info.vdso_code_start) >>
> + PAGE_SHIFT;
> +
> + vdso_pagelist = kcalloc(vdso_info.vdso_pages,
> + sizeof(struct page *),
> + GFP_KERNEL);
> + if (vdso_pagelist == NULL)
> + return -ENOMEM;
> +
> + /* Grab the vDSO code pages. */
> + pfn = sym_to_pfn(vdso_info.vdso_code_start);
> +
> + for (i = 0; i < vdso_info.vdso_pages; i++)
> + vdso_pagelist[i] = pfn_to_page(pfn + i);
> +
> + vdso_info.cm->pages = vdso_pagelist;
> +
> + return 0;
> +}
> +
> +#ifdef CONFIG_TIME_NS
> +struct vdso_data *arch_get_vdso_data(void *vvar_page)
> +{
> + return (struct vdso_data *)(vvar_page);
> +}
> +
> +/*
> + * The vvar mapping contains data for a specific time namespace, so when a task
> + * changes namespace we must unmap its vvar data for the old namespace.
> + * Subsequent faults will map in data for the new namespace.
> + *
> + * For more details see timens_setup_vdso_data().
> + */
> +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
> +{
> + struct mm_struct *mm = task->mm;
> + struct vm_area_struct *vma;
> +
> + mmap_read_lock(mm);
>
> - pg = virt_to_page(vdso_start + (i << PAGE_SHIFT));
> - vdso_pagelist[i] = pg;
> + for (vma = mm->mmap; vma; vma = vma->vm_next) {
> + unsigned long size = vma->vm_end - vma->vm_start;
> +
> + if (vma_is_special_mapping(vma, vdso_info.dm))
> + zap_page_range(vma, vma->vm_start, size);
> }
> - vdso_pagelist[i] = virt_to_page(vdso_data);
>
> + mmap_read_unlock(mm);
> return 0;
> }
> +
> +static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
> +{
> + if (likely(vma->vm_mm == current->mm))
> + return current->nsproxy->time_ns->vvar_page;
> +
> + /*
> + * VM_PFNMAP | VM_IO protect .fault() handler from being called
> + * through interfaces like /proc/$pid/mem or
> + * process_vm_{readv,writev}() as long as there's no .access()
> + * in special_mapping_vmops.
> + * For more details check_vma_flags() and __access_remote_vm()
> + */
> + WARN(1, "vvar_page accessed remotely");
> +
> + return NULL;
> +}
> +#else
> +static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
> +{
> + return NULL;
> +}
> +#endif
> +
> +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
> + struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> + struct page *timens_page = find_timens_vvar_page(vma);
> + unsigned long pfn;
> +
> + switch (vmf->pgoff) {
> + case VVAR_DATA_PAGE_OFFSET:
> + if (timens_page)
> + pfn = page_to_pfn(timens_page);
> + else
> + pfn = sym_to_pfn(vdso_data);
> + break;
> +#ifdef CONFIG_TIME_NS
> + case VVAR_TIMENS_PAGE_OFFSET:
> + /*
> + * If a task belongs to a time namespace then a namespace
> + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
> + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
> + * offset.
> + * See also the comment near timens_setup_vdso_data().
> + */
> + if (!timens_page)
> + return VM_FAULT_SIGBUS;
> + pfn = sym_to_pfn(vdso_data);
> + break;
> +#endif /* CONFIG_TIME_NS */
> + default:
> + return VM_FAULT_SIGBUS;
> + }
> +
> + return vmf_insert_pfn(vma, vmf->address, pfn);
> +}
> +
> +enum rv_vdso_map {
> + RV_VDSO_MAP_VVAR,
> + RV_VDSO_MAP_VDSO,
> +};
> +
> +static struct vm_special_mapping rv_vdso_maps[] __ro_after_init = {
> + [RV_VDSO_MAP_VVAR] = {
> + .name = "[vvar]",
> + .fault = vvar_fault,
> + },
> + [RV_VDSO_MAP_VDSO] = {
> + .name = "[vdso]",
> + .mremap = vdso_mremap,
> + },
> +};
> +
> +static int __init vdso_init(void)
> +{
> + vdso_info.dm = &rv_vdso_maps[RV_VDSO_MAP_VVAR];
> + vdso_info.cm = &rv_vdso_maps[RV_VDSO_MAP_VDSO];
> +
> + return __vdso_init();
> +}
> arch_initcall(vdso_init);
>
> -int arch_setup_additional_pages(struct linux_binprm *bprm,
> - int uses_interp)
> +static int __setup_additional_pages(struct mm_struct *mm,
> + struct linux_binprm *bprm,
> + int uses_interp)
> {
> - struct mm_struct *mm = current->mm;
> - unsigned long vdso_base, vdso_len;
> - int ret;
> + unsigned long vdso_base, vdso_text_len, vdso_mapping_len;
> + void *ret;
>
> BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
>
> - vdso_len = (vdso_pages + VVAR_NR_PAGES) << PAGE_SHIFT;
> + vdso_text_len = vdso_info.vdso_pages << PAGE_SHIFT;
> + /* Be sure to map the data page */
> + vdso_mapping_len = vdso_text_len + VVAR_SIZE;
>
> - if (mmap_write_lock_killable(mm))
> - return -EINTR;
> -
> - vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0);
> + vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
> if (IS_ERR_VALUE(vdso_base)) {
> - ret = vdso_base;
> - goto end;
> + ret = ERR_PTR(vdso_base);
> + goto up_fail;
> }
>
> - mm->context.vdso = NULL;
> - ret = install_special_mapping(mm, vdso_base, VVAR_SIZE,
> - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]);
> - if (unlikely(ret))
> - goto end;
> + ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE,
> + (VM_READ | VM_MAYREAD | VM_PFNMAP), vdso_info.dm);
> + if (IS_ERR(ret))
> + goto up_fail;
>
> + vdso_base += VVAR_SIZE;
> + mm->context.vdso = (void *)vdso_base;
> ret =
> - install_special_mapping(mm, vdso_base + VVAR_SIZE,
> - vdso_pages << PAGE_SHIFT,
> + _install_special_mapping(mm, vdso_base, vdso_text_len,
> (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC),
> - vdso_pagelist);
> + vdso_info.cm);
>
> - if (unlikely(ret))
> - goto end;
> + if (IS_ERR(ret))
> + goto up_fail;
>
> - /*
> - * Put vDSO base into mm struct. We need to do this before calling
> - * install_special_mapping or the perf counter mmap tracking code
> - * will fail to recognise it as a vDSO (since arch_vma_name fails).
> - */
> - mm->context.vdso = (void *)vdso_base + VVAR_SIZE;
> + return 0;
>
> -end:
> - mmap_write_unlock(mm);
> - return ret;
> +up_fail:
> + mm->context.vdso = NULL;
> + return PTR_ERR(ret);
> }
>
> -const char *arch_vma_name(struct vm_area_struct *vma)
> +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
> {
> - if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso))
> - return "[vdso]";
> - if (vma->vm_mm && (vma->vm_start ==
> - (long)vma->vm_mm->context.vdso - VVAR_SIZE))
> - return "[vdso_data]";
> - return NULL;
> + struct mm_struct *mm = current->mm;
> + int ret;
> +
> + if (mmap_write_lock_killable(mm))
> + return -EINTR;
> +
> + ret = __setup_additional_pages(mm, bprm, uses_interp);
> + mmap_write_unlock(mm);
> +
> + return ret;
> }
> diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S
> index e9111f700af0..01d94aae5bf5 100644
> --- a/arch/riscv/kernel/vdso/vdso.lds.S
> +++ b/arch/riscv/kernel/vdso/vdso.lds.S
> @@ -10,6 +10,9 @@ OUTPUT_ARCH(riscv)
> SECTIONS
> {
> PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
> +#ifdef CONFIG_TIME_NS
> + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
> +#endif
> . = SIZEOF_HEADERS;
>
> .hash : { *(.hash) } :text
>
More information about the linux-riscv
mailing list