[RFC PATCH v3 0/4] Simplify kernel-mode NEON
Dave Martin
Dave.Martin at arm.com
Thu May 25 11:24:57 PDT 2017
This series aims to simplify kernel-mode NEON.
Changes since RFC v2:
* Re-enable softirqs between kernel_neon_begin() and kernel_mode_end()
(This was the original intention, but I was overtaken by irrational
paranoia when drafting the previous version of the patch.)
* softirq disable/enable is removed from fpsimd_thread_switch() (which
runs with hardirqs disabled in any case, thus local_bh_enable() is
treated as an error by the core code). The comment block is updated
to clarify this situation.
This also means that no cost is added to the context switch critical
path after all.
* Move the this_cpu_write(fpsimd_last_state, NULL) back outside the
conditional in kernel_neon_begin() (i.e., where it was originally
before this series). RFC v2 accidentally moved this invalidation
inside the conditional, which leads to state corruption if switching
back to a user task previously running on the same CPU, after
kernel_mode_neon() is used by a kernel thread.
* Minor formatting and spurious #include tidyups.
Testing:
* For now, I've hacked linux/kernel/time/timer.c:run_timer_softirq() to
call kernel_mode_neon_begin() and erase the FPSIMD registers, and
jacked CONFIG_HZ to 1000. I also added a test syscall sys_blatt_neon
that userspace can hammer to trigger the nested kernel_neon_begin()
scenario.
See [2].
This is not hugely realistic, but was sufficient to diagnose the
corruption problem described above, when running on a model.
Original blurb:
The main motivation for these changes is that supporting kernel-mode
NEON alongside SVE is tricky with the current framework: the current
partial save/restore mechanisms would need additional porting to
save/restore the extended SVE vector bits, and this renders the cost
saving of partial save/restore rather doubtful -- even if not all vector
registers are saved, the save/restore cost will still grow with
increasing vector length. We could get the mechanics of this to work in
principle, but it doesn't feel like the right thing to do.
If we withdraw kernel-mode NEON support for hardirq context, accept some
extra softirq latency and disable nesting, then we can simplify the code
by always saving the task context directly into task_struct and
deferring all restore to ret_to_user. Previous discussions with Ard
Biesheuvel suggest that this is a feasible approach and reasonably
aligned with other architectures.
The main impact on client code is that callers must check that
kernel-mode NEON is usable in the current context and fall back to a
non-NEON when necessary. Ard has already done some work on this. [1]
The interesting changes are all in patch 2: the first patch just adds a
header inclusion guard that I noted was missing.
[1] git://git.kernel.org/pub/scm/linux/kernel/git/ardb/linux.git kernel-mode-neon
Dave Martin (4):
arm64: neon: Add missing header guard in <asm/neon.h>
arm64: fpsimd: Consistently use __this_cpu_ ops where appropriate
arm64: neon: Remove support for nested or hardirq kernel-mode NEON
arm64: neon: Add backwards compatibility kernel_neon_begin_partial()
arch/arm64/include/asm/fpsimd.h | 14 ----
arch/arm64/include/asm/fpsimdmacros.h | 56 ----------------
arch/arm64/include/asm/neon.h | 12 +++-
arch/arm64/include/asm/simd.h | 56 ++++++++++++++++
arch/arm64/kernel/entry-fpsimd.S | 24 -------
arch/arm64/kernel/fpsimd.c | 119 +++++++++++++++++++++++-----------
6 files changed, 146 insertions(+), 135 deletions(-)
create mode 100644 arch/arm64/include/asm/simd.h
--
[2] Test hacks, not for merging
>From 584f1187269dcee04cb43fc6cb443d8e56454e2b Mon Sep 17 00:00:00 2001
From: Dave Martin <Dave.Martin at arm.com>
Date: Thu, 25 May 2017 19:08:42 +0100
Subject: [PATCH] test hacks
---
arch/arm64/kernel/fpsimd.c | 63 +++++++++++++++++++++++++++++++++-
arch/arm64/kernel/sys.c | 58 +++++++++++++++++++++++++++++++
blatt-neon.c | 31 +++++++++++++++++
include/uapi/asm-generic/unistd.h | 5 ++-
kernel/time/timer.c | 72 +++++++++++++++++++++++++++++++++++++++
5 files changed, 227 insertions(+), 2 deletions(-)
create mode 100644 blatt-neon.c
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 14329d6..6156e56 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -22,6 +22,7 @@
#include <linux/cpu_pm.h>
#include <linux/kernel.h>
#include <linux/init.h>
+#include <linux/irqflags.h>
#include <linux/percpu.h>
#include <linux/sched/signal.h>
#include <linux/signal.h>
@@ -37,6 +38,8 @@
#define FPEXC_IXF (1 << 4)
#define FPEXC_IDF (1 << 7)
+static void print_suspicious_vregs(char const *str, struct fpsimd_state const *st);
+
/*
* In order to reduce the number of times the FPSIMD state is needlessly saved
* and restored, we need to keep track of two things:
@@ -138,13 +141,18 @@ void fpsimd_thread_switch(struct task_struct *next)
{
if (!system_supports_fpsimd())
return;
+
+ BUG_ON(!irqs_disabled());
+
/*
* Save the current FPSIMD state to memory, but only if whatever is in
* the registers is in fact the most recent userland FPSIMD state of
* 'current'.
*/
- if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE))
+ if (current->mm && !test_thread_flag(TIF_FOREIGN_FPSTATE)) {
fpsimd_save_state(¤t->thread.fpsimd_state);
+ print_suspicious_vregs("sched-out", ¤t->thread.fpsimd_state);
+ }
if (next->mm) {
/*
@@ -197,6 +205,55 @@ void fpsimd_preserve_current_state(void)
local_bh_enable();
}
+static bool suspicious_vreg(u8 const *reg)
+{
+ u8 b = *reg;
+ unsigned int i;
+
+ if (b < 0xc2 || b > 0xc7)
+ return false;
+
+ for (i = 1; i < 16; ++i)
+ if (b != reg[i])
+ return false;
+
+ return true;
+}
+
+static int print_vreg_if_suspicious(u8 const *reg, unsigned int num)
+{
+ char s[16 * 2 + 1], *p = s;
+ int n;
+ unsigned int i;
+
+ if (!suspicious_vreg(reg))
+ return 0;
+
+ for (i = 0; i < 16; ++i) {
+ n = snprintf(p, s + sizeof(s) - p, "%.2hhx", reg[i]);
+ if (n < 0)
+ break;
+
+ p += n;
+ }
+
+ *p = '\0';
+ return 1;
+}
+
+static void print_suspicious_vregs(char const *str, struct fpsimd_state const *st)
+{
+ int bad = 0;
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(st->vregs); ++i)
+ bad |= print_vreg_if_suspicious((u8 const *)&st->vregs[i], i);
+
+ if (bad)
+ pr_info("%s[%d]: %s: suspicious vreg(s) detected\n",
+ current->comm, task_pid_nr(current), str);
+}
+
/*
* Load the userland FPSIMD state of 'current' from memory, but only if the
* FPSIMD state already held in the registers is /not/ the most recent FPSIMD
@@ -215,6 +272,8 @@ void fpsimd_restore_current_state(void)
fpsimd_load_state(st);
__this_cpu_write(fpsimd_last_state, st);
st->cpu = smp_processor_id();
+
+ print_suspicious_vregs("fpsimd_restore_current_state", st);
}
local_bh_enable();
@@ -290,6 +349,8 @@ void kernel_neon_begin(void)
/* Invalidate any task state remaining in the fpsimd regs: */
__this_cpu_write(fpsimd_last_state, NULL);
+ print_suspicious_vregs("kernel_neon_begin", ¤t->thread.fpsimd_state);
+
preempt_disable();
local_bh_enable();
diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c
index 26fe8ea..ab73b51 100644
--- a/arch/arm64/kernel/sys.c
+++ b/arch/arm64/kernel/sys.c
@@ -26,6 +26,8 @@
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <asm/cpufeature.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags,
@@ -45,6 +47,62 @@ SYSCALL_DEFINE1(arm64_personality, unsigned int, personality)
return sys_personality(personality);
}
+SYSCALL_DEFINE0(blatt_neon)
+{
+ int ret = 0;
+ unsigned int i;
+
+ BUG_ON(!may_use_simd());
+
+ kernel_neon_begin();
+
+ for (i = 0; i < 1000; ++i) {
+ asm volatile (
+ "movi v0.16b, #0\n\t"
+ "movi v1.16b, #0\n\t"
+ "movi v2.16b, #0\n\t"
+ "movi v3.16b, #0\n\t"
+ "movi v4.16b, #0\n\t"
+ "movi v5.16b, #0\n\t"
+ "movi v6.16b, #0\n\t"
+ "movi v7.16b, #0\n\t"
+ "movi v8.16b, #0\n\t"
+ "movi v9.16b, #0\n\t"
+ "movi v10.16b, #0\n\t"
+ "movi v11.16b, #0\n\t"
+ "movi v12.16b, #0\n\t"
+ "movi v13.16b, #0\n\t"
+ "movi v14.16b, #0\n\t"
+ "movi v15.16b, #0\n\t"
+ "movi v16.16b, #0\n\t"
+ "movi v17.16b, #0\n\t"
+ "movi v18.16b, #0\n\t"
+ "movi v19.16b, #0\n\t"
+ "movi v20.16b, #0\n\t"
+ "movi v21.16b, #0\n\t"
+ "movi v22.16b, #0\n\t"
+ "movi v23.16b, #0\n\t"
+ "movi v24.16b, #0\n\t"
+ "movi v25.16b, #0\n\t"
+ "movi v26.16b, #0\n\t"
+ "movi v27.16b, #0\n\t"
+ "movi v28.16b, #0\n\t"
+ "movi v29.16b, #0\n\t"
+ "movi v30.16b, #0\n\t"
+ "movi v31.16b, #0"
+ );
+
+ if (test_thread_flag(TIF_SIGPENDING)) {
+ ret = -EINTR;
+ break;
+ }
+ }
+
+ kernel_neon_end();
+
+ return ret;
+}
+
/*
* Wrappers to pass the pt_regs argument.
*/
diff --git a/blatt-neon.c b/blatt-neon.c
new file mode 100644
index 0000000..fafb144
--- /dev/null
+++ b/blatt-neon.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <asm/unistd.h>
+
+static long blatt_neon(void)
+{
+ register long ret asm("x0");
+ register unsigned int scno asm("x8") = __NR_blatt_neon;
+
+ asm volatile (
+ "svc #0"
+ : "=r" (ret)
+ : "r" (scno)
+ );
+
+ return ret;
+}
+
+int main(void)
+{
+ long ret;
+
+ while (1) {
+ ret = blatt_neon();
+ if (ret) {
+ fprintf(stderr, "%s\n", strerror(ret));
+ return EXIT_FAILURE;
+ }
+ }
+}
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 061185a..bf2d3e9 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -732,8 +732,11 @@ __SYSCALL(__NR_pkey_free, sys_pkey_free)
#define __NR_statx 291
__SYSCALL(__NR_statx, sys_statx)
+#define __NR_blatt_neon 292
+__SYSCALL(__NR_blatt_neon, sys_blatt_neon)
+
#undef __NR_syscalls
-#define __NR_syscalls 292
+#define __NR_syscalls 293
/*
* All syscalls below here should go away really,
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 152a706..dc79771 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -34,6 +34,7 @@
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
+#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
@@ -50,6 +51,8 @@
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
#include "tick-internal.h"
@@ -1604,6 +1607,21 @@ static inline void __run_timers(struct timer_base *base)
spin_unlock_irq(&base->lock);
}
+static u64 neon_busy_count, neon_idle_count;
+
+static int __init timer_softirq_debugfs_init(void)
+{
+ struct dentry *dir = debugfs_create_dir("neon-softirq", NULL);
+
+ if (!dir ||
+ !debugfs_create_u64("busy", 0444, dir, &neon_busy_count) ||
+ !debugfs_create_u64("idle", 0444, dir, &neon_idle_count))
+ WARN_ON(1);
+
+ return 0;
+}
+early_initcall(timer_softirq_debugfs_init);
+
/*
* This function runs timers and the timer-tq in bottom half context.
*/
@@ -1611,6 +1629,60 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ BUG_ON(preemptible());
+ BUG_ON(!softirq_count());
+
+ if (!may_use_simd())
+ neon_busy_count++;
+ else /* may_use_simd() */ {
+ static u8 n = 0xc2;
+
+ neon_idle_count++;
+
+ kernel_neon_begin();
+ asm volatile (
+ "dup v0.16b, %w0\n\t"
+ "dup v1.16b, %w0\n\t"
+ "dup v2.16b, %w0\n\t"
+ "dup v3.16b, %w0\n\t"
+ "dup v4.16b, %w0\n\t"
+ "dup v5.16b, %w0\n\t"
+ "dup v6.16b, %w0\n\t"
+ "dup v7.16b, %w0\n\t"
+ "dup v8.16b, %w0\n\t"
+ "dup v9.16b, %w0\n\t"
+ "dup v10.16b, %w0\n\t"
+ "dup v11.16b, %w0\n\t"
+ "dup v12.16b, %w0\n\t"
+ "dup v13.16b, %w0\n\t"
+ "dup v14.16b, %w0\n\t"
+ "dup v15.16b, %w0\n\t"
+ "dup v16.16b, %w0\n\t"
+ "dup v17.16b, %w0\n\t"
+ "dup v18.16b, %w0\n\t"
+ "dup v19.16b, %w0\n\t"
+ "dup v20.16b, %w0\n\t"
+ "dup v21.16b, %w0\n\t"
+ "dup v22.16b, %w0\n\t"
+ "dup v23.16b, %w0\n\t"
+ "dup v24.16b, %w0\n\t"
+ "dup v25.16b, %w0\n\t"
+ "dup v26.16b, %w0\n\t"
+ "dup v27.16b, %w0\n\t"
+ "dup v28.16b, %w0\n\t"
+ "dup v29.16b, %w0\n\t"
+ "dup v30.16b, %w0\n\t"
+ "dup v31.16b, %w0"
+ :: "r" (n)
+ );
+
+ ++n;
+ if (n == 0xc8)
+ n = 0xc2;
+
+ kernel_neon_end();
+ } /* may_use_simd() */
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
--
2.1.4
More information about the linux-arm-kernel
mailing list