Overhead of arm64 LSE per-CPU atomics?

Paul E. McKenney paulmck at kernel.org
Thu Oct 30 15:37:00 PDT 2025


Hello!

To make event tracing safe for PREEMPT_RT kernels, I have been creating
optimized variants of SRCU readers that use per-CPU atomics.  This works
quite well, but on ARM Neoverse V2, I am seeing about 100ns for a
srcu_read_lock()/srcu_read_unlock() pair, or about 50ns for a single
per-CPU atomic operation.  This contrasts with a handful of nanoseconds
on x86 and similar on ARM for an atomic_set(&foo, atomic_read(&foo) + 1).
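
To make the comparison concrete, the two fast-path idioms look roughly
like this (sketch only, using a made-up per-CPU counter rather than the
actual SRCU counters, and assuming migration is otherwise prevented):

	static DEFINE_PER_CPU(atomic_long_t, my_ctr);	/* made-up counter */

	/* ~50ns on Neoverse V2: a real atomic RMW (LSE LDADD). */
	atomic_long_inc(this_cpu_ptr(&my_ctr));

	/* Handful of ns: plain load/add/store, no NMI-safe atomicity. */
	atomic_long_set(this_cpu_ptr(&my_ctr),
			atomic_long_read(this_cpu_ptr(&my_ctr)) + 1);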

In theory, I can mess with SRCU's counter protocol, but I figured I
should check first.

The patch below shows the flavor of the change, but for the simpler
uretprobe case where NMI safety is not required, permitting me to
simply disable interrupts across the non-atomic increment operations.
The overhead of interrupt disabling is not wonderful, but ~13ns is way
better than ~100ns any day of the week.

I have to do something like this for internal use here because we have
real hardware that acts this way.  If I don't hear otherwise, I will
also push it to mainline.  So if there is a more dainty approach, this
would be a most excellent time to let me know about it.  ;-)

							Thanx, Paul

------------------------------------------------------------------------

commit 1eee41590d30805ec4f5b4e96c615603b0d058d9
Author: Paul E. McKenney <paulmck at kernel.org>
Date:   Thu Oct 30 09:25:09 2025 -0700

    refscale: Add SRCU-fast-updown readers
    
    This commit adds refscale readers based on srcu_read_lock_fast_updown()
    and srcu_read_unlock_fast_updown() ("refscale.scale_type=srcu-fast-updown").
    On my x86 laptop, these are about 2.2ns per pair.
    
    Signed-off-by: Paul E. McKenney <paulmck at kernel.org>

diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 7429ec9f0092..07a313782dfd 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -186,6 +186,7 @@ static const struct ref_scale_ops rcu_ops = {
 // Definitions for SRCU ref scale testing.
 DEFINE_STATIC_SRCU(srcu_refctl_scale);
 DEFINE_STATIC_SRCU_FAST(srcu_fast_refctl_scale);
+DEFINE_STATIC_SRCU_FAST_UPDOWN(srcu_fast_updown_refctl_scale);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
 
 static void srcu_ref_scale_read_section(const int nloops)
@@ -254,6 +255,42 @@ static const struct ref_scale_ops srcu_fast_ops = {
 	.name		= "srcu-fast"
 };
 
+static bool srcu_fast_updown_sync_scale_init(void)
+{
+	srcu_ctlp = &srcu_fast_updown_refctl_scale;
+	return true;
+}
+
+static void srcu_fast_updown_ref_scale_read_section(const int nloops)
+{
+	int i;
+	struct srcu_ctr __percpu *scp;
+
+	for (i = nloops; i >= 0; i--) {
+		scp = srcu_read_lock_fast_updown(srcu_ctlp);
+		srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+	}
+}
+
+static void srcu_fast_updown_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+	struct srcu_ctr __percpu *scp;
+
+	for (i = nloops; i >= 0; i--) {
+		scp = srcu_read_lock_fast_updown(srcu_ctlp);
+		un_delay(udl, ndl);
+		srcu_read_unlock_fast_updown(srcu_ctlp, scp);
+	}
+}
+
+static const struct ref_scale_ops srcu_fast_updown_ops = {
+	.init		= srcu_fast_updown_sync_scale_init,
+	.readsection	= srcu_fast_updown_ref_scale_read_section,
+	.delaysection	= srcu_fast_updown_ref_scale_delay_section,
+	.name		= "srcu-fast-updown"
+};
+
 #ifdef CONFIG_TASKS_RCU
 
 // Definitions for RCU Tasks ref scale testing: Empty read markers.
@@ -1479,7 +1516,8 @@ ref_scale_init(void)
 	long i;
 	int firsterr = 0;
 	static const struct ref_scale_ops *scale_ops[] = {
-		&rcu_ops, &srcu_ops, &srcu_fast_ops, RCU_TRACE_OPS RCU_TASKS_OPS
+		&rcu_ops, &srcu_ops, &srcu_fast_ops, &srcu_fast_updown_ops,
+		RCU_TRACE_OPS RCU_TASKS_OPS
 		&refcnt_ops, &percpuinc_ops, &incpercpu_ops, &incpercpupreempt_ops,
 		&incpercpubh_ops, &incpercpuirqsave_ops,
 		&rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,


