[openwrt/openwrt] kernel: add support for threaded network backlog processing

LEDE Commits lede-commits at lists.infradead.org
Fri Mar 24 10:24:48 PDT 2023


nbd pushed a commit to openwrt/openwrt.git, branch master:
https://git.openwrt.org/4ce0405e1880fa4a0dd565e0369ad2472f4b6848

commit 4ce0405e1880fa4a0dd565e0369ad2472f4b6848
Author: Felix Fietkau <nbd at nbd.name>
AuthorDate: Sun Feb 19 15:01:43 2023 +0100

    kernel: add support for threaded network backlog processing
    
    This can improve load balancing by pushing backlog (and RPS) processing
    to separate threads, allowing the scheduler to distribute the load.
    It can be enabled with: echo 1 > /proc/sys/net/core/backlog_threaded
    
    Signed-off-by: Felix Fietkau <nbd at nbd.name>
---
 .../hack-5.15/721-net-add-packet-mangeling.patch   |  10 +-
 ...d-optional-threading-for-backlog-processi.patch | 224 +++++++++++++++++++++
 2 files changed, 229 insertions(+), 5 deletions(-)
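Once the sysctl is enabled, each CPU's backlog NAPI instance gets its own
kthread (named "napi/backlog-<cpu>" by backlog_set_threaded() below) that the
scheduler is free to migrate across cores. Here is a minimal userspace sketch
for toggling and verifying the knob; only the sysctl path comes from the
patch, while the program itself and its write_sysctl() helper are purely
illustrative, and it must run as root:

/*
 * Illustrative sketch only -- not part of the commit. Enables threaded
 * backlog processing through the sysctl added by this patch and reads
 * the value back to confirm the write took effect.
 */
#include <stdio.h>
#include <stdlib.h>

#define BACKLOG_SYSCTL "/proc/sys/net/core/backlog_threaded"

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);	/* 0 on success */
}

int main(void)
{
	char buf[16] = { 0 };
	FILE *f;

	if (write_sysctl(BACKLOG_SYSCTL, "1")) {
		perror("write " BACKLOG_SYSCTL);
		return EXIT_FAILURE;
	}

	f = fopen(BACKLOG_SYSCTL, "r");
	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("read " BACKLOG_SYSCTL);
		return EXIT_FAILURE;
	}
	fclose(f);

	printf("backlog_threaded = %s", buf);
	return EXIT_SUCCESS;
}

The same effect can be had from the shell with the echo command quoted in
the commit message above.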

diff --git a/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch b/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
index 9dc86303a7..a1d621a7a9 100644
--- a/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
+++ b/target/linux/generic/hack-5.15/721-net-add-packet-mangeling.patch
@@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd at nbd.name>
 
 --- a/include/linux/netdevice.h
 +++ b/include/linux/netdevice.h
-@@ -1676,6 +1676,10 @@ enum netdev_priv_flags {
+@@ -1677,6 +1677,10 @@ enum netdev_priv_flags {
  	IFF_TX_SKB_NO_LINEAR		= BIT_ULL(31),
  };
  
@@ -30,7 +30,7 @@ Signed-off-by: Felix Fietkau <nbd at nbd.name>
  #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
  #define IFF_EBRIDGE			IFF_EBRIDGE
  #define IFF_BONDING			IFF_BONDING
-@@ -1708,6 +1712,7 @@ enum netdev_priv_flags {
+@@ -1709,6 +1713,7 @@ enum netdev_priv_flags {
  #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
  #define IFF_LIVE_RENAME_OK		IFF_LIVE_RENAME_OK
  #define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR
@@ -38,7 +38,7 @@ Signed-off-by: Felix Fietkau <nbd at nbd.name>
  
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
-@@ -2009,6 +2014,7 @@ struct net_device {
+@@ -2010,6 +2015,7 @@ struct net_device {
  	/* Read-mostly cache-line for fast-path access */
  	unsigned int		flags;
  	unsigned int		priv_flags;
@@ -46,7 +46,7 @@ Signed-off-by: Felix Fietkau <nbd at nbd.name>
  	const struct net_device_ops *netdev_ops;
  	int			ifindex;
  	unsigned short		gflags;
-@@ -2069,6 +2075,11 @@ struct net_device {
+@@ -2070,6 +2076,11 @@ struct net_device {
  	const struct tlsdev_ops *tlsdev_ops;
  #endif
  
@@ -58,7 +58,7 @@ Signed-off-by: Felix Fietkau <nbd at nbd.name>
  	const struct header_ops *header_ops;
  
  	unsigned char		operstate;
-@@ -2143,6 +2154,10 @@ struct net_device {
+@@ -2144,6 +2155,10 @@ struct net_device {
  	struct mctp_dev __rcu	*mctp_ptr;
  #endif
  
diff --git a/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch b/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch
new file mode 100644
index 0000000000..463f405f3a
--- /dev/null
+++ b/target/linux/generic/pending-5.15/760-net-core-add-optional-threading-for-backlog-processi.patch
@@ -0,0 +1,224 @@
+From: Felix Fietkau <nbd at nbd.name>
+Date: Thu, 16 Feb 2023 18:39:04 +0100
+Subject: [PATCH] net/core: add optional threading for backlog processing
+
+When dealing with few flows or an imbalance in CPU utilization, static RPS
+CPU assignment can be too inflexible. Add support for enabling threaded NAPI
+for backlog processing in order to allow the scheduler to better balance
+processing. This helps spread the load more evenly across idle CPUs.
+
+Signed-off-by: Felix Fietkau <nbd at nbd.name>
+---
+
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -500,6 +500,7 @@ static inline bool napi_complete(struct
+ }
+ 
+ int dev_set_threaded(struct net_device *dev, bool threaded);
++int backlog_set_threaded(bool threaded);
+ 
+ /**
+  *	napi_disable - prevent NAPI from scheduling
+@@ -3363,6 +3364,7 @@ struct softnet_data {
+ 	unsigned int		processed;
+ 	unsigned int		time_squeeze;
+ 	unsigned int		received_rps;
++	unsigned int		process_queue_empty;
+ #ifdef CONFIG_RPS
+ 	struct softnet_data	*rps_ipi_list;
+ #endif
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -4574,7 +4574,7 @@ static int rps_ipi_queued(struct softnet
+ #ifdef CONFIG_RPS
+ 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+ 
+-	if (sd != mysd) {
++	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
+ 		sd->rps_ipi_next = mysd->rps_ipi_list;
+ 		mysd->rps_ipi_list = sd;
+ 
+@@ -5755,6 +5755,8 @@ static DEFINE_PER_CPU(struct work_struct
+ /* Network device is going away, flush any packets still pending */
+ static void flush_backlog(struct work_struct *work)
+ {
++	unsigned int process_queue_empty;
++	bool threaded, flush_processq;
+ 	struct sk_buff *skb, *tmp;
+ 	struct softnet_data *sd;
+ 
+@@ -5770,9 +5772,18 @@ static void flush_backlog(struct work_st
+ 			input_queue_head_incr(sd);
+ 		}
+ 	}
++
++	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
++	flush_processq = threaded &&
++			 !skb_queue_empty_lockless(&sd->process_queue);
++	if (flush_processq)
++		process_queue_empty = sd->process_queue_empty;
+ 	rps_unlock(sd);
+ 	local_irq_enable();
+ 
++	if (threaded)
++		goto out;
++
+ 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
+ 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ 			__skb_unlink(skb, &sd->process_queue);
+@@ -5780,7 +5791,18 @@ static void flush_backlog(struct work_st
+ 			input_queue_head_incr(sd);
+ 		}
+ 	}
++
++out:
+ 	local_bh_enable();
++
++	while (flush_processq) {
++		msleep(1);
++		local_irq_disable();
++		rps_lock(sd);
++		flush_processq = process_queue_empty == sd->process_queue_empty;
++		rps_unlock(sd);
++		local_irq_enable();
++	}
+ }
+ 
+ static bool flush_required(int cpu)
+@@ -6463,6 +6485,7 @@ static int process_backlog(struct napi_s
+ 
+ 		local_irq_disable();
+ 		rps_lock(sd);
++		sd->process_queue_empty++;
+ 		if (skb_queue_empty(&sd->input_pkt_queue)) {
+ 			/*
+ 			 * Inline a custom version of __napi_complete().
+@@ -6472,7 +6495,8 @@ static int process_backlog(struct napi_s
+ 			 * We can use a plain write instead of clear_bit(),
+ 			 * and we dont need an smp_mb() memory barrier.
+ 			 */
+-			napi->state = 0;
++			napi->state &= ~(NAPIF_STATE_SCHED |
++					 NAPIF_STATE_SCHED_THREADED);
+ 			again = false;
+ 		} else {
+ 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
+@@ -6889,6 +6913,57 @@ int dev_set_threaded(struct net_device *
+ }
+ EXPORT_SYMBOL(dev_set_threaded);
+ 
++int backlog_set_threaded(bool threaded)
++{
++	static bool backlog_threaded;
++	int err = 0;
++	int i;
++
++	if (backlog_threaded == threaded)
++		return 0;
++
++	for_each_possible_cpu(i) {
++		struct softnet_data *sd = &per_cpu(softnet_data, i);
++		struct napi_struct *n = &sd->backlog;
++
++		if (n->thread)
++			continue;
++		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
++		if (IS_ERR(n->thread)) {
++			err = PTR_ERR(n->thread);
++			pr_err("kthread_run failed with err %d\n", err);
++			n->thread = NULL;
++			threaded = false;
++			break;
++		}
++
++	}
++
++	backlog_threaded = threaded;
++
++	/* Make sure kthread is created before THREADED bit
++	 * is set.
++	 */
++	smp_mb__before_atomic();
++
++	for_each_possible_cpu(i) {
++		struct softnet_data *sd = &per_cpu(softnet_data, i);
++		struct napi_struct *n = &sd->backlog;
++		unsigned long flags;
++
++		local_irq_save(flags);
++		rps_lock(sd);
++		if (threaded)
++			n->state |= NAPIF_STATE_THREADED;
++		else
++			n->state &= ~NAPIF_STATE_THREADED;
++		rps_unlock(sd);
++		local_irq_restore(flags);
++	}
++
++	return err;
++}
++
+ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ 		    int (*poll)(struct napi_struct *, int), int weight)
+ {
+@@ -11367,6 +11442,9 @@ static int dev_cpu_dead(unsigned int old
+ 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
+ 	local_irq_enable();
+ 
++	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
++		return 0;
++
+ #ifdef CONFIG_RPS
+ 	remsd = oldsd->rps_ipi_list;
+ 	oldsd->rps_ipi_list = NULL;
+--- a/net/core/sysctl_net_core.c
++++ b/net/core/sysctl_net_core.c
+@@ -28,6 +28,7 @@ static int int_3600 = 3600;
+ static int min_sndbuf = SOCK_MIN_SNDBUF;
+ static int min_rcvbuf = SOCK_MIN_RCVBUF;
+ static int max_skb_frags = MAX_SKB_FRAGS;
++static int backlog_threaded;
+ static long long_one __maybe_unused = 1;
+ static long long_max __maybe_unused = LONG_MAX;
+ 
+@@ -114,6 +115,23 @@ static int rps_sock_flow_sysctl(struct c
+ }
+ #endif /* CONFIG_RPS */
+ 
++static int backlog_threaded_sysctl(struct ctl_table *table, int write,
++			       void *buffer, size_t *lenp, loff_t *ppos)
++{
++	static DEFINE_MUTEX(backlog_threaded_mutex);
++	int ret;
++
++	mutex_lock(&backlog_threaded_mutex);
++
++	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
++	if (write && !ret)
++		ret = backlog_set_threaded(backlog_threaded);
++
++	mutex_unlock(&backlog_threaded_mutex);
++
++	return ret;
++}
++
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ static DEFINE_MUTEX(flow_limit_update_mutex);
+ 
+@@ -470,6 +488,15 @@ static struct ctl_table net_core_table[]
+ 		.proc_handler	= rps_sock_flow_sysctl
+ 	},
+ #endif
++	{
++		.procname	= "backlog_threaded",
++		.data		= &backlog_threaded,
++		.maxlen		= sizeof(unsigned int),
++		.mode		= 0644,
++		.proc_handler	= backlog_threaded_sysctl,
++		.extra1		= SYSCTL_ZERO,
++		.extra2		= SYSCTL_ONE
++	},
+ #ifdef CONFIG_NET_FLOW_LIMIT
+ 	{
+ 		.procname	= "flow_limit_cpu_bitmap",
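The trickiest part of the patch is flushing a threaded backlog: flush_backlog()
still drains sd->input_pkt_queue under rps_lock(), but it can no longer walk
sd->process_queue directly, because the per-CPU kthread owns it. Instead,
process_backlog() bumps sd->process_queue_empty on every pass over the queue,
and flush_backlog() snapshots that counter and msleep()s until it advances,
which proves the thread has completed at least one full drain since the
snapshot. Below is a self-contained userspace analogue of that
generation-counter handshake, illustrative only: it uses POSIX threads and a
mutex where the kernel uses rps_lock() and per-CPU data. Build with
"cc -pthread".

/*
 * Illustrative userspace analogue of the flush_backlog() handshake:
 * the consumer bumps a generation counter after each drain pass, and
 * the flusher snapshots the counter, then sleeps until it changes.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned int generation;	/* mirrors sd->process_queue_empty */

static void *consumer(void *arg)
{
	(void)arg;
	for (;;) {
		usleep(1000);		/* stand-in for draining the queue */
		pthread_mutex_lock(&lock);
		generation++;		/* one more full pass completed */
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

static void wait_for_drain(void)
{
	unsigned int snap;

	pthread_mutex_lock(&lock);
	snap = generation;		/* snapshot, as flush_backlog() does */
	pthread_mutex_unlock(&lock);

	for (;;) {
		unsigned int cur;

		usleep(1000);		/* msleep(1) in the patch */
		pthread_mutex_lock(&lock);
		cur = generation;
		pthread_mutex_unlock(&lock);
		if (cur != snap)	/* consumer ran a full pass since snap */
			break;
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, consumer, NULL);
	wait_for_drain();
	puts("queue observed drained at least once; safe to proceed");
	return 0;
}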