[openwrt/openwrt] bcm4908: optimize Ethernet driver by using build_skb()

LEDE Commits lede-commits at lists.infradead.org
Thu Nov 3 01:38:29 PDT 2022


rmilecki pushed a commit to openwrt/openwrt.git, branch openwrt-21.02:
https://git.openwrt.org/da4e3881eacc66c2432f115d318cc436acead13c

commit da4e3881eacc66c2432f115d318cc436acead13c
Author: Rafał Miłecki <rafal at milecki.pl>
AuthorDate: Thu Oct 27 18:57:39 2022 +0200

    bcm4908: optimize Ethernet driver by using build_skb()
    
    This should slightly improve performance thanks to better cache usage.
    
    Signed-off-by: Rafał Miłecki <rafal at milecki.pl>
    (cherry picked from commit 6a02205a4d94a7b6a888ec55d1aecd60ebb20d77)
---
 ...1-net-broadcom-bcm4908_enet-use-build_skb.patch | 152 +++++++++++++++++++++
 1 file changed, 152 insertions(+)

diff --git a/target/linux/bcm4908/patches-5.4/079-v6.2-0001-net-broadcom-bcm4908_enet-use-build_skb.patch b/target/linux/bcm4908/patches-5.4/079-v6.2-0001-net-broadcom-bcm4908_enet-use-build_skb.patch
new file mode 100644
index 0000000000..1a3dc62d44
--- /dev/null
+++ b/target/linux/bcm4908/patches-5.4/079-v6.2-0001-net-broadcom-bcm4908_enet-use-build_skb.patch
@@ -0,0 +1,152 @@
+From 3a1cc23a75abcd9cea585eb84846507363d58397 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal at milecki.pl>
+Date: Tue, 25 Oct 2022 15:22:45 +0200
+Subject: [PATCH] net: broadcom: bcm4908_enet: use build_skb()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+RX code can be more efficient with build_skb(). Allocating the actual
+SKB around the eth packet buffer - right before passing it up - results
+in better cache usage.
+
+Without RPS (echo 0 > rps_cpus) BCM4908 NAT masq performance "jumps"
+between two speeds: ~900 Mbps and 940 Mbps (it's a 4-CPU SoC). This
+change bumps the lower speed from 905 Mb/s to 918 Mb/s (tested using
+single-stream iperf 2.0.5 traffic).
+
+There are more optimizations to consider. An obvious one to try is GRO,
+however as BCM4908 doesn't do hw csum it may actually lower performance
+in some cases. Some early testing:
+
+┌─────────────────────────────────┬─────────────────────┬────────────────────┐
+│                                 │ netif_receive_skb() │ napi_gro_receive() │
+├─────────────────────────────────┼─────────────────────┼────────────────────┤
+│ netdev_alloc_skb()              │            905 Mb/s │           892 Mb/s │
+│ napi_alloc_frag() + build_skb() │            918 Mb/s │           917 Mb/s │
+└─────────────────────────────────┴─────────────────────┴────────────────────┘
+
+Other ideas:
+1. napi_build_skb()
+2. skb_copy_from_linear_data() for small packets
+
+Those need proper testing first though. That can be done later.
+
+Signed-off-by: Rafał Miłecki <rafal at milecki.pl>
+Link: https://lore.kernel.org/r/20221025132245.22871-1-zajec5@gmail.com
+Signed-off-by: Paolo Abeni <pabeni at redhat.com>
+---
+ drivers/net/ethernet/broadcom/bcm4908_enet.c | 53 +++++++++++++-------
+ 1 file changed, 36 insertions(+), 17 deletions(-)
+
+--- a/drivers/net/ethernet/broadcom/bcm4908_enet.c
++++ b/drivers/net/ethernet/broadcom/bcm4908_enet.c
+@@ -36,13 +36,24 @@
+ #define ENET_MAX_ETH_OVERHEAD			(ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
+ 						 ETH_FCS_LEN + 4) /* 32 */
+ 
++#define ENET_RX_SKB_BUF_SIZE			(NET_SKB_PAD + NET_IP_ALIGN + \
++						 ETH_HLEN + BRCM_MAX_TAG_LEN + VLAN_HLEN + \
++						 ENET_MTU_MAX + ETH_FCS_LEN + 4)
++#define ENET_RX_SKB_BUF_ALLOC_SIZE		(SKB_DATA_ALIGN(ENET_RX_SKB_BUF_SIZE) + \
++						 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
++#define ENET_RX_BUF_DMA_OFFSET			(NET_SKB_PAD + NET_IP_ALIGN)
++#define ENET_RX_BUF_DMA_SIZE			(ENET_RX_SKB_BUF_SIZE - ENET_RX_BUF_DMA_OFFSET)
++
+ struct bcm4908_enet_dma_ring_bd {
+ 	__le32 ctl;
+ 	__le32 addr;
+ } __packed;
+ 
+ struct bcm4908_enet_dma_ring_slot {
+-	struct sk_buff *skb;
++	union {
++		void *buf;			/* RX */
++		struct sk_buff *skb;		/* TX */
++	};
+ 	unsigned int len;
+ 	dma_addr_t dma_addr;
+ };
+@@ -259,22 +270,21 @@ static int bcm4908_enet_dma_alloc_rx_buf
+ 	u32 tmp;
+ 	int err;
+ 
+-	slot->len = ENET_MTU_MAX + ENET_MAX_ETH_OVERHEAD;
+-
+-	slot->skb = netdev_alloc_skb(enet->netdev, slot->len);
+-	if (!slot->skb)
++	slot->buf = napi_alloc_frag(ENET_RX_SKB_BUF_ALLOC_SIZE);
++	if (!slot->buf)
+ 		return -ENOMEM;
+ 
+-	slot->dma_addr = dma_map_single(dev, slot->skb->data, slot->len, DMA_FROM_DEVICE);
++	slot->dma_addr = dma_map_single(dev, slot->buf + ENET_RX_BUF_DMA_OFFSET,
++					ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
+ 	err = dma_mapping_error(dev, slot->dma_addr);
+ 	if (err) {
+ 		dev_err(dev, "Failed to map DMA buffer: %d\n", err);
+-		kfree_skb(slot->skb);
+-		slot->skb = NULL;
++		skb_free_frag(slot->buf);
++		slot->buf = NULL;
+ 		return err;
+ 	}
+ 
+-	tmp = slot->len << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
++	tmp = ENET_RX_BUF_DMA_SIZE << DMA_CTL_LEN_DESC_BUFLENGTH_SHIFT;
+ 	tmp |= DMA_CTL_STATUS_OWN;
+ 	if (idx == enet->rx_ring.length - 1)
+ 		tmp |= DMA_CTL_STATUS_WRAP;
+@@ -314,11 +324,11 @@ static void bcm4908_enet_dma_uninit(stru
+ 
+ 	for (i = rx_ring->length - 1; i >= 0; i--) {
+ 		slot = &rx_ring->slots[i];
+-		if (!slot->skb)
++		if (!slot->buf)
+ 			continue;
+ 		dma_unmap_single(dev, slot->dma_addr, slot->len, DMA_FROM_DEVICE);
+-		kfree_skb(slot->skb);
+-		slot->skb = NULL;
++		skb_free_frag(slot->buf);
++		slot->buf = NULL;
+ 	}
+ }
+ 
+@@ -576,6 +586,7 @@ static int bcm4908_enet_poll_rx(struct n
+ 	while (handled < weight) {
+ 		struct bcm4908_enet_dma_ring_bd *buf_desc;
+ 		struct bcm4908_enet_dma_ring_slot slot;
++		struct sk_buff *skb;
+ 		u32 ctl;
+ 		int len;
+ 		int err;
+@@ -599,16 +610,24 @@ static int bcm4908_enet_poll_rx(struct n
+ 
+ 		if (len < ETH_ZLEN ||
+ 		    (ctl & (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) != (DMA_CTL_STATUS_SOP | DMA_CTL_STATUS_EOP)) {
+-			kfree_skb(slot.skb);
++			skb_free_frag(slot.buf);
+ 			enet->netdev->stats.rx_dropped++;
+ 			break;
+ 		}
+ 
+-		dma_unmap_single(dev, slot.dma_addr, slot.len, DMA_FROM_DEVICE);
++		dma_unmap_single(dev, slot.dma_addr, ENET_RX_BUF_DMA_SIZE, DMA_FROM_DEVICE);
++
++		skb = build_skb(slot.buf, ENET_RX_SKB_BUF_ALLOC_SIZE);
++		if (unlikely(!skb)) {
++			skb_free_frag(slot.buf);
++			enet->netdev->stats.rx_dropped++;
++			break;
++		}
++		skb_reserve(skb, ENET_RX_BUF_DMA_OFFSET);
++		skb_put(skb, len - ETH_FCS_LEN);
++		skb->protocol = eth_type_trans(skb, enet->netdev);
+ 
+-		skb_put(slot.skb, len - ETH_FCS_LEN);
+-		slot.skb->protocol = eth_type_trans(slot.skb, enet->netdev);
+-		netif_receive_skb(slot.skb);
++		netif_receive_skb(skb);
+ 
+ 		enet->netdev->stats.rx_packets++;
+ 		enet->netdev->stats.rx_bytes += len;



