[PATCH net-next v13 3/3] net: hisilicon: new hip04 ethernet driver

Ding Tianhong dingtianhong at huawei.com
Thu Jan 15 02:29:55 PST 2015


On 2015/1/15 0:34, Eric Dumazet wrote:
> On Wed, 2015-01-14 at 14:34 +0800, Ding Tianhong wrote:
>> Add support for the Hisilicon hip04 ethernet driver, including the 100M / 1000M controller.
>> The controller has no tx done interrupt, so transmitted buffers are reclaimed in the poll function.
>>
>> v13: Fix the alignment of function parameters and the checkpatch warnings.
>>
>> v12: According to Alex's suggestion, modify the changelog and add MODULE_DEVICE_TABLE
>>      for hip04 ethernet.
>>
>> v11: Add ethtool support for getting and setting tx coalesce parameters. xmit_more
>>      is not supported by this patch, but I think it could work for hip04;
>>      I will support it later, after some tests for better performance.
>>
>>      Here are some performance test results from ping and iperf (with tx_coalesce_frames/usecs added);
>>      it looks like both throughput and latency are better with tx_coalesce_frames/usecs.
>>
>>      - Before:
>>      $ ping 192.168.1.1 ...
>>      === 192.168.1.1 ping statistics ===
>>      24 packets transmitted, 24 received, 0% packet loss, time 22999ms
>>      rtt min/avg/max/mdev = 0.180/0.202/0.403/0.043 ms
>>
>>      $ iperf -c 192.168.1.1 ...
>>      [ ID] Interval       Transfer     Bandwidth
>>      [  3]  0.0- 1.0 sec   115 MBytes   945 Mbits/sec
>>
>>      - After:
>>      $ ping 192.168.1.1 ...
>>      === 192.168.1.1 ping statistics ===
>>      24 packets transmitted, 24 received, 0% packet loss, time 22999ms
>>      rtt min/avg/max/mdev = 0.178/0.190/0.380/0.041 ms
>>
>>      $ iperf -c 192.168.1.1 ...
>>      [ ID] Interval       Transfer     Bandwidth
>>      [  3]  0.0- 1.0 sec   115 MBytes   965 Mbits/sec
>>
>> v10: According to David Miller's and Arnd Bergmann's suggestions, make some modifications
>>      to the v9 version
>>      - drop the workqueue
>>      - batch cleanup based on tx_coalesce_frames/usecs for better throughput
>>      - use a reasonable default tx timeout (200us, could be shortened
>>        based on measurements) with a range timer
>>      - fix napi poll function return value
>>      - use a lockless queue for cleanup
>>
>> Signed-off-by: Zhangfei Gao <zhangfei.gao at linaro.org>
>> Signed-off-by: Arnd Bergmann <arnd at arndb.de>
>> Signed-off-by: Ding Tianhong <dingtianhong at huawei.com>
>> ---
>>  drivers/net/ethernet/hisilicon/Makefile    |   2 +-
>>  drivers/net/ethernet/hisilicon/hip04_eth.c | 969 +++++++++++++++++++++++++++++
>>  2 files changed, 970 insertions(+), 1 deletion(-)
>>  create mode 100644 drivers/net/ethernet/hisilicon/hip04_eth.c
>>
>> diff --git a/drivers/net/ethernet/hisilicon/Makefile b/drivers/net/ethernet/hisilicon/Makefile
>> index 40115a7..6c14540 100644
>> --- a/drivers/net/ethernet/hisilicon/Makefile
>> +++ b/drivers/net/ethernet/hisilicon/Makefile
>> @@ -3,4 +3,4 @@
>>  #
>>  
>>  obj-$(CONFIG_HIX5HD2_GMAC) += hix5hd2_gmac.o
>> -obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o
>> +obj-$(CONFIG_HIP04_ETH) += hip04_mdio.o hip04_eth.o
>> diff --git a/drivers/net/ethernet/hisilicon/hip04_eth.c b/drivers/net/ethernet/hisilicon/hip04_eth.c
>> new file mode 100644
>> index 0000000..525214e
>> --- /dev/null
>> +++ b/drivers/net/ethernet/hisilicon/hip04_eth.c
>> @@ -0,0 +1,969 @@
>> +
>> +/* Copyright (c) 2014 Linaro Ltd.
>> + * Copyright (c) 2014 Hisilicon Limited.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License as published by
>> + * the Free Software Foundation; either version 2 of the License, or
>> + * (at your option) any later version.
>> + */
>> +
>> +#include <linux/module.h>
>> +#include <linux/etherdevice.h>
>> +#include <linux/platform_device.h>
>> +#include <linux/interrupt.h>
>> +#include <linux/ktime.h>
>> +#include <linux/of_address.h>
>> +#include <linux/phy.h>
>> +#include <linux/of_mdio.h>
>> +#include <linux/of_net.h>
>> +#include <linux/mfd/syscon.h>
>> +#include <linux/regmap.h>
>> +
>> +#define PPE_CFG_RX_ADDR			0x100
>> +#define PPE_CFG_POOL_GRP		0x300
>> +#define PPE_CFG_RX_BUF_SIZE		0x400
>> +#define PPE_CFG_RX_FIFO_SIZE		0x500
>> +#define PPE_CURR_BUF_CNT		0xa200
>> +
>> +#define GE_DUPLEX_TYPE			0x08
>> +#define GE_MAX_FRM_SIZE_REG		0x3c
>> +#define GE_PORT_MODE			0x40
>> +#define GE_PORT_EN			0x44
>> +#define GE_SHORT_RUNTS_THR_REG		0x50
>> +#define GE_TX_LOCAL_PAGE_REG		0x5c
>> +#define GE_TRANSMIT_CONTROL_REG		0x60
>> +#define GE_CF_CRC_STRIP_REG		0x1b0
>> +#define GE_MODE_CHANGE_REG		0x1b4
>> +#define GE_RECV_CONTROL_REG		0x1e0
>> +#define GE_STATION_MAC_ADDRESS		0x210
>> +#define PPE_CFG_CPU_ADD_ADDR		0x580
>> +#define PPE_CFG_MAX_FRAME_LEN_REG	0x408
>> +#define PPE_CFG_BUS_CTRL_REG		0x424
>> +#define PPE_CFG_RX_CTRL_REG		0x428
>> +#define PPE_CFG_RX_PKT_MODE_REG		0x438
>> +#define PPE_CFG_QOS_VMID_GEN		0x500
>> +#define PPE_CFG_RX_PKT_INT		0x538
>> +#define PPE_INTEN			0x600
>> +#define PPE_INTSTS			0x608
>> +#define PPE_RINT			0x604
>> +#define PPE_CFG_STS_MODE		0x700
>> +#define PPE_HIS_RX_PKT_CNT		0x804
>> +
>> +/* REG_INTERRUPT */
>> +#define RCV_INT				BIT(10)
>> +#define RCV_NOBUF			BIT(8)
>> +#define RCV_DROP			BIT(7)
>> +#define TX_DROP				BIT(6)
>> +#define DEF_INT_ERR			(RCV_NOBUF | RCV_DROP | TX_DROP)
>> +#define DEF_INT_MASK			(RCV_INT | DEF_INT_ERR)
>> +
>> +/* TX descriptor config */
>> +#define TX_FREE_MEM			BIT(0)
>> +#define TX_READ_ALLOC_L3		BIT(1)
>> +#define TX_FINISH_CACHE_INV		BIT(2)
>> +#define TX_CLEAR_WB			BIT(4)
>> +#define TX_L3_CHECKSUM			BIT(5)
>> +#define TX_LOOP_BACK			BIT(11)
>> +
>> +/* RX error */
>> +#define RX_PKT_DROP			BIT(0)
>> +#define RX_L2_ERR			BIT(1)
>> +#define RX_PKT_ERR			(RX_PKT_DROP | RX_L2_ERR)
>> +
>> +#define SGMII_SPEED_1000		0x08
>> +#define SGMII_SPEED_100			0x07
>> +#define SGMII_SPEED_10			0x06
>> +#define MII_SPEED_100			0x01
>> +#define MII_SPEED_10			0x00
>> +
>> +#define GE_DUPLEX_FULL			BIT(0)
>> +#define GE_DUPLEX_HALF			0x00
>> +#define GE_MODE_CHANGE_EN		BIT(0)
>> +
>> +#define GE_TX_AUTO_NEG			BIT(5)
>> +#define GE_TX_ADD_CRC			BIT(6)
>> +#define GE_TX_SHORT_PAD_THROUGH		BIT(7)
>> +
>> +#define GE_RX_STRIP_CRC			BIT(0)
>> +#define GE_RX_STRIP_PAD			BIT(3)
>> +#define GE_RX_PAD_EN			BIT(4)
>> +
>> +#define GE_AUTO_NEG_CTL			BIT(0)
>> +
>> +#define GE_RX_INT_THRESHOLD		BIT(6)
>> +#define GE_RX_TIMEOUT			0x04
>> +
>> +#define GE_RX_PORT_EN			BIT(1)
>> +#define GE_TX_PORT_EN			BIT(2)
>> +
>> +#define PPE_CFG_STS_RX_PKT_CNT_RC	BIT(12)
>> +
>> +#define PPE_CFG_RX_PKT_ALIGN		BIT(18)
>> +#define PPE_CFG_QOS_VMID_MODE		BIT(14)
>> +#define PPE_CFG_QOS_VMID_GRP_SHIFT	8
>> +
>> +#define PPE_CFG_RX_FIFO_FSFU		BIT(11)
>> +#define PPE_CFG_RX_DEPTH_SHIFT		16
>> +#define PPE_CFG_RX_START_SHIFT		0
>> +#define PPE_CFG_RX_CTRL_ALIGN_SHIFT	11
>> +
>> +#define PPE_CFG_BUS_LOCAL_REL		BIT(14)
>> +#define PPE_CFG_BUS_BIG_ENDIEN		BIT(0)
>> +
>> +#define RX_DESC_NUM			128
>> +#define TX_DESC_NUM			256
>> +#define TX_NEXT(N)			(((N) + 1) & (TX_DESC_NUM-1))
>> +#define RX_NEXT(N)			(((N) + 1) & (RX_DESC_NUM-1))
>> +
>> +#define GMAC_PPE_RX_PKT_MAX_LEN		379
>> +#define GMAC_MAX_PKT_LEN		1516
>> +#define GMAC_MIN_PKT_LEN		31
>> +#define RX_BUF_SIZE			1600
>> +#define RESET_TIMEOUT			1000
>> +#define TX_TIMEOUT			(6 * HZ)
>> +
>> +#define DRV_NAME			"hip04-ether"
>> +#define DRV_VERSION			"v1.0"
>> +
>> +#define HIP04_MAX_TX_COALESCE_USECS	200
>> +#define HIP04_MIN_TX_COALESCE_USECS	100
>> +#define HIP04_MAX_TX_COALESCE_FRAMES	200
>> +#define HIP04_MIN_TX_COALESCE_FRAMES	100
>> +
>> +struct tx_desc {
>> +	u32 send_addr;
> 
> 	__be32 send_addr; ?
> 
>> +	u32 send_size;
> 
> 	__be32
> 
>> +	u32 next_addr;
> 	__be32
> 
>> +	u32 cfg;
> 	__be32
> 
>> +	u32 wb_addr;
> 	__be32 wb_addr ?
> 
>> +} __aligned(64);
>> +
>> +struct rx_desc {
>> +	u16 reserved_16;
>> +	u16 pkt_len;
>> +	u32 reserve1[3];
>> +	u32 pkt_err;
>> +	u32 reserve2[4];
>> +};
>> +
>> +struct hip04_priv {
>> +	void __iomem *base;
>> +	int phy_mode;
>> +	int chan;
>> +	unsigned int port;
>> +	unsigned int speed;
>> +	unsigned int duplex;
>> +	unsigned int reg_inten;
>> +
>> +	struct napi_struct napi;
>> +	struct net_device *ndev;
>> +
>> +	struct tx_desc *tx_desc;
>> +	dma_addr_t tx_desc_dma;
>> +	struct sk_buff *tx_skb[TX_DESC_NUM];
>> +	dma_addr_t tx_phys[TX_DESC_NUM];
> 
> This is not an efficient way to store skb/phys, as for each skb, info
> will be stored in 2 separate cache lines.
> 
> It would be better to use a 
> 
> struct hip04_tx_desc {
>    struct sk_buff   *skb;
>    dma_addr_t       phys;
> } 
> 
>> +	unsigned int tx_head;
>> +
>> +	int tx_coalesce_frames;
>> +	int tx_coalesce_usecs;
>> +	struct hrtimer tx_coalesce_timer;
>> +
>> +	unsigned char *rx_buf[RX_DESC_NUM];
>> +	dma_addr_t rx_phys[RX_DESC_NUM];
> 
> Same thing here : Use a struct to get better data locality.
> 
>> +	unsigned int rx_head;
>> +	unsigned int rx_buf_size;
>> +
>> +	struct device_node *phy_node;
>> +	struct phy_device *phy;
>> +	struct regmap *map;
>> +	struct work_struct tx_timeout_task;
>> +
>> +	/* written only by tx cleanup */
>> +	unsigned int tx_tail ____cacheline_aligned_in_smp;
>> +};
>> +
>> +static inline unsigned int tx_count(unsigned int head, unsigned int tail)
>> +{
>> +	return (head - tail) % (TX_DESC_NUM - 1);
>> +}
>> +
>> +static void hip04_config_port(struct net_device *ndev, u32 speed, u32 duplex)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +	u32 val;
>> +
>> +	priv->speed = speed;
>> +	priv->duplex = duplex;
>> +
>> +	switch (priv->phy_mode) {
>> +	case PHY_INTERFACE_MODE_SGMII:
>> +		if (speed == SPEED_1000)
>> +			val = SGMII_SPEED_1000;
>> +		else if (speed == SPEED_100)
>> +			val = SGMII_SPEED_100;
>> +		else
>> +			val = SGMII_SPEED_10;
>> +		break;
>> +	case PHY_INTERFACE_MODE_MII:
>> +		if (speed == SPEED_100)
>> +			val = MII_SPEED_100;
>> +		else
>> +			val = MII_SPEED_10;
>> +		break;
>> +	default:
>> +		netdev_warn(ndev, "not supported mode\n");
>> +		val = MII_SPEED_10;
>> +		break;
>> +	}
>> +	writel_relaxed(val, priv->base + GE_PORT_MODE);
>> +
>> +	val = duplex ? GE_DUPLEX_FULL : GE_DUPLEX_HALF;
>> +	writel_relaxed(val, priv->base + GE_DUPLEX_TYPE);
>> +
>> +	val = GE_MODE_CHANGE_EN;
>> +	writel_relaxed(val, priv->base + GE_MODE_CHANGE_REG);
>> +}
>> +
>> +static void hip04_reset_ppe(struct hip04_priv *priv)
>> +{
>> +	u32 val, tmp, timeout = 0;
>> +
>> +	do {
>> +		regmap_read(priv->map, priv->port * 4 + PPE_CURR_BUF_CNT, &val);
>> +		regmap_read(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, &tmp);
>> +		if (timeout++ > RESET_TIMEOUT)
>> +			break;
>> +	} while (val & 0xfff);
>> +}
>> +
>> +static void hip04_config_fifo(struct hip04_priv *priv)
>> +{
>> +	u32 val;
>> +
>> +	val = readl_relaxed(priv->base + PPE_CFG_STS_MODE);
>> +	val |= PPE_CFG_STS_RX_PKT_CNT_RC;
>> +	writel_relaxed(val, priv->base + PPE_CFG_STS_MODE);
>> +
>> +	val = BIT(priv->port);
>> +	regmap_write(priv->map, priv->port * 4 + PPE_CFG_POOL_GRP, val);
>> +
>> +	val = priv->port << PPE_CFG_QOS_VMID_GRP_SHIFT;
>> +	val |= PPE_CFG_QOS_VMID_MODE;
>> +	writel_relaxed(val, priv->base + PPE_CFG_QOS_VMID_GEN);
>> +
>> +	val = RX_BUF_SIZE;
>> +	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_BUF_SIZE, val);
>> +
>> +	val = RX_DESC_NUM << PPE_CFG_RX_DEPTH_SHIFT;
>> +	val |= PPE_CFG_RX_FIFO_FSFU;
>> +	val |= priv->chan << PPE_CFG_RX_START_SHIFT;
>> +	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_FIFO_SIZE, val);
>> +
>> +	val = NET_IP_ALIGN << PPE_CFG_RX_CTRL_ALIGN_SHIFT;
>> +	writel_relaxed(val, priv->base + PPE_CFG_RX_CTRL_REG);
>> +
>> +	val = PPE_CFG_RX_PKT_ALIGN;
>> +	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_MODE_REG);
>> +
>> +	val = PPE_CFG_BUS_LOCAL_REL | PPE_CFG_BUS_BIG_ENDIEN;
>> +	writel_relaxed(val, priv->base + PPE_CFG_BUS_CTRL_REG);
>> +
>> +	val = GMAC_PPE_RX_PKT_MAX_LEN;
>> +	writel_relaxed(val, priv->base + PPE_CFG_MAX_FRAME_LEN_REG);
>> +
>> +	val = GMAC_MAX_PKT_LEN;
>> +	writel_relaxed(val, priv->base + GE_MAX_FRM_SIZE_REG);
>> +
>> +	val = GMAC_MIN_PKT_LEN;
>> +	writel_relaxed(val, priv->base + GE_SHORT_RUNTS_THR_REG);
>> +
>> +	val = readl_relaxed(priv->base + GE_TRANSMIT_CONTROL_REG);
>> +	val |= GE_TX_AUTO_NEG | GE_TX_ADD_CRC | GE_TX_SHORT_PAD_THROUGH;
>> +	writel_relaxed(val, priv->base + GE_TRANSMIT_CONTROL_REG);
>> +
>> +	val = GE_RX_STRIP_CRC;
>> +	writel_relaxed(val, priv->base + GE_CF_CRC_STRIP_REG);
>> +
>> +	val = readl_relaxed(priv->base + GE_RECV_CONTROL_REG);
>> +	val |= GE_RX_STRIP_PAD | GE_RX_PAD_EN;
>> +	writel_relaxed(val, priv->base + GE_RECV_CONTROL_REG);
>> +
>> +	val = GE_AUTO_NEG_CTL;
>> +	writel_relaxed(val, priv->base + GE_TX_LOCAL_PAGE_REG);
>> +}
>> +
>> +static void hip04_mac_enable(struct net_device *ndev)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +	u32 val;
>> +
>> +	/* enable tx & rx */
>> +	val = readl_relaxed(priv->base + GE_PORT_EN);
>> +	val |= GE_RX_PORT_EN | GE_TX_PORT_EN;
>> +	writel_relaxed(val, priv->base + GE_PORT_EN);
>> +
>> +	/* clear rx int */
>> +	val = RCV_INT;
>> +	writel_relaxed(val, priv->base + PPE_RINT);
>> +
>> +	/* config recv int */
>> +	val = GE_RX_INT_THRESHOLD | GE_RX_TIMEOUT;
>> +	writel_relaxed(val, priv->base + PPE_CFG_RX_PKT_INT);
>> +
>> +	/* enable interrupt */
>> +	priv->reg_inten = DEF_INT_MASK;
>> +	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>> +}
>> +
>> +static void hip04_mac_disable(struct net_device *ndev)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +	u32 val;
>> +
>> +	/* disable int */
>> +	priv->reg_inten &= ~(DEF_INT_MASK);
>> +	writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>> +
>> +	/* disable tx & rx */
>> +	val = readl_relaxed(priv->base + GE_PORT_EN);
>> +	val &= ~(GE_RX_PORT_EN | GE_TX_PORT_EN);
>> +	writel_relaxed(val, priv->base + GE_PORT_EN);
>> +}
>> +
>> +static void hip04_set_xmit_desc(struct hip04_priv *priv, dma_addr_t phys)
>> +{
>> +	writel(phys, priv->base + PPE_CFG_CPU_ADD_ADDR);
>> +}
>> +
>> +static void hip04_set_recv_desc(struct hip04_priv *priv, dma_addr_t phys)
>> +{
>> +	regmap_write(priv->map, priv->port * 4 + PPE_CFG_RX_ADDR, phys);
>> +}
>> +
>> +static u32 hip04_recv_cnt(struct hip04_priv *priv)
>> +{
>> +	return readl(priv->base + PPE_HIS_RX_PKT_CNT);
>> +}
>> +
>> +static void hip04_update_mac_address(struct net_device *ndev)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +
>> +	writel_relaxed(((ndev->dev_addr[0] << 8) | (ndev->dev_addr[1])),
>> +		       priv->base + GE_STATION_MAC_ADDRESS);
>> +	writel_relaxed(((ndev->dev_addr[2] << 24) | (ndev->dev_addr[3] << 16) |
>> +			(ndev->dev_addr[4] << 8) | (ndev->dev_addr[5])),
>> +		       priv->base + GE_STATION_MAC_ADDRESS + 4);
>> +}
>> +
>> +static int hip04_set_mac_address(struct net_device *ndev, void *addr)
>> +{
>> +	eth_mac_addr(ndev, addr);
>> +	hip04_update_mac_address(ndev);
>> +	return 0;
>> +}
>> +
>> +static int hip04_tx_reclaim(struct net_device *ndev, bool force)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +	unsigned tx_tail = priv->tx_tail;
>> +	struct tx_desc *desc;
>> +	unsigned int bytes_compl = 0, pkts_compl = 0;
>> +	unsigned int count;
>> +
>> +	smp_rmb();
>> +	count = tx_count(ACCESS_ONCE(priv->tx_head), tx_tail);
>> +	if (count == 0)
>> +		goto out;
>> +
>> +	while (count) {
>> +		desc = &priv->tx_desc[tx_tail];
>> +		if (desc->send_addr != 0) {
>> +			if (force)
>> +				desc->send_addr = 0;
>> +			else
>> +				break;
>> +		}
>> +
>> +		if (priv->tx_phys[tx_tail]) {
>> +			dma_unmap_single(&ndev->dev, priv->tx_phys[tx_tail],
>> +					 priv->tx_skb[tx_tail]->len,
>> +					 DMA_TO_DEVICE);
>> +			priv->tx_phys[tx_tail] = 0;
>> +		}
>> +		pkts_compl++;
>> +		bytes_compl += priv->tx_skb[tx_tail]->len;
>> +		dev_kfree_skb(priv->tx_skb[tx_tail]);
>> +		priv->tx_skb[tx_tail] = NULL;
>> +		tx_tail = TX_NEXT(tx_tail);
>> +		count--;
>> +	}
>> +
>> +	priv->tx_tail = tx_tail;
>> +	smp_wmb(); /* Ensure tx_tail visible to xmit */
>> +
>> +out:
>> +	if (pkts_compl || bytes_compl)
> 
> Testing bytes_compl should be enough : There is no way pkts_compl could
> be 0 if bytes_compl is not 0.
> 
>> +		netdev_completed_queue(ndev, pkts_compl, bytes_compl);
>> +
>> +	if (unlikely(netif_queue_stopped(ndev)) && (count < (TX_DESC_NUM - 1)))
>> +		netif_wake_queue(ndev);
>> +
>> +	return count;
>> +}
>> +
>> +static int hip04_mac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
>> +{
>> +	struct hip04_priv *priv = netdev_priv(ndev);
>> +	struct net_device_stats *stats = &ndev->stats;
>> +	unsigned int tx_head = priv->tx_head, count;
>> +	struct tx_desc *desc = &priv->tx_desc[tx_head];
>> +	dma_addr_t phys;
>> +
>> +	smp_rmb();
>> +	count = tx_count(tx_head, ACCESS_ONCE(priv->tx_tail));
>> +	if (count == (TX_DESC_NUM - 1)) {
>> +		netif_stop_queue(ndev);
>> +		return NETDEV_TX_BUSY;
>> +	}
>> +
>> +	phys = dma_map_single(&ndev->dev, skb->data, skb->len, DMA_TO_DEVICE);
>> +	if (dma_mapping_error(&ndev->dev, phys)) {
>> +		dev_kfree_skb(skb);
>> +		return NETDEV_TX_OK;
>> +	}
>> +
>> +	priv->tx_skb[tx_head] = skb;
>> +	priv->tx_phys[tx_head] = phys;
>> +	desc->send_addr = cpu_to_be32(phys);
>> +	desc->send_size = cpu_to_be32(skb->len);
>> +	desc->cfg = cpu_to_be32(TX_CLEAR_WB | TX_FINISH_CACHE_INV);
>> +	phys = priv->tx_desc_dma + tx_head * sizeof(struct tx_desc);
>> +	desc->wb_addr = cpu_to_be32(phys);
>> +	skb_tx_timestamp(skb);
>> +
>> +	hip04_set_xmit_desc(priv, phys);
>> +	priv->tx_head = TX_NEXT(tx_head);
>> +	count++;
> 
> Starting from this point, skb might already have been freed by TX
> completion.
> 
> It's racy to access skb->len
> 
>> +	netdev_sent_queue(ndev, skb->len);
>> +
>> +	stats->tx_bytes += skb->len;
>> +	stats->tx_packets++;
>> +
>> +	/* Ensure tx_head update visible to tx reclaim */
>> +	smp_wmb();
>> +
>> +	/* queue is getting full, better start cleaning up now */
>> +	if (count >= priv->tx_coalesce_frames) {
>> +		if (napi_schedule_prep(&priv->napi)) {
>> +			/* disable rx interrupt and timer */
>> +			priv->reg_inten &= ~(RCV_INT);
>> +			writel_relaxed(DEF_INT_MASK & ~RCV_INT,
>> +				       priv->base + PPE_INTEN);
>> +			hrtimer_cancel(&priv->tx_coalesce_timer);
>> +			__napi_schedule(&priv->napi);
>> +		}
>> +	} else if (!hrtimer_is_queued(&priv->tx_coalesce_timer)) {
>> +		/* cleanup not pending yet, start a new timer */
>> +		hrtimer_start_expires(&priv->tx_coalesce_timer,
>> +				      HRTIMER_MODE_REL);
>> +	}
>> +
>> +	return NETDEV_TX_OK;
>> +}
>> +
>> +static int hip04_rx_poll(struct napi_struct *napi, int budget)
>> +{
>> +	struct hip04_priv *priv = container_of(napi, struct hip04_priv, napi);
>> +	struct net_device *ndev = priv->ndev;
>> +	struct net_device_stats *stats = &ndev->stats;
>> +	unsigned int cnt = hip04_recv_cnt(priv);
>> +	struct rx_desc *desc;
>> +	struct sk_buff *skb;
>> +	unsigned char *buf;
>> +	bool last = false;
>> +	dma_addr_t phys;
>> +	int rx = 0;
>> +	int tx_remaining;
>> +	u16 len;
>> +	u32 err;
>> +
>> +	while (cnt && !last) {
>> +		buf = priv->rx_buf[priv->rx_head];
>> +		skb = build_skb(buf, priv->rx_buf_size);
>> +		if (unlikely(!skb))
>> +			net_dbg_ratelimited("build_skb failed\n");
> 
> Well, if skb is NULL, you're crashing later...
> You really have to address a memory allocation error much better than
> that !
> 
>> +
>> +		dma_unmap_single(&ndev->dev, priv->rx_phys[priv->rx_head],
>> +				 RX_BUF_SIZE, DMA_FROM_DEVICE);
>> +		priv->rx_phys[priv->rx_head] = 0;
>> +
>> +		desc = (struct rx_desc *)skb->data;
>> +		len = be16_to_cpu(desc->pkt_len);
>> +		err = be32_to_cpu(desc->pkt_err);
>> +
>> +		if (0 == len) {
>> +			dev_kfree_skb_any(skb);
>> +			last = true;
>> +		} else if ((err & RX_PKT_ERR) || (len >= GMAC_MAX_PKT_LEN)) {
>> +			dev_kfree_skb_any(skb);
>> +			stats->rx_dropped++;
>> +			stats->rx_errors++;
>> +		} else {
>> +			skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
>> +			skb_put(skb, len);
>> +			skb->protocol = eth_type_trans(skb, ndev);
>> +			napi_gro_receive(&priv->napi, skb);
>> +			stats->rx_packets++;
>> +			stats->rx_bytes += len;
>> +			rx++;
>> +		}
>> +
>> +		buf = netdev_alloc_frag(priv->rx_buf_size);
>> +		if (!buf)
>> +			goto done;
> 
> Same problem here : In case of memory allocation error, your driver is
> totally screwed.
> 
>> +		phys = dma_map_single(&ndev->dev, buf,
>> +				      RX_BUF_SIZE, DMA_FROM_DEVICE);
>> +		if (dma_mapping_error(&ndev->dev, phys))
>> +			goto done;
> 
> Same problem here : You really have to recover properly.
> 
>> +		priv->rx_buf[priv->rx_head] = buf;
>> +		priv->rx_phys[priv->rx_head] = phys;
>> +		hip04_set_recv_desc(priv, phys);
>> +
>> +		priv->rx_head = RX_NEXT(priv->rx_head);
>> +		if (rx >= budget)
>> +			goto done;
>> +
>> +		if (--cnt == 0)
>> +			cnt = hip04_recv_cnt(priv);
>> +	}
>> +
>> +	if (!(priv->reg_inten & RCV_INT)) {
>> +		/* enable rx interrupt */
>> +		priv->reg_inten |= RCV_INT;
>> +		writel_relaxed(priv->reg_inten, priv->base + PPE_INTEN);
>> +	}
>> +	napi_complete(napi);
>> +done:
>> +	/* clean up tx descriptors and start a new timer if necessary */
>> +	tx_remaining = hip04_tx_reclaim(ndev, false);
>> +	if (rx < budget && tx_remaining)
>> +		hrtimer_start_expires(&priv->tx_coalesce_timer, HRTIMER_MODE_REL);
>> +
>> +	return rx;
>> +}
>> +

Yes, thanks, I will fix them later.

Ding

> 
> 
> .
> 





More information about the linux-arm-kernel mailing list