[PATCH v6 net-next 09/10] net: add a hardware buffer management helper API
Jesper Dangaard Brouer
brouer at redhat.com
Mon Mar 14 03:33:18 PDT 2016
I've not fully understood the hardware support part, but I do think
this generalization is very interesting work, and I would like to
cooperate, if my use-case can fit into it. My use-case is in the
extreme 100Gbit/s area.
There is some potential for performance improvements if the API is
designed from the start to distinguish between being called from
NAPI context (BH-disabled) and outside NAPI context.
See: netdev_alloc_frag() vs napi_alloc_frag().
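E.g. something like this (just a sketch of what I have in mind; the
"in_napi" flag and the helper name are my invention, not part of this
patch):

/* Let the caller tell the API which context it runs in.  When
 * called from the driver's NAPI poll routine (BH disabled), the
 * cheaper napi_alloc_frag() can be used.
 */
static void *hwbm_frag_alloc(struct hwbm_pool *bm_pool, bool in_napi)
{
	if (in_napi)
		return napi_alloc_frag(bm_pool->frag_size);

	return netdev_alloc_frag(bm_pool->frag_size);
}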
Nitpicks inlined below...
On Mon, 14 Mar 2016 09:39:04 +0100
Gregory CLEMENT <gregory.clement at free-electrons.com> wrote:
> This basic implementation allows sharing code between drivers using
> hardware buffer management. As the code is hardware agnostic, there
> are few helpers; most of the optimization brought by an HW BM has to
> be done at the driver level.
>
> Tested-by: Sebastian Careba <nitroshift at yahoo.com>
> Signed-off-by: Gregory CLEMENT <gregory.clement at free-electrons.com>
> ---
> include/net/hwbm.h | 28 ++++++++++++++++++
> net/Kconfig | 3 ++
> net/core/Makefile | 1 +
> net/core/hwbm.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 119 insertions(+)
> create mode 100644 include/net/hwbm.h
> create mode 100644 net/core/hwbm.c
>
> diff --git a/include/net/hwbm.h b/include/net/hwbm.h
> new file mode 100644
> index 000000000000..47d08662501b
> --- /dev/null
> +++ b/include/net/hwbm.h
> @@ -0,0 +1,28 @@
> +#ifndef _HWBM_H
> +#define _HWBM_H
> +
> +struct hwbm_pool {
> + /* Capacity of the pool */
> + int size;
> + /* Size of the buffers managed */
> + int frag_size;
> + /* Number of buffers currently used by this pool */
> + int buf_num;
> + /* constructor called during alocation */
alocation -> allocation
> + int (*construct)(struct hwbm_pool *bm_pool, void *buf);
> + /* protect acces to the buffer counter*/
acces -> access
Space after "counter"
> + spinlock_t lock;
> + /* private data */
> + void *priv;
> +};
> +#ifdef CONFIG_HWBM
> +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf);
> +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp);
> +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp);
> +#else
> +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf) {}
> +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp) { return 0; }
> +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
> +{ return 0; }
> +#endif /* CONFIG_HWBM */
> +#endif /* _HWBM_H */
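For my own understanding, here is how I imagine a driver would use
this API (just my sketch based on the struct above; everything named
"my_*" is hypothetical, and note the driver seems responsible for
calling spin_lock_init() on the pool lock):

#include <net/hwbm.h>

/* Hypothetical per-buffer constructor: driver-specific work, e.g.
 * handing the buffer's DMA address to the HW buffer manager.
 */
static int my_bm_construct(struct hwbm_pool *bm_pool, void *buf)
{
	/* ... program 'buf' into the HW pool via bm_pool->priv ... */
	return 0;
}

static int my_bm_pool_init(struct hwbm_pool *pool, void *priv)
{
	spin_lock_init(&pool->lock);
	pool->size      = 1024;		/* pool capacity */
	pool->frag_size = PAGE_SIZE;	/* size of each buffer */
	pool->construct = my_bm_construct;
	pool->priv      = priv;

	/* returns the number of buffers actually added */
	return hwbm_pool_add(pool, pool->size, GFP_KERNEL);
}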
> diff --git a/net/Kconfig b/net/Kconfig
> index 10640d5f8bee..e13449870d06 100644
> --- a/net/Kconfig
> +++ b/net/Kconfig
> @@ -253,6 +253,9 @@ config XPS
> depends on SMP
> default y
>
> +config HWBM
> + bool
> +
> config SOCK_CGROUP_DATA
> bool
> default n
> diff --git a/net/core/Makefile b/net/core/Makefile
> index 014422e2561f..d6508c2ddca5 100644
> --- a/net/core/Makefile
> +++ b/net/core/Makefile
> @@ -25,4 +25,5 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
> obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
> obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
> obj-$(CONFIG_DST_CACHE) += dst_cache.o
> +obj-$(CONFIG_HWBM) += hwbm.o
> obj-$(CONFIG_NET_DEVLINK) += devlink.o
> diff --git a/net/core/hwbm.c b/net/core/hwbm.c
> new file mode 100644
> index 000000000000..941c28486896
> --- /dev/null
> +++ b/net/core/hwbm.c
> @@ -0,0 +1,87 @@
> +/* Support for hardware buffer manager.
> + *
> + * Copyright (C) 2016 Marvell
> + *
> + * Gregory CLEMENT <gregory.clement at free-electrons.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + */
> +#include <linux/kernel.h>
> +#include <linux/printk.h>
> +#include <linux/skbuff.h>
> +#include <net/hwbm.h>
> +
> +void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
> +{
> + if (likely(bm_pool->frag_size <= PAGE_SIZE))
> + skb_free_frag(buf);
> + else
> + kfree(buf);
> +}
> +EXPORT_SYMBOL_GPL(hwbm_buf_free);
> +
> +/* Refill processing for HW buffer management */
> +int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
> +{
> + int frag_size = bm_pool->frag_size;
> + void *buf;
> +
> + if (likely(frag_size <= PAGE_SIZE))
> + buf = netdev_alloc_frag(frag_size);
If we know the NAPI context, there is a performance potential in
using napi_alloc_frag() instead of netdev_alloc_frag() (as mentioned
above).
> + else
> + buf = kmalloc(frag_size, gfp);
> +
> + if (!buf)
> + return -ENOMEM;
> +
> + if (bm_pool->construct)
> + if (bm_pool->construct(bm_pool, buf)) {
> + hwbm_buf_free(bm_pool, buf);
> + return -ENOMEM;
> + }
Why don't we refill more objects, and do so with a bulk of memory
objects?  The "refill" name led me to believe that the function
might refill several objects...
Maybe that is the role of hwbm_pool_add()?
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(hwbm_pool_refill);
> +
> +int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
> +{
> + int err, i;
> + unsigned long flags;
> +
> + spin_lock_irqsave(&bm_pool->lock, flags);
This might be needed, and you do take the lock once for several
objects. But the save/restore variant is the most expensive lock
variant we have (at least based on my measurements[1] on Intel).
[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_sample.c
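If all the call paths are process context or BH/NAPI context (never
hard-irq), something like this should be sufficient and cheaper (my
assumption about the call contexts, I've not verified it against the
driver side):

	spin_lock_bh(&bm_pool->lock);
	/* ... refill loop ... */
	spin_unlock_bh(&bm_pool->lock);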
> + if (bm_pool->buf_num == bm_pool->size) {
> + pr_warn("pool already filled\n");
> + return bm_pool->buf_num;
> + }
> +
> + if (buf_num + bm_pool->buf_num > bm_pool->size) {
> + pr_warn("cannot allocate %d buffers for pool\n",
> + buf_num);
> + return 0;
> + }
> +
> + if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
> + pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
> + buf_num, bm_pool->buf_num);
> + return 0;
> + }
> +
> + for (i = 0; i < buf_num; i++) {
> + err = hwbm_pool_refill(bm_pool, gfp);
I'm thinking: why not use some bulk allocation API here... (see the
sketch at the end of this mail).
> + if (err < 0)
> + break;
> + }
> +
> + /* Update BM driver with number of buffers added to pool */
> + bm_pool->buf_num += i;
> +
> + pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
> + spin_unlock_irqrestore(&bm_pool->lock, flags);
> +
> + return i;
> +}
> +EXPORT_SYMBOL_GPL(hwbm_pool_add);
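To follow up on my bulk allocation comment above: something along
these lines is what I have in mind, here only for the kmalloc()
fallback path (frag_size > PAGE_SIZE). The "cache" member and
HWBM_BULK are my invention; the pool would need to own a kmem_cache
sized to frag_size, which this patch does not have:

#include <linux/slab.h>

#define HWBM_BULK 16	/* arbitrary batch size, my invention */

static int hwbm_pool_refill_bulk(struct hwbm_pool *bm_pool, gfp_t gfp)
{
	void *bufs[HWBM_BULK];
	int n, i;

	/* Bulk alloc amortizes the allocator overhead over several
	 * objects; returns 0 on failure.
	 */
	n = kmem_cache_alloc_bulk(bm_pool->cache, gfp, HWBM_BULK, bufs);
	if (!n)
		return -ENOMEM;

	for (i = 0; i < n; i++) {
		if (bm_pool->construct &&
		    bm_pool->construct(bm_pool, bufs[i])) {
			/* constructor failed: free the remaining objects */
			kmem_cache_free_bulk(bm_pool->cache, n - i, &bufs[i]);
			return i ? i : -ENOMEM;
		}
	}

	return n;	/* number of buffers refilled */
}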
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
Author of http://www.iptv-analyzer.org
LinkedIn: http://www.linkedin.com/in/brouer