[PATCH 2/2] nvme-pci: use dma_alloc_noncontigous if possible

Leon Romanovsky leon at kernel.org
Mon Dec 2 11:05:41 PST 2024


On Fri, Nov 01, 2024 at 05:40:05AM +0100, Christoph Hellwig wrote:
> Use dma_alloc_noncontigous to allocate a single IOVA-contigous segment
> when backed by an IOMMU.  This allow to easily use bigger segments and
> avoids running into segment limits if we can avoid it.
> 
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  drivers/nvme/host/pci.c | 58 +++++++++++++++++++++++++++++++++++++----
>  1 file changed, 53 insertions(+), 5 deletions(-)

<...>

> +static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
>  		u32 chunk_size)
>  {
>  	struct nvme_host_mem_buf_desc *descs;
> @@ -2049,9 +2086,18 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
>  	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
>  	u64 chunk_size;
>  
> +	/*
> +	 * If there is an IOMMU that can merge pages, try a virtually
> +	 * non-contiguous allocation for a single segment first.
> +	 */
> +	if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
> +		if (!nvme_alloc_host_mem_single(dev, preferred))
> +			return 0;
> +	}

We assume that the addition of the lines above is the root cause of the
following warning during boot. It fires when we try to allocate a
61 MiB chunk.

[    4.373307] ------------[ cut here ]------------
[    4.373316] WARNING: CPU: 5 PID: 11 at mm/page_alloc.c:4727 __alloc_pages_noprof+0x84c/0xd88
[    4.373332] Modules linked in: crct10dif_ce mlx5_core(+) nvme gpio_mlxbf3 nvme_core mlxfw psample i2c_mlxbf pinctrl_mlxbf3 mlxbf_gige mlxbf_tmfifo pwr_mlxbf ipv6 crc_ccitt
[    4.373353] CPU: 5 UID: 0 PID: 11 Comm: kworker/u64:0 Not tainted 6.12.0-for-upstream-bluefield-2024-11-29-01-33 #1
[    4.373357] Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.9.0.13378 Oct 30 2024
[    4.373360] Workqueue: async async_run_entry_fn
[    4.373365] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[    4.373368] pc : __alloc_pages_noprof+0x84c/0xd88
[    4.373371] lr : __dma_direct_alloc_pages.constprop.0+0x234/0x358
[    4.373377] sp : ffffffc08011b890
[    4.373378] x29: ffffffc08011b890 x28: 000000000000000e x27: 0000000003d00000
[    4.373382] x26: ffffff80803cb840 x25: ffffff808197a0c8 x24: 000000000000000e
[    4.373385] x23: 0000000000000cc1 x22: 00000000ffffffff x21: 0000000003cfffff
[    4.373388] x20: 0000000000000000 x19: ffffffffffffffff x18: 0000000000000100
[    4.373391] x17: 0030737973627573 x16: ffffffd634e9d488 x15: 0000000000003a98
[    4.373394] x14: 0000000013ffffff x13: ffffffd636c18d88 x12: 0000000000000001
[    4.373396] x11: 0000000104ab200c x10: f56b3ce21ad3b435 x9 : ffffffd634e9ecbc
[    4.373399] x8 : ffffff808647ba80 x7 : ffffffffffffffff x6 : 0000000000000cc0
[    4.373402] x5 : 0000000000000000 x4 : ffffff80809b9140 x3 : 0000000000000000
[    4.373405] x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffffd636e5d000
[    4.373408] Call trace:
[    4.373410]  __alloc_pages_noprof+0x84c/0xd88 (P)
[    4.373414]  __dma_direct_alloc_pages.constprop.0+0x234/0x358 (L)
[    4.373418]  __dma_direct_alloc_pages.constprop.0+0x234/0x358
[    4.373421]  dma_direct_alloc_pages+0x40/0x190
[    4.373424]  __dma_alloc_pages+0x40/0x80
[    4.373428]  dma_alloc_noncontiguous+0xb4/0x218
[    4.373431]  nvme_setup_host_mem+0x370/0x400 [nvme]
[    4.373442]  nvme_probe+0x688/0x7e8 [nvme]
[    4.373446]  local_pci_probe+0x48/0xb8
[    4.373451]  pci_device_probe+0x1e0/0x200
[    4.373454]  really_probe+0xc8/0x3a0
[    4.373457]  __driver_probe_device+0x84/0x170
[    4.373460]  driver_probe_device+0x44/0x120
[    4.373462]  __driver_attach_async_helper+0x58/0x100
[    4.373465]  async_run_entry_fn+0x40/0x1e8
[    4.373468]  process_one_work+0x16c/0x3e8
[    4.373472]  worker_thread+0x284/0x448
[    4.373476]  kthread+0xec/0xf8
[    4.373479]  ret_from_fork+0x10/0x20
[    4.373483] ---[ end trace 0000000000000000 ]---
[    4.378989] nvme nvme0: allocated 61 MiB host memory buffer (16 segments).
[    4.534672] nvme nvme0: 16/0/0 default/read/poll queues
[    4.537784]  nvme0n1: p1 p2

  4715 struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
  4716                                       int preferred_nid, nodemask_t *nodemask)
...
  4723         /*
  4724          * There are several places where we assume that the order value is sane
  4725          * so bail out early if the request is out of bound.
  4726          */
  4727         if (WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER, gfp))
  4728                 return NULL;
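
For reference, the trace shows dma_alloc_noncontiguous() falling through to
the dma-direct path, which tries to grab the whole buffer as one physically
contiguous allocation. A quick standalone check of why a 61 MiB request trips
the order test above; the 4 KiB page size and the default MAX_PAGE_ORDER of 10
are assumptions about this particular config, not taken from the report:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed 4 KiB pages */
	unsigned int max_page_order = 10;	/* assumed default MAX_PAGE_ORDER */
	unsigned long size = 61UL << 20;	/* 61 MiB HMB request */
	unsigned long nr_pages = (size + page_size - 1) / page_size;
	unsigned int order = 0;

	/* mimic get_order(): smallest order with (1 << order) >= nr_pages */
	while ((1UL << order) < nr_pages)
		order++;

	printf("%lu pages -> order %u, MAX_PAGE_ORDER %u: %s\n",
	       nr_pages, order, max_page_order,
	       order > max_page_order ? "WARN_ON_ONCE fires" : "fits");
	return 0;
}

With those assumptions this gives 15616 pages, i.e. order 14, well above
MAX_PAGE_ORDER.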

I see at least two possible solutions: add __GFP_NOWARN in
nvme_alloc_host_mem_single() (a rough sketch of that option follows the diff
below), or apply the following patch:

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4c644bb7f069..baed4059d8a5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2172,7 +2172,8 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,

 static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
 {
-       u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
+       u64 max_chunk = PAGE_SIZE * MAX_ORDER_NR_PAGES;
+       u64 min_chunk = min_t(u64, preferred, max_chunk);
        u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
        u64 chunk_size;

@@ -2180,7 +2181,7 @@ static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
         * If there is an IOMMU that can merge pages, try a virtually
         * non-contiguous allocation for a single segment first.
         */
-       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev))) {
+       if (!(PAGE_SIZE & dma_get_merge_boundary(dev->dev)) && preferred < max_chunk) {
                if (!nvme_alloc_host_mem_single(dev, preferred))
                        return 0;
        }
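
For completeness, the first option would look roughly like the sketch below.
nvme_alloc_host_mem_single() from the patch is not quoted above, so the body
here (including the dev->hmb_sgt field name) is only a guess at its shape; the
point is merely passing __GFP_NOWARN to dma_alloc_noncontiguous():

/*
 * Hypothetical sketch only: assumes nvme_alloc_host_mem_single() gets its
 * buffer from dma_alloc_noncontiguous() roughly like this.
 */
static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
{
	dev->hmb_sgt = dma_alloc_noncontiguous(dev->dev, size,
			DMA_BIDIRECTIONAL, GFP_KERNEL | __GFP_NOWARN, 0);
	if (!dev->hmb_sgt)
		return -ENOMEM;

	/* ... program the single HMB descriptor as in the original patch ... */
	return 0;
}

If I understand the fallback correctly, __GFP_NOWARN would only silence the
splat while still attempting (and failing) the oversized allocation on the
dma-direct path before falling back to nvme_alloc_host_mem_multi(), whereas
the hunk above skips the single-segment attempt when preferred cannot fit
into one MAX_PAGE_ORDER chunk.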

What is the preferred way to address this warning?

Thanks


