[PATCH] nvme: add DIX support for nvme-rdma
Max Gurtovoy
mgurtovoy at nvidia.com
Mon Aug 29 07:56:39 PDT 2022
On 8/29/2022 4:16 PM, Chao Leng wrote:
>
>
> On 2022/8/29 18:43, Max Gurtovoy wrote:
>> hi,
>>
>> On 8/29/2022 11:12 AM, Chao Leng wrote:
>>> Some RDMA HBAs already support DIX-DIF: the host buffer to host HBA
>>> path uses DIX, and the host HBA to target path uses DIF.
>>> This patch adds DIX support to nvme-rdma.
>>> Test results:
>>> Performance improves for all tested I/O sizes.
>>> CPU usage for 256k I/Os drops by more than 20%.
>>
>> You're adding a new PI guard type for 16-bit mode only, aren't you?
> No, currently DIX is only defined for 16-bit mode.
>>
>> I don't think the subject and commit message are correct.
>>
>> Can you please attach to the commit message a table of performance
>> numbers before and after the patch?
> OK, I will give you the detailed performance numbers later.
>>
>> You can also mention that in iSER, if supported, the default is to
>> use IP_CHECKSUM for DIX and not CRC.
> According to the DIX definition, DIX = IP_CHECKSUM.
> To reduce CPU utilization, the end-to-end data integrity for SCSI
> protocols is DIX-DIF when supported by hardware.
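Right, and that is where the CPU saving comes from: the IP checksum
guard is a one's-complement sum, which is much cheaper per byte than
the T10 CRC16. A minimal userspace sketch of the two guard
computations (my own illustration, not from the patch; in-kernel these
are crc_t10dif() and ip_compute_csum() behind the t10_pi profiles):

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* T10 DIF guard: CRC16, polynomial 0x8BB7, bit-serial for clarity. */
static uint16_t t10_crc16(const uint8_t *buf, size_t len)
{
	uint16_t crc = 0;

	while (len--) {
		crc ^= (uint16_t)(*buf++) << 8;
		for (int i = 0; i < 8; i++)
			crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7 : crc << 1;
	}
	return crc;
}

/* DIX guard: 16-bit one's-complement (IP) checksum, far less work per byte. */
static uint16_t ip_csum16(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;

	for (size_t i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)buf[i] << 8) | buf[i + 1];
	if (len & 1)
		sum += (uint32_t)buf[len - 1] << 8;
	while (sum >> 16)	/* fold carries back into the low 16 bits */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t sector[512] = { 0xab };	/* dummy sector data */

	printf("crc16=0x%04x ipcsum=0x%04x\n",
	       t10_crc16(sector, sizeof(sector)),
	       ip_csum16(sector, sizeof(sector)));
	return 0;
}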
From what I recall, DIX was the protection between host_buff -> host_device,
and DIF was the protection between host_device -> target_device.
If it is now defined as DIX == IP_CHECKSUM and DIF == CRC, please mention
that somehow in the commit message.
Something like:
"
Some RDMA HBAs already support both DIX (IP_CHECKSUM) and DIF (CRC) offloads.
Therefore, for the host buffer to host HBA path it's preferable to use
DIX for better CPU utilization.
Add DIX for host buffer to host HBA integrity, if supported by the
underlying hardware.
The wire domain protocol for data integrity is unchanged (still using DIF).
Test results (add a table please):
"
>>
>>
>>>
>>> Signed-off-by: Chao Leng <lengchao at huawei.com>
>>> ---
>>> drivers/nvme/host/core.c | 19 ++++++++++++++-----
>>> drivers/nvme/host/nvme.h | 1 +
>>> drivers/nvme/host/rdma.c | 24 ++++++++++++++++--------
>>> 3 files changed, 31 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index 2429b11eb9a8..7055d3b7b582 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -1623,7 +1623,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
>>> #ifdef CONFIG_BLK_DEV_INTEGRITY
>>> static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>>> - u32 max_integrity_segments)
>>> + u32 max_integrity_segments, bool dix_support)
>>> {
>>> struct blk_integrity integrity = { };
>>> @@ -1631,7 +1631,11 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>>> case NVME_NS_DPS_PI_TYPE3:
>>> switch (ns->guard_type) {
>>> case NVME_NVM_NS_16B_GUARD:
>>> - integrity.profile = &t10_pi_type3_crc;
>>> + if (dix_support)
>>> + integrity.profile = &t10_pi_type3_ip;
>>> + else
>>> + integrity.profile = &t10_pi_type3_crc;
>>> +
>>> integrity.tag_size = sizeof(u16) + sizeof(u32);
>>> integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
>>> break;
>>> @@ -1649,7 +1653,11 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>>> case NVME_NS_DPS_PI_TYPE2:
>>> switch (ns->guard_type) {
>>> case NVME_NVM_NS_16B_GUARD:
>>> - integrity.profile = &t10_pi_type1_crc;
>>> + if (dix_support)
>>> + integrity.profile = &t10_pi_type1_ip;
>>> + else
>>> + integrity.profile = &t10_pi_type1_crc;
>>> +
>>> integrity.tag_size = sizeof(u16);
>>> integrity.flags |= BLK_INTEGRITY_DEVICE_CAPABLE;
>>> break;
>>> @@ -1674,7 +1682,7 @@ static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>>> }
>>> #else
>>> static void nvme_init_integrity(struct gendisk *disk, struct nvme_ns *ns,
>>> - u32 max_integrity_segments)
>>> + u32 max_integrity_segments, bool dix_support)
>>> {
>>> }
>>> #endif /* CONFIG_BLK_DEV_INTEGRITY */
>>> @@ -1900,7 +1908,8 @@ static void nvme_update_disk_info(struct gendisk *disk,
>>> if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
>>> (ns->features & NVME_NS_METADATA_SUPPORTED))
>>> nvme_init_integrity(disk, ns,
>>> - ns->ctrl->max_integrity_segments);
>>> + ns->ctrl->max_integrity_segments,
>>> + ns->ctrl->dix_support);
>>> else if (!nvme_ns_has_pi(ns))
>>> capacity = 0;
>>> }
>>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>>> index bdc0ff7ed9ab..92cf93cf120b 100644
>>> --- a/drivers/nvme/host/nvme.h
>>> +++ b/drivers/nvme/host/nvme.h
>>> @@ -276,6 +276,7 @@ struct nvme_ctrl {
>>> u32 max_hw_sectors;
>>> u32 max_segments;
>>> u32 max_integrity_segments;
>>> + bool dix_support;
>>> u32 max_discard_sectors;
>>> u32 max_discard_segments;
>>> u32 max_zeroes_sectors;
>>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>>> index 3100643be299..0f63b626b3d4 100644
>>> --- a/drivers/nvme/host/rdma.c
>>> +++ b/drivers/nvme/host/rdma.c
>>> @@ -872,6 +872,9 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
>>> IBK_INTEGRITY_HANDOVER)
>>> pi_capable = true;
>>> + if (ctrl->device->dev->attrs.sig_guard_cap & IB_GUARD_T10DIF_CSUM)
>>> + ctrl->ctrl.dix_support = true;
>>> +
>>> ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
>>> pi_capable);
>>> @@ -1423,10 +1426,10 @@ static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
>>> static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
>>> struct nvme_command *cmd, struct ib_sig_domain *domain,
>>> - u16 control, u8 pi_type)
>>> + u16 control, u8 pi_type, enum ib_t10_dif_bg_type dif_type)
>>> {
>>> domain->sig_type = IB_SIG_TYPE_T10_DIF;
>>> - domain->sig.dif.bg_type = IB_T10DIF_CRC;
>>> + domain->sig.dif.bg_type = dif_type;
>>> domain->sig.dif.pi_interval = 1 << bi->interval_exp;
>>> domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
>>> if (control & NVME_RW_PRINFO_PRCHK_REF)
>>> @@ -1441,7 +1444,7 @@ static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
>>> static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
>>> struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
>>> - u8 pi_type)
>>> + u8 pi_type, bool dix_support)
>>> {
>>> u16 control = le16_to_cpu(cmd->rw.control);
>>> @@ -1450,16 +1453,20 @@ static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
>>> /* for WRITE_INSERT/READ_STRIP no memory domain */
>>> sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
>>> nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
>>> - pi_type);
>>> + pi_type, IB_T10DIF_CRC);
>>> /* Clear the PRACT bit since HCA will generate/verify the PI */
>>> control &= ~NVME_RW_PRINFO_PRACT;
>>> cmd->rw.control = cpu_to_le16(control);
>>> } else {
>>> /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
>>> nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
>>> - pi_type);
>>> - nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
>>> - pi_type);
>>> + pi_type, IB_T10DIF_CRC);
>>> + if (dix_support)
>>> + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem,
>>> + control, pi_type, IB_T10DIF_CSUM);
>>> + else
>>> + nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem,
>>> + control, pi_type, IB_T10DIF_CRC);
>>> }
>>> }
>>> @@ -1501,7 +1508,8 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
>>> goto mr_put;
>>> nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
>>> - req->mr->sig_attrs, ns->pi_type);
>>> + req->mr->sig_attrs, ns->pi_type,
>>> + ns->ctrl->dix_support);
>>> nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
>>> ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
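One more suggestion for the test section: with the patch applied, the
guard type actually in use should be visible via the block integrity
sysfs attribute (e.g. T10-DIF-TYPE1-IP vs. T10-DIF-TYPE1-CRC). A quick
userspace sanity check could look like this (the device name is just
an example):

#include <stdio.h>

int main(void)
{
	char fmt[64] = "";
	/* Prints e.g. "T10-DIF-TYPE1-IP" when the IP guard profile is used */
	FILE *f = fopen("/sys/block/nvme0n1/integrity/format", "r");

	if (f) {
		if (fgets(fmt, sizeof(fmt), f))
			printf("integrity format: %s", fmt);
		fclose(f);
	}
	return 0;
}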