[PATCH 3/4] NVMe: End-to-end data protection
David.Darrington at hgst.com
David.Darrington at hgst.com
Mon Mar 11 18:11:04 EDT 2013
Is this correct:
> +/*
> + * Valid formats must have either no meta-data, or meta-data equal to
the DIF
> + * size and formatted for protection information. The driver has no use
for
> + * meta-data for any other purpose.
> + */
This patch fails nvme_alloc_ns() if there is meta data, but data
protection is not enabled.
Are there valid use cases where the driver passes through the meta-data to
and from the controller, without generating or verifying? Is it possible
for applications to provided there own meta-data?
David Darrington
"Linux-nvme" <linux-nvme-bounces at lists.infradead.org> wrote on 03/04/2013
06:24:22 PM:
> From: Keith Busch <keith.busch at intel.com>
>
> Registers a DIF capable nvme namespace with block integrity.
>
> Most of this is a copy from sd_dif.c, which I understand parts may be
> pulled into a kernel library that can be used from nvme and other
drivers
> in the future, but will be copied here until then.
>
> If the meta-data is a separate buffer, the driver will verify and
> calcuate the data integrity on reads and writes and supply a meta-data
> in the command buffer for this. The NVMe PRACT field is set to have the
> controller generate DIF on writes and strip it on reads for lba formats
> that interleave the meta-data with the block data. LBA formats that the
> driver cannot deal with will not create a block device for that
namespace.
>
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> ---
> drivers/block/nvme.c | 258
> +++++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/nvme.h | 29 +++++-
> 2 files changed, 280 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
> index 00b4063..ff524c0 100644
> --- a/drivers/block/nvme.c
> +++ b/drivers/block/nvme.c
> @@ -20,6 +20,7 @@
> #include <linux/bio.h>
> #include <linux/bitops.h>
> #include <linux/blkdev.h>
> +#include <linux/crc-t10dif.h>
> #include <linux/delay.h>
> #include <linux/errno.h>
> #include <linux/fs.h>
> @@ -94,6 +95,8 @@ struct nvme_ns {
>
> int ns_id;
> int lba_shift;
> + int pi_type;
> + int extended;
> };
>
> /*
> @@ -158,6 +161,189 @@ static struct nvme_cmd_info
> *nvme_cmd_info(struct nvme_queue *nvmeq)
> return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
> }
>
> +/*
> + * Data Integrity Field tuple.
> + */
> +struct sd_dif_tuple {
> + __be16 guard_tag; /* Checksum */
> + __be16 app_tag; /* Opaque storage */
> + __be32 ref_tag; /* Target LBA or indirect LBA */
> +};
> +
> +static void sd_dif_type1_generate(struct blk_integrity_exchg *bix)
> +{
> + void *buf = bix->data_buf;
> + struct sd_dif_tuple *sdt = bix->prot_buf;
> + sector_t sector = bix->sector;
> + unsigned int i;
> +
> + for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) {
> + sdt->guard_tag = cpu_to_be16(crc_t10dif(buf, bix->sector_size));
> + sdt->ref_tag = cpu_to_be32(sector & 0xffffffff);
> + sdt->app_tag = 0;
> +
> + buf += bix->sector_size;
> + sector++;
> + }
> +}
> +
> +static int sd_dif_type1_verify(struct blk_integrity_exchg *bix)
> +{
> + void *buf = bix->data_buf;
> + struct sd_dif_tuple *sdt = bix->prot_buf;
> + sector_t sector = bix->sector;
> + unsigned int i;
> + __u16 csum;
> +
> + for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) {
> + /* Unwritten sectors */
> + if (sdt->app_tag == 0xffff)
> + return 0;
> +
> + if (be32_to_cpu(sdt->ref_tag) != (sector & 0xffffffff)) {
> + printk(KERN_ERR
> + "%s: ref tag error on sector %lu (rcvd %u)\n",
> + bix->disk_name, (unsigned long)sector,
> + be32_to_cpu(sdt->ref_tag));
> + return -EIO;
> + }
> +
> + csum = cpu_to_be16(crc_t10dif(buf, bix->sector_size));
> + if (sdt->guard_tag != csum) {
> + printk(KERN_ERR "%s: guard tag error on sector %lu " \
> + "(rcvd %04x, data %04x)\n", bix->disk_name,
> + (unsigned long)sector,
> + be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum));
> + return -EIO;
> + }
> +
> + buf += bix->sector_size;
> + sector++;
> + }
> +
> + return 0;
> +}
> +
> +static void sd_dif_type1_set_tag(void *prot, void *tag_buf,
> unsigned int sectors)
> +{
> + struct sd_dif_tuple *sdt = prot;
> + u8 *tag = tag_buf;
> + unsigned int i, j;
> +
> + for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) {
> + sdt->app_tag = tag[j] << 8 | tag[j+1];
> + BUG_ON(sdt->app_tag == 0xffff);
> + }
> +}
> +
> +static void sd_dif_type1_get_tag(void *prot, void *tag_buf,
> unsigned int sectors)
> +{
> + struct sd_dif_tuple *sdt = prot;
> + u8 *tag = tag_buf;
> + unsigned int i, j;
> +
> + for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) {
> + tag[j] = (sdt->app_tag & 0xff00) >> 8;
> + tag[j+1] = (sdt->app_tag & 0xff);
> + }
> +}
> +
> +static struct blk_integrity sd_dif_type1_integrity = {
> + .name = "T10-DIF-TYPE1-CRC",
> + .generate_fn = sd_dif_type1_generate,
> + .verify_fn = sd_dif_type1_verify,
> + .set_tag_fn = sd_dif_type1_set_tag,
> + .get_tag_fn = sd_dif_type1_get_tag,
> + .tuple_size = sizeof(struct sd_dif_tuple),
> + .tag_size = sizeof(u16),
> +};
> +
> +static void sd_dif_type3_generate(struct blk_integrity_exchg *bix)
> +{
> + void *buf = bix->data_buf;
> + struct sd_dif_tuple *sdt = bix->prot_buf;
> + unsigned int i;
> +
> + for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) {
> + sdt->guard_tag = cpu_to_be16(crc_t10dif(buf, bix->sector_size));
> + sdt->ref_tag = 0;
> + sdt->app_tag = 0;
> +
> + buf += bix->sector_size;
> + }
> +}
> +
> +static int sd_dif_type3_verify(struct blk_integrity_exchg *bix)
> +{
> + void *buf = bix->data_buf;
> + struct sd_dif_tuple *sdt = bix->prot_buf;
> + sector_t sector = bix->sector;
> + unsigned int i;
> + __u16 csum;
> +
> + for (i = 0 ; i < bix->data_size ; i += bix->sector_size, sdt++) {
> + /* Unwritten sectors */
> + if (sdt->app_tag == 0xffff && sdt->ref_tag == 0xffffffff)
> + continue;
> +
> + csum = cpu_to_be16(crc_t10dif(buf, bix->sector_size));
> +
> + if (sdt->guard_tag != csum) {
> + printk(KERN_ERR "%s: guard error on sector %lu" \
> + "(rcvd:%04x data:%04x)\n", bix->disk_name,
> + (unsigned long) sector,
> + be16_to_cpu(sdt->guard_tag), be16_to_cpu(csum));
> + return -EIO;
> + }
> +
> + buf += bix->sector_size;
> + sector++;
> + }
> +
> + return 0;
> +}
> +
> +static void sd_dif_type3_set_tag(void *prot, void *tag_buf,
> unsigned int sectors)
> +{
> + struct sd_dif_tuple *sdt = prot;
> + u8 *tag = tag_buf;
> + unsigned int i, j;
> +
> + for (i = 0, j = 0 ; i < sectors ; i++, j += 6, sdt++) {
> + sdt->app_tag = tag[j] << 8 | tag[j+1];
> + sdt->ref_tag = tag[j+2] << 24 | tag[j+3] << 16 |
> + tag[j+4] << 8 | tag[j+5];
> + }
> +}
> +
> +static void sd_dif_type3_get_tag(void *prot, void *tag_buf,
> unsigned int sectors)
> +{
> + struct sd_dif_tuple *sdt = prot;
> + u8 *tag = tag_buf;
> + unsigned int i, j;
> +
> + for (i = 0, j = 0 ; i < sectors ; i++, j += 2, sdt++) {
> + tag[j] = (sdt->app_tag & 0xff00) >> 8;
> + tag[j+1] = (sdt->app_tag & 0xff);
> + tag[j+2] = (sdt->ref_tag & 0xff000000) >> 24;
> + tag[j+3] = (sdt->ref_tag & 0xff0000) >> 16;
> + tag[j+4] = (sdt->ref_tag & 0xff00) >> 8;
> + tag[j+5] = (sdt->ref_tag & 0xff);
> + BUG_ON(sdt->app_tag == 0xffff || sdt->ref_tag == 0xffffffff);
> + }
> +}
> +
> +
> +static struct blk_integrity sd_dif_type3_integrity = {
> + .name = "T10-DIF-TYPE3-CRC",
> + .generate_fn = sd_dif_type3_generate,
> + .verify_fn = sd_dif_type3_verify,
> + .set_tag_fn = sd_dif_type3_set_tag,
> + .get_tag_fn = sd_dif_type3_get_tag,
> + .tuple_size = sizeof(struct sd_dif_tuple),
> + .tag_size = sizeof(u16) + sizeof(u32),
> +};
> +
> /**
> * alloc_cmdid() - Allocate a Command ID
> * @nvmeq: The queue that will be used for this command
> @@ -313,6 +499,9 @@ struct nvme_iod {
> int nents; /* Used in scatterlist */
> int length; /* Of data, in bytes */
> dma_addr_t first_dma;
> + dma_addr_t meta_dma;
> + unsigned int meta_size;
> + enum dma_data_direction dma_dir;
> struct scatterlist sg[0];
> };
>
> @@ -344,6 +533,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t
gfp)
> iod->npages = -1;
> iod->length = nbytes;
> iod->nents = 0;
> + iod->meta_size = 0;
> }
>
> return iod;
> @@ -364,6 +554,9 @@ static void nvme_free_iod(struct nvme_dev *dev,
> struct nvme_iod *iod)
> dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
> prp_dma = next_prp_dma;
> }
> + if (iod->meta_size)
> + dma_unmap_single(&dev->pci_dev->dev, iod->meta_dma,
> + iod->meta_size, iod->dma_dir);
> kfree(iod);
> }
>
> @@ -649,6 +842,7 @@ static int nvme_submit_bio_queue(struct
> nvme_queue *nvmeq, struct nvme_ns *ns,
> cmnd->rw.opcode = nvme_cmd_read;
> dma_dir = DMA_FROM_DEVICE;
> }
> + iod->dma_dir = dma_dir;
>
> result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
> if (result <= 0)
> @@ -661,6 +855,27 @@ static int nvme_submit_bio_queue(struct
> nvme_queue *nvmeq, struct nvme_ns *ns,
> GFP_ATOMIC);
> cmnd->rw.slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
> cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
> +
> + if (ns->pi_type) {
> + control |= NVME_RW_PRINFO_PRCHK_GUARD;
> + if (ns->pi_type != NVME_NS_DPS_PI_TYPE3) {
> + control |= NVME_RW_PRINFO_PRCHK_REF;
> + cmnd->rw.reftag = cpu_to_le32(
> + (bio->bi_sector >> (ns->lba_shift - 9)) &
> + 0xffffffff);
> + }
> + if (bio_integrity(bio)) {
> + iod->meta_dma = dma_map_single(nvmeq->q_dmadev,
> + bio->bi_integrity->bip_buf,
> + bio->bi_integrity->bip_size,
> + dma_dir);
> + iod->meta_size = bio->bi_integrity->bip_size;
> + cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
> + } else {
> + control |= NVME_RW_PRINFO_PRACT;
> + }
> + }
> +
> cmnd->rw.control = cpu_to_le16(control);
> cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
>
> @@ -1404,16 +1619,46 @@ static void nvme_put_ns_idx(int index)
> spin_unlock(&dev_list_lock);
> }
>
> +static void nvme_ns_register_pi(struct nvme_ns *ns)
> +{
> + struct blk_integrity *integrity;
> + if (ns->pi_type == NVME_NS_DPS_PI_TYPE3)
> + integrity = &sd_dif_type3_integrity;
> + else
> + integrity = &sd_dif_type1_integrity;
> + blk_integrity_register(ns->disk, integrity);
> +}
> +
> +/*
> + * Valid formats must have either no meta-data, or meta-data equal to
the DIF
> + * size and formatted for protection information. The driver has no use
for
> + * meta-data for any other purpose.
> + */
> +static int nvme_check_pi_format(struct nvme_id_ns *id)
> +{
> + int lbaf = id->flbas & NVME_NS_FLBAS_LBAF_MASK;
> + int ms = id->lbaf[lbaf].ms;
> + if (id->dps & NVME_NS_DPS_PI_MASK && ms == sizeof(struct
sd_dif_tuple))
> + return id->dps & NVME_NS_DPS_PI_MASK;
> + else if (ms)
> + return -1;
> + return 0;
> +}
> +
> static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid,
> struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
> {
> struct nvme_ns *ns;
> struct gendisk *disk;
> - int lbaf;
> + int lbaf, pi_type;
>
> if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
> return NULL;
>
> + pi_type = nvme_check_pi_format(id);
> + if (pi_type < 0)
> + return NULL;
> +
> ns = kzalloc(sizeof(*ns), GFP_KERNEL);
> if (!ns)
> return NULL;
> @@ -1428,6 +1673,10 @@ static struct nvme_ns *nvme_alloc_ns(struct
> nvme_dev *dev, int nsid,
> ns->dev = dev;
> ns->queue->queuedata = ns;
>
> + ns->pi_type = pi_type;
> + if (pi_type)
> + ns->extended = id->flbas & NVME_NS_FLBAS_LBA_EXTENDED;
> +
> disk = alloc_disk(NVME_MINORS);
> if (!disk)
> goto out_free_queue;
> @@ -1603,8 +1852,11 @@ static int __devinit nvme_dev_add(struct nvme_dev
*dev)
> if (ns)
> list_add_tail(&ns->list, &dev->namespaces);
> }
> - list_for_each_entry(ns, &dev->namespaces, list)
> + list_for_each_entry(ns, &dev->namespaces, list) {
> add_disk(ns->disk);
> + if (!ns->extended && ns->pi_type)
> + nvme_ns_register_pi(ns);
> + }
>
> goto out;
>
> @@ -1629,6 +1881,8 @@ static int nvme_dev_remove(struct nvme_dev *dev)
>
> list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
> list_del(&ns->list);
> + if (!ns->extended && ns->pi_type)
> + blk_integrity_unregister(ns->disk);
> del_gendisk(ns->disk);
> nvme_ns_free(ns);
> }
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 4fa3b0b..ee0a1f6 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -130,11 +130,25 @@ struct nvme_id_ns {
> };
>
> enum {
> - NVME_NS_FEAT_THIN = 1 << 0,
> - NVME_LBAF_RP_BEST = 0,
> - NVME_LBAF_RP_BETTER = 1,
> - NVME_LBAF_RP_GOOD = 2,
> - NVME_LBAF_RP_DEGRADED = 3,
> + NVME_NS_FEAT_THIN = 1 << 0,
> + NVME_NS_MC_EXTENDED = 1 << 0,
> + NVME_NS_MC_SEPARATE = 1 << 1,
> + NVME_NS_FLBAS_LBA_EXTENDED = 1 << 4,
> + NVME_NS_FLBAS_LBAF_MASK = 0xf,
> + NVME_NS_DPC_PI_LAST = 1 << 4,
> + NVME_NS_DPC_PI_FIRST = 1 << 3,
> + NVME_NS_DPC_PI_TYPE3 = 1 << 2,
> + NVME_NS_DPC_PI_TYPE2 = 1 << 1,
> + NVME_NS_DPC_PI_TYPE1 = 1 << 0,
> + NVME_NS_DPS_PI_FIRST = 1 << 3,
> + NVME_NS_DPS_PI_MASK = 0x7,
> + NVME_NS_DPS_PI_TYPE1 = 1,
> + NVME_NS_DPS_PI_TYPE2 = 2,
> + NVME_NS_DPS_PI_TYPE3 = 3,
> + NVME_LBAF_RP_BEST = 0,
> + NVME_LBAF_RP_BETTER = 1,
> + NVME_LBAF_RP_GOOD = 2,
> + NVME_LBAF_RP_DEGRADED = 3,
> };
>
> struct nvme_smart_log {
> @@ -244,6 +258,11 @@ enum {
> NVME_RW_DSM_LATENCY_LOW = 3 << 4,
> NVME_RW_DSM_SEQ_REQ = 1 << 6,
> NVME_RW_DSM_COMPRESSED = 1 << 7,
> + NVME_RW_PRINFO_PRACT = 1 << 13,
> + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
> + NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
> + NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
> +
> };
>
> /* Admin commands */
> --
> 1.7.0.4
>
>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> http://merlin.infradead.org/mailman/listinfo/linux-nvme
More information about the Linux-nvme
mailing list