[PATCH] NVMe: Metadata and PI format support
Sam Bradshaw
sbradshaw at micron.com
Tue Feb 17 09:03:54 PST 2015
>> If there is interest in incorporating this support, we can provide
>> a patch on top of this that enables 16b/32b/64b metadata with PI and
>> supports PIL={0,1}
>
> I would also like that to work. I was hoping to reuse the
> t10_pi_generate/verify functions for this. Those can work if the
> iter->prot_buf is incremented by the blk_intergity's tuple_size rather
> than just the 8-byte PI field.
>
> That'd work with PIL=0, not sure what to do with PIL=1. Maybe we should
> just define our own functions for nvme even though they'd mostly be the
> same as the ones scsi uses.
We were unable to come up with an elegant solution that reused the
t10_pi_generate/verify functions while also handling the PIL=1 case.
So we defined our own functions.
---
drivers/block/nvme-core.c | 319 +++++++++++++++++++++++++++++++++++++++++++--
include/linux/nvme.h | 22 +++
2 files changed, 328 insertions(+), 13 deletions(-)
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index 23df65d..692eb31 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -38,6 +38,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
+#include <linux/crc-t10dif.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>
@@ -449,6 +450,7 @@ static void nvme_dif_remap(struct request *req,
struct nvme_ns *ns = req->rq_disk->private_data;
struct bio_integrity_payload *bip;
struct t10_pi_tuple *pi;
+ int pi_sz = sizeof(struct t10_pi_tuple);
void *p;
u32 i, nlb, ts, phys, virt;
@@ -468,6 +470,11 @@ static void nvme_dif_remap(struct request *req,
nlb = (blk_rq_bytes(req) >> ns->lba_shift);
ts = ns->disk->integrity->tuple_size;
+ /* Adjust if metadata > 8b and PI is in last 8b of metadata */
+ if ((ns->ms > pi_sz) && !ns->pi_first)
+ p = (struct t10_pi_tuple *) ((char *) p + ns->ms
+ - pi_sz);
+
for (i = 0; i < nlb; i++, virt++, phys++) {
pi = (struct t10_pi_tuple *)p;
dif_swap(phys, virt, pi);
@@ -1944,6 +1951,183 @@ static void nvme_config_discard(struct nvme_ns *ns)
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}
+/*
+ * Calculates CRC value for the data buffer of given len, and returns
+ * it in big-endian format
+ */
+static __be16 nvme_crc_fn(void *data, unsigned int len)
+{
+ return cpu_to_be16(crc_t10dif(data, len));
+}
+
+/*
+ * Calculates unified CRC value for the inputted crc and data buffer of given
+ * len, and returns it in big-endian format
+ */
+static __be16 nvme_crc_generic_fn(__u16 crc, void *data, unsigned int len)
+{
+ return cpu_to_be16(crc_t10dif_generic(crc, data, len));
+}
+
+static int nvme_generate(struct blk_integrity_iter *bix, csum_fn *fn, u8 ms,
+ u8 type, u8 first)
+{
+ struct t10_pi_tuple *pi = bix->prot_buf;
+ int i, pi_sz = sizeof(struct t10_pi_tuple);
+
+ /* Adjust if metadata > 8b and PI is in last 8b of metadata */
+ if ((ms > pi_sz) && (!first))
+ pi = (struct t10_pi_tuple *) ((char *) pi + ms - pi_sz);
+
+ for (i = 0; i < bix->data_size; i += bix->interval) {
+
+ pi->guard_tag = fn(bix->data_buf, bix->interval);
+
+ /*
+ * Include metadata for CRC calc if PI is in last 8b
+ * of metadata
+ */
+ if ((ms > pi_sz) && (!first)) {
+ u8 *p = ((u8 *) pi - (ms - pi_sz));
+
+ memset(p, 0, ms - pi_sz);
+ pi->guard_tag = nvme_crc_generic_fn(
+ be16_to_cpu(pi->guard_tag),
+ p,
+ ms - pi_sz);
+ }
+
+ /*
+ * 'seed' is a virtual start sector that we received, and
+ * this func populates ref_tag based on it. The ref_tag will
+ * be remapped with actual physical sector in nvme_dif_remap()
+ */
+ if (type != NVME_NS_DPS_PI_TYPE3)
+ pi->ref_tag = cpu_to_be32(lower_32_bits(bix->seed));
+ else
+ pi->ref_tag = 0;
+
+ pi->app_tag = 0;
+ bix->data_buf += bix->interval;
+ bix->prot_buf += ms;
+ bix->seed++;
+ pi = (struct t10_pi_tuple *) ((char *) pi + ms);
+ }
+ return 0;
+}
+
+/* DEFINE_GENERATOR(ms, pi_type, pi_first) */
+DEFINE_GENERATOR(8, 1, 0);
+DEFINE_GENERATOR(8, 2, 0);
+DEFINE_GENERATOR(8, 3, 0);
+DEFINE_GENERATOR(16, 1, 0);
+DEFINE_GENERATOR(16, 1, 1);
+DEFINE_GENERATOR(16, 2, 0);
+DEFINE_GENERATOR(16, 2, 1);
+DEFINE_GENERATOR(16, 3, 0);
+DEFINE_GENERATOR(16, 3, 1);
+DEFINE_GENERATOR(32, 1, 0);
+DEFINE_GENERATOR(32, 1, 1);
+DEFINE_GENERATOR(32, 2, 0);
+DEFINE_GENERATOR(32, 2, 1);
+DEFINE_GENERATOR(32, 3, 0);
+DEFINE_GENERATOR(32, 3, 1);
+DEFINE_GENERATOR(64, 1, 0);
+DEFINE_GENERATOR(64, 1, 1);
+DEFINE_GENERATOR(64, 2, 0);
+DEFINE_GENERATOR(64, 2, 1);
+DEFINE_GENERATOR(64, 3, 0);
+DEFINE_GENERATOR(64, 3, 1);
+
+static int nvme_verify(struct blk_integrity_iter *bix, csum_fn *fn, u8 ms,
+ u8 type, u8 first)
+{
+ struct t10_pi_tuple *pi = bix->prot_buf;
+ int i, pi_sz = sizeof(struct t10_pi_tuple);
+ u16 csum;
+
+ /* If metadata > 8 bytes and PI is in last 8 bytes of metadata */
+ if ((ms > pi_sz) && (!first))
+ pi = (struct t10_pi_tuple *) ((char *) pi + ms - pi_sz);
+
+ for (i = 0; i < bix->data_size; i += bix->interval) {
+ /*
+ * For type1 & type2, apptag == 0xffff means skip verification
+ * For type3, apptag == 0xffff and reftag == 0xffffffff means
+ * skip verification
+ */
+ if (pi->app_tag == 0xffff) {
+ switch (type) {
+ case NVME_NS_DPS_PI_TYPE3:
+ if (pi->ref_tag != 0xffffffff)
+ break;
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ goto next;
+ }
+ }
+
+ csum = fn(bix->data_buf, bix->interval);
+
+ /* Update CRC if PI is in last 8 bytes of metadata */
+ if ((ms > pi_sz) && (!first)) {
+ csum = nvme_crc_generic_fn(
+ be16_to_cpu(csum),
+ ((u8 *) pi - (ms - pi_sz)),
+ ms - pi_sz);
+ }
+
+ if (pi->guard_tag != csum) {
+ pr_err("%s: guard tag error at sector %llu "
+ "(rcvd %04x, want %04x)\n", bix->disk_name,
+ (unsigned long long)bix->seed,
+ be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
+ return -EILSEQ;
+ }
+ /*
+ * ref_tag would have been remapped with virtual sector values
+ * by nvme_dif_remap(). So match it directly with 'seed'.
+ */
+ if (be32_to_cpu(pi->ref_tag) !=
+ lower_32_bits(bix->seed)) {
+ pr_err("%s: ref tag error at location %llu "
+ "(rcvd %u)\n", bix->disk_name,
+ (unsigned long long)
+ bix->seed, be32_to_cpu(pi->ref_tag));
+ return -EILSEQ;
+ }
+ next:
+ bix->data_buf += bix->interval;
+ bix->prot_buf += ms;
+ bix->seed++;
+ pi = (struct t10_pi_tuple *) ((char *) pi + ms);
+ }
+ return 0;
+}
+
+/* DEFINE_VERIFIER(ms, pi_type, pi_first) */
+DEFINE_VERIFIER(8, 1, 0);
+DEFINE_VERIFIER(8, 2, 0);
+DEFINE_VERIFIER(8, 3, 0);
+DEFINE_VERIFIER(16, 1, 0);
+DEFINE_VERIFIER(16, 1, 1);
+DEFINE_VERIFIER(16, 2, 0);
+DEFINE_VERIFIER(16, 2, 1);
+DEFINE_VERIFIER(16, 3, 0);
+DEFINE_VERIFIER(16, 3, 1);
+DEFINE_VERIFIER(32, 1, 0);
+DEFINE_VERIFIER(32, 1, 1);
+DEFINE_VERIFIER(32, 2, 0);
+DEFINE_VERIFIER(32, 2, 1);
+DEFINE_VERIFIER(32, 3, 0);
+DEFINE_VERIFIER(32, 3, 1);
+DEFINE_VERIFIER(64, 1, 0);
+DEFINE_VERIFIER(64, 1, 1);
+DEFINE_VERIFIER(64, 2, 0);
+DEFINE_VERIFIER(64, 2, 1);
+DEFINE_VERIFIER(64, 3, 0);
+DEFINE_VERIFIER(64, 3, 1);
+
static int nvme_noop_verify(struct blk_integrity_iter *iter)
{
return 0;
@@ -1960,22 +2144,130 @@ struct blk_integrity nvme_meta_noop = {
.verify_fn = nvme_noop_verify,
};
+/*
+ * Supports PI within metadata of size 8/16/32/64b formats.
+ * For rest of metadata formats, 'noop' functions are registered.
+ */
static void nvme_init_integrity(struct nvme_ns *ns)
{
- struct blk_integrity integrity;
+ struct blk_integrity integrity = nvme_meta_noop;
- switch (ns->pi_type) {
- case NVME_NS_DPS_PI_TYPE3:
- integrity = t10_pi_type3_crc;
+ switch (ns->ms) {
+ case NVME_8B_META:
+ integrity.name = "NVME_META_8B";
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ integrity.generate_fn = nvme_gen_8_1_0;
+ integrity.verify_fn = nvme_ver_8_1_0;
+ break;
+ case NVME_NS_DPS_PI_TYPE2:
+ integrity.generate_fn = nvme_gen_8_2_0;
+ integrity.verify_fn = nvme_ver_8_2_0;
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ integrity.generate_fn = nvme_gen_8_3_0;
+ integrity.verify_fn = nvme_ver_8_3_0;
+ break;
+ }
break;
- case NVME_NS_DPS_PI_TYPE1:
- case NVME_NS_DPS_PI_TYPE2:
- integrity = t10_pi_type1_crc;
+ case NVME_16B_META:
+ integrity.name = "NVME_META_16B";
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_16_1_1;
+ integrity.verify_fn = nvme_ver_16_1_1;
+ } else {
+ integrity.generate_fn = nvme_gen_16_1_0;
+ integrity.verify_fn = nvme_ver_16_1_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE2:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_16_2_1;
+ integrity.verify_fn = nvme_ver_16_2_1;
+ } else {
+ integrity.generate_fn = nvme_gen_16_2_0;
+ integrity.verify_fn = nvme_ver_16_2_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_16_3_1;
+ integrity.verify_fn = nvme_ver_16_3_1;
+ } else {
+ integrity.generate_fn = nvme_gen_16_3_0;
+ integrity.verify_fn = nvme_ver_16_3_0;
+ }
+ break;
+ }
break;
- default:
- integrity = nvme_meta_noop;
+ case NVME_32B_META:
+ integrity.name = "NVME_META_32B";
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_32_1_1;
+ integrity.verify_fn = nvme_ver_32_1_1;
+ } else {
+ integrity.generate_fn = nvme_gen_32_1_0;
+ integrity.verify_fn = nvme_ver_32_1_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE2:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_32_2_1;
+ integrity.verify_fn = nvme_ver_32_2_1;
+ } else {
+ integrity.generate_fn = nvme_gen_32_2_0;
+ integrity.verify_fn = nvme_ver_32_2_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_32_3_1;
+ integrity.verify_fn = nvme_ver_32_3_1;
+ } else {
+ integrity.generate_fn = nvme_gen_32_3_0;
+ integrity.verify_fn = nvme_ver_32_3_0;
+ }
+ break;
+ }
+ break;
+ case NVME_64B_META:
+ integrity.name = "NVME_META_64B";
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE1:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_64_1_1;
+ integrity.verify_fn = nvme_ver_64_1_1;
+ } else {
+ integrity.generate_fn = nvme_gen_64_1_0;
+ integrity.verify_fn = nvme_ver_64_1_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE2:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_64_2_1;
+ integrity.verify_fn = nvme_ver_64_2_1;
+ } else {
+ integrity.generate_fn = nvme_gen_64_2_0;
+ integrity.verify_fn = nvme_ver_64_2_0;
+ }
+ break;
+ case NVME_NS_DPS_PI_TYPE3:
+ if (ns->pi_first) {
+ integrity.generate_fn = nvme_gen_64_3_1;
+ integrity.verify_fn = nvme_ver_64_3_1;
+ } else {
+ integrity.generate_fn = nvme_gen_64_3_0;
+ integrity.verify_fn = nvme_ver_64_3_0;
+ }
+ break;
+ }
break;
}
+
integrity.tuple_size = ns->ms;
blk_integrity_register(ns->disk, &integrity);
blk_queue_max_integrity_segments(ns->queue, 1);
@@ -1987,7 +2279,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
struct nvme_dev *dev = ns->dev;
struct nvme_id_ns *id;
dma_addr_t dma_addr;
- int lbaf, pi_type, old_ms;
+ int lbaf, pi_type, old_ms, pi_first;
unsigned short bs;
id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
@@ -2017,16 +2309,17 @@ static int nvme_revalidate_disk(struct gendisk *disk)
ns->lba_shift = 9;
bs = 1 << ns->lba_shift;
- /* XXX: PI implementation requires metadata equal t10 pi tuple size */
- pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
- id->dps & NVME_NS_DPS_PI_MASK : 0;
+ pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ pi_first = (id->dps & NVME_NS_DPS_PI_FIRST) >> 3;
if (disk->integrity && (ns->pi_type != pi_type || ns->ms != old_ms ||
bs != queue_logical_block_size(disk->queue) ||
+ ns->pi_first != pi_first ||
(ns->ms && id->flbas & NVME_NS_FLBAS_META_EXT)))
blk_integrity_unregister(disk);
ns->pi_type = pi_type;
+ ns->pi_first = pi_first;
blk_queue_logical_block_size(ns->queue, bs);
if (ns->ms && !disk->integrity && (disk->flags & GENHD_FL_UP) &&
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index e1fcea0..05cd302 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -122,6 +122,7 @@ struct nvme_ns {
int lba_shift;
int ms;
int pi_type;
+ int pi_first; /* PI location in metadata. Whether first or last 8b*/
u64 mode_select_num_blocks;
u32 mode_select_block_len;
};
@@ -148,6 +149,27 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
return (sector >> (ns->lba_shift - 9));
}
+#define NVME_8B_META 8
+#define NVME_16B_META 16
+#define NVME_32B_META 32
+#define NVME_64B_META 64
+
+#define DEFINE_GENERATOR(m, t, f) GENERATOR(m, t, f, _)
+#define DEFINE_VERIFIER(m, t, f) VERIFIER(m, t, f, _)
+
+#define GENERATOR(m, t, f, u) \
+ static int nvme_gen_##m##u##t##u##f(struct blk_integrity_iter *bix) \
+ { \
+ return nvme_generate(bix, nvme_crc_fn, m, t, f); \
+ }
+#define VERIFIER(m, t, f, u) \
+ static int nvme_ver_##m##u##t##u##f(struct blk_integrity_iter *bix) \
+ { \
+ return nvme_verify(bix, nvme_crc_fn, m, t, f); \
+ }
+
+typedef __u16 (csum_fn) (void*, unsigned int);
+
/**
* nvme_free_iod - frees an nvme_iod
* @dev: The device that the I/O was submitted to
--
1.7.1
More information about the Linux-nvme
mailing list