[PATCH 2/8] nvme: set uniform metadata settings for ns head

Mon Jan 22 06:56:53 PST 2024

Do not allow having different metadata settings for a namespace and its
associated head.

In case of a different metadata settings discovered for a shared
namespace, that can caused by controller or path capabilities, create a
dedicated namespace head for each controller for that namespace.

For example, in the fabrics case, where there might be a multipath
configuration which one controller/path supports metadata offloading and
the second path that doesn't. In that case, we don't want to put these
paths and their namespaces under the same multipath umbrella.
Therefore, if the metadata settings are not the same we will create a
separate namespace head for each such namespace.

While we're here, also make sure that all the namespaces under the
namespace head have the same lba shift size.

Reviewed-by: Israel Rukshin <israelr at nvidia.com>
Signed-off-by: Ori Evron <oevron at nvidia.com>
Signed-off-by: Max Gurtovoy <mgurtovoy at nvidia.com>
---
 drivers/nvme/host/core.c | 372 +++++++++++++++++++++++++--------------
 1 file changed, 236 insertions(+), 136 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ff18c263c62c..9d46acf5b6cb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -35,6 +35,14 @@
 struct nvme_ns_info {
 	struct nvme_ns_ids ids;
 	u32 nsid;
+	unsigned long features;
+	u64 nuse;
+	u16 ms;
+	u16 pi_size;
+	u8 pi_type;
+	u8 guard_type;
+	u8 flbas;
+	u8 ds;
 	__le32 anagrpid;
 	bool is_shared;
 	bool is_readonly;
@@ -1473,6 +1481,126 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	return error;
 }
 
+static int nvme_elbaf_from_id_ns_nvm(struct nvme_ctrl *ctrl, u32 nsid,
+		unsigned lbaf, u32 *elbaf)
+{
+	struct nvme_command c = { };
+	struct nvme_id_ns_nvm *nvm;
+	int ret;
+
+	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
+	if (!nvm)
+		return -ENOMEM;
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+	c.identify.cns = NVME_ID_CNS_CS_NS;
+	c.identify.csi = NVME_CSI_NVM;
+
+	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
+	if (ret)
+		goto free_data;
+
+	*elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+
+free_data:
+	kfree(nvm);
+
+	return ret;
+}
+
+static int nvme_info_init(struct nvme_ctrl *ctrl, struct nvme_ns_info *info,
+		struct nvme_id_ns *id)
+{
+	bool first = id->dps & NVME_NS_DPS_PI_FIRST;
+	unsigned lbaf = nvme_lbaf_index(id->flbas);
+	int ret = 0;
+	u32 elbaf;
+
+	info->nuse = le64_to_cpu(id->nuse);
+	info->flbas = id->flbas;
+	info->ds = id->lbaf[lbaf].ds;
+	info->pi_size = 0;
+	info->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+		info->pi_size = sizeof(struct t10_pi_tuple);
+		info->guard_type = NVME_NVM_NS_16B_GUARD;
+		goto set_pi;
+	}
+
+	ret = nvme_elbaf_from_id_ns_nvm(ctrl, info->nsid, lbaf, &elbaf);
+	if (ret)
+		goto set_pi;
+
+	/* no support for storage tag formats right now */
+	if (nvme_elbaf_sts(elbaf))
+		goto set_pi;
+
+	info->guard_type = nvme_elbaf_guard_type(elbaf);
+	switch (info->guard_type) {
+	case NVME_NVM_NS_64B_GUARD:
+		info->pi_size = sizeof(struct crc64_pi_tuple);
+		break;
+	case NVME_NVM_NS_16B_GUARD:
+		info->pi_size = sizeof(struct t10_pi_tuple);
+		break;
+	default:
+		break;
+	}
+
+set_pi:
+	if (info->pi_size && (first || info->ms == info->pi_size))
+		info->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+	else
+		info->pi_type = 0;
+
+	return ret;
+}
+
+static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
+		struct nvme_ns_info *info)
+{
+	info->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+	if (!info->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
+		return;
+
+	if (ctrl->ops->flags & NVME_F_FABRICS) {
+		/*
+		 * The NVMe over Fabrics specification only supports metadata as
+		 * part of the extended data LBA.  We rely on HCA/HBA support to
+		 * remap the separate metadata buffer from the block layer.
+		 */
+		if (WARN_ON_ONCE(!(info->flbas & NVME_NS_FLBAS_META_EXT)))
+			return;
+
+		info->features |= NVME_NS_EXT_LBAS;
+
+		/*
+		 * The current fabrics transport drivers support namespace
+		 * metadata formats only if pi_type != 0 and ms == pi_size.
+		 * Suppress support for all other formats so the namespace will
+		 * have a 0 capacity and not be usable through the block stack.
+		 *
+		 * Note, this check will need to be modified if any drivers
+		 * gain the ability to use other metadata formats.
+		 */
+		if (ctrl->max_integrity_segments && info->pi_type &&
+		    info->ms == info->pi_size)
+			info->features |= NVME_NS_METADATA_SUPPORTED;
+	} else {
+		/*
+		 * For PCIe controllers, we can't easily remap the separate
+		 * metadata buffer from the block layer and thus require a
+		 * separate metadata buffer for block layer metadata/PI support.
+		 * We allow extended LBAs for the passthrough interface, though.
+		 */
+		if (info->flbas & NVME_NS_FLBAS_META_EXT)
+			info->features |= NVME_NS_EXT_LBAS;
+		else
+			info->features |= NVME_NS_METADATA_SUPPORTED;
+	}
+}
+
 static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 		struct nvme_ns_info *info)
 {
@@ -1491,6 +1619,21 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl,
 		goto error;
 	}
 
+	ret = nvme_info_init(ctrl, info, id);
+	if (ret)
+		goto error;
+
+	nvme_configure_metadata(ctrl, info);
+
+	/*
+	 * Only set the DEAC bit if the device guarantees that reads from
+	 * deallocated data return zeroes.  While the DEAC bit does not
+	 * require that, it must be a no-op if reads from deallocated data
+	 * do not return zeroes.
+	 */
+	if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
+		info->features |= NVME_NS_DEAC;
+
 	info->anagrpid = id->anagrpid;
 	info->is_shared = id->nmic & NVME_NS_NMIC_SHARED;
 	info->is_readonly = id->nsattr & NVME_NS_ATTR_RO;
@@ -1773,116 +1916,6 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
 		a->csi == b->csi;
 }
 
-static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
-		struct nvme_id_ns *id)
-{
-	bool first = id->dps & NVME_NS_DPS_PI_FIRST;
-	unsigned lbaf = nvme_lbaf_index(id->flbas);
-	struct nvme_command c = { };
-	struct nvme_id_ns_nvm *nvm;
-	int ret = 0;
-	u32 elbaf;
-
-	head->pi_size = 0;
-	head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
-	if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
-		head->pi_size = sizeof(struct t10_pi_tuple);
-		head->guard_type = NVME_NVM_NS_16B_GUARD;
-		goto set_pi;
-	}
-
-	nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
-	if (!nvm)
-		return -ENOMEM;
-
-	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(head->ns_id);
-	c.identify.cns = NVME_ID_CNS_CS_NS;
-	c.identify.csi = NVME_CSI_NVM;
-
-	ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
-	if (ret)
-		goto free_data;
-
-	elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
-
-	/* no support for storage tag formats right now */
-	if (nvme_elbaf_sts(elbaf))
-		goto free_data;
-
-	head->guard_type = nvme_elbaf_guard_type(elbaf);
-	switch (head->guard_type) {
-	case NVME_NVM_NS_64B_GUARD:
-		head->pi_size = sizeof(struct crc64_pi_tuple);
-		break;
-	case NVME_NVM_NS_16B_GUARD:
-		head->pi_size = sizeof(struct t10_pi_tuple);
-		break;
-	default:
-		break;
-	}
-
-free_data:
-	kfree(nvm);
-set_pi:
-	if (head->pi_size && (first || head->ms == head->pi_size))
-		head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
-	else
-		head->pi_type = 0;
-
-	return ret;
-}
-
-static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
-		struct nvme_ns_head *head, struct nvme_id_ns *id)
-{
-	int ret;
-
-	ret = nvme_init_ms(ctrl, head, id);
-	if (ret)
-		return ret;
-
-	head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
-	if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
-		return 0;
-
-	if (ctrl->ops->flags & NVME_F_FABRICS) {
-		/*
-		 * The NVMe over Fabrics specification only supports metadata as
-		 * part of the extended data LBA.  We rely on HCA/HBA support to
-		 * remap the separate metadata buffer from the block layer.
-		 */
-		if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
-			return 0;
-
-		head->features |= NVME_NS_EXT_LBAS;
-
-		/*
-		 * The current fabrics transport drivers support namespace
-		 * metadata formats only if nvme_ns_has_pi() returns true.
-		 * Suppress support for all other formats so the namespace will
-		 * have a 0 capacity and not be usable through the block stack.
-		 *
-		 * Note, this check will need to be modified if any drivers
-		 * gain the ability to use other metadata formats.
-		 */
-		if (ctrl->max_integrity_segments && nvme_ns_has_pi(head))
-			head->features |= NVME_NS_METADATA_SUPPORTED;
-	} else {
-		/*
-		 * For PCIe controllers, we can't easily remap the separate
-		 * metadata buffer from the block layer and thus require a
-		 * separate metadata buffer for block layer metadata/PI support.
-		 * We allow extended LBAs for the passthrough interface, though.
-		 */
-		if (id->flbas & NVME_NS_FLBAS_META_EXT)
-			head->features |= NVME_NS_EXT_LBAS;
-		else
-			head->features |= NVME_NS_METADATA_SUPPORTED;
-	}
-	return 0;
-}
-
 static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
 		struct request_queue *q)
 {
@@ -2058,15 +2091,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 
 	blk_mq_freeze_queue(ns->disk->queue);
 	lbaf = nvme_lbaf_index(id->flbas);
-	ns->head->lba_shift = id->lbaf[lbaf].ds;
-	ns->head->nuse = le64_to_cpu(id->nuse);
-	nvme_set_queue_limits(ns->ctrl, ns->queue);
 
-	ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
-	if (ret < 0) {
+	if (info->flbas != id->flbas || info->ds != id->lbaf[lbaf].ds) {
 		blk_mq_unfreeze_queue(ns->disk->queue);
-		goto out;
+		ret = -EINVAL;
+		goto error;
 	}
+
+	nvme_set_queue_limits(ns->ctrl, ns->queue);
 	nvme_set_chunk_sectors(ns, id);
 	nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
 
@@ -2078,14 +2110,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
 		}
 	}
 
-	/*
-	 * Only set the DEAC bit if the device guarantees that reads from
-	 * deallocated data return zeroes.  While the DEAC bit does not
-	 * require that, it must be a no-op if reads from deallocated data
-	 * do not return zeroes.
-	 */
-	if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
-		ns->head->features |= NVME_NS_DEAC;
 	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
 	set_bit(NVME_NS_READY, &ns->flags);
 	blk_mq_unfreeze_queue(ns->disk->queue);
@@ -3294,8 +3318,18 @@ static const struct file_operations nvme_dev_fops = {
 	.uring_cmd	= nvme_dev_uring_cmd,
 };
 
+static bool nvme_head_is_same_metadata(struct nvme_ns_head *h,
+		struct nvme_ns_info *info)
+{
+	unsigned int mask = NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS;
+
+	return (h->features & mask) == (info->features & mask) &&
+	       h->pi_size == info->pi_size && h->pi_type == info->pi_type &&
+	       h->guard_type == info->guard_type && h->ms == info->ms;
+}
+
 static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
-		unsigned nsid)
+		struct nvme_ns_info *info)
 {
 	struct nvme_ns_head *h;
 
@@ -3307,7 +3341,11 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
 		 * In that case we can't use the same ns_head for namespaces
 		 * with the same NSID.
 		 */
-		if (h->ns_id != nsid || !nvme_is_unique_nsid(ctrl, h))
+		if (h->ns_id != info->nsid || !nvme_is_unique_nsid(ctrl, h))
+			continue;
+		if (!nvme_head_is_same_metadata(h, info))
+			continue;
+		if (h->lba_shift != info->ds)
 			continue;
 		if (!list_empty(&h->list) && nvme_tryget_ns_head(h))
 			return h;
@@ -3316,24 +3354,51 @@ static struct nvme_ns_head *nvme_find_ns_head(struct nvme_ctrl *ctrl,
 	return NULL;
 }
 
-static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
+static bool nvme_head_is_duplicate_ids(struct nvme_ns_head *h,
 		struct nvme_ns_ids *ids)
 {
 	bool has_uuid = !uuid_is_null(&ids->uuid);
 	bool has_nguid = memchr_inv(ids->nguid, 0, sizeof(ids->nguid));
 	bool has_eui64 = memchr_inv(ids->eui64, 0, sizeof(ids->eui64));
+
+	if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
+		return true;
+	if (has_nguid &&
+	    memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
+		return true;
+	if (has_eui64 &&
+	    memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
+		return true;
+
+	return false;
+}
+
+static int nvme_subsys_check_duplicate_ids(struct nvme_subsystem *subsys,
+		struct nvme_ns_ids *ids)
+{
 	struct nvme_ns_head *h;
 
 	lockdep_assert_held(&subsys->lock);
 
 	list_for_each_entry(h, &subsys->nsheads, entry) {
-		if (has_uuid && uuid_equal(&ids->uuid, &h->ids.uuid))
-			return -EINVAL;
-		if (has_nguid &&
-		    memcmp(&ids->nguid, &h->ids.nguid, sizeof(ids->nguid)) == 0)
+		if (nvme_head_is_duplicate_ids(h, ids))
 			return -EINVAL;
-		if (has_eui64 &&
-		    memcmp(&ids->eui64, &h->ids.eui64, sizeof(ids->eui64)) == 0)
+	}
+
+	return 0;
+}
+
+static int nvme_subsys_check_duplicate_info(struct nvme_subsystem *subsys,
+		struct nvme_ns_info *info)
+{
+	struct nvme_ns_head *h;
+
+	lockdep_assert_held(&subsys->lock);
+
+	list_for_each_entry(h, &subsys->nsheads, entry) {
+		if (nvme_head_is_same_metadata(h, info) &&
+		    nvme_head_is_duplicate_ids(h, &info->ids) &&
+		    h->lba_shift == info->ds)
 			return -EINVAL;
 	}
 
@@ -3432,6 +3497,13 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	head->subsys = ctrl->subsys;
 	head->ns_id = info->nsid;
 	head->ids = info->ids;
+	head->nuse = info->nuse;
+	head->pi_size = info->pi_size;
+	head->pi_type = info->pi_type;
+	head->guard_type = info->guard_type;
+	head->ms = info->ms;
+	head->lba_shift = info->ds;
+	head->features = info->features;
 	head->shared = info->is_shared;
 	ratelimit_state_init(&head->rs_nuse, 5 * HZ, 1);
 	ratelimit_set_flags(&head->rs_nuse, RATELIMIT_MSG_ON_RELEASE);
@@ -3536,12 +3608,12 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 	}
 
 	mutex_lock(&ctrl->subsys->lock);
-	head = nvme_find_ns_head(ctrl, info->nsid);
+	head = nvme_find_ns_head(ctrl, info);
 	if (!head) {
-		ret = nvme_subsys_check_duplicate_ids(ctrl->subsys, &info->ids);
+		ret = nvme_subsys_check_duplicate_info(ctrl->subsys, info);
 		if (ret) {
 			dev_err(ctrl->device,
-				"duplicate IDs in subsystem for nsid %d\n",
+				"duplicate NS Info in subsystem for nsid %d\n",
 				info->nsid);
 			goto out_unlock;
 		}
@@ -3565,6 +3637,21 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 			goto out_put_ns_head;
 		}
 
+		if (!nvme_head_is_same_metadata(head, info)) {
+			dev_err(ctrl->device,
+				"Metadata doesn't match for shared namespace %d\n",
+				info->nsid);
+			goto out_put_ns_head;
+		}
+
+		if (head->lba_shift != info->ds) {
+			dev_err(ctrl->device,
+				"LBA data size doesn't match for shared namespace %d\n",
+				info->nsid);
+			goto out_put_ns_head;
+
+		}
+
 		if (!multipath) {
 			dev_warn(ctrl->device,
 				"Found shared namespace %d, but multipathing not supported.\n",
@@ -3780,6 +3867,19 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 		goto out;
 	}
 
+	if (!nvme_head_is_same_metadata(ns->head, info)) {
+		dev_err(ns->ctrl->device,
+			"Metadata changed for nsid %d\n", ns->head->ns_id);
+			goto out;
+	}
+
+	if (ns->head->lba_shift != info->ds) {
+		dev_err(ns->ctrl->device,
+			"LBA data size changed for nsid %d\n",
+			ns->head->ns_id);
+		goto out;
+	}
+
 	ret = nvme_update_ns_info(ns, info);
 out:
 	/*
-- 
2.18.1