[PATCH 12/19] nvme-multipath: add PR support for libmultipath

John Garry john.g.garry at oracle.com
Wed Feb 25 07:40:00 PST 2026


Add PR support for libmultipath with the addition of the
nvme_mpath_pr_ops structure.

The callbacks here pass mpath_device pointers. These can be converted to
an NS pointer. However, the current PR callbacks for nvme_pr_ops are
passed a bdev, and that is what lets us figure out whether we are
working with a multipath head or an NS. Later the send command helpers
can be changed to work per NS, when the full change to libmultipath
happens. Until then, have separate per-NS command send helpers. The
original PR callback functions from nvme_pr_ops can also be refactored
to use the new NS-based callbacks then, reducing duplication.

The new NS-based helpers are marked as __maybe_unused until the switch
to libmultipath happens.

Signed-off-by: John Garry <john.g.garry at oracle.com>
---
 drivers/nvme/host/multipath.c |   1 +
 drivers/nvme/host/nvme.h      |   1 +
 drivers/nvme/host/pr.c        | 314 ++++++++++++++++++++++++++++++++++
 3 files changed, 316 insertions(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 6cadbc0449d3d..ac75db92dd124 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1501,6 +1501,7 @@ static const struct mpath_head_template mpdt = {
 	.get_access_state = nvme_mpath_get_access_state,
 	.bdev_ioctl = nvme_mpath_bdev_ioctl,
 	.cdev_ioctl = nvme_mpath_cdev_ioctl,
+	.pr_ops = &nvme_mpath_pr_ops,
 	.chr_uring_cmd = nvme_mpath_chr_uring_cmd,
 	.chr_uring_cmd_iopoll = nvme_ns_chr_uring_cmd_iopoll,
 	.get_iopolicy = nvme_mpath_get_iopolicy,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index da9bd1ada6ad6..619d2fff969e3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -22,6 +22,7 @@
 #include <trace/events/block.h>
 
 extern const struct pr_ops nvme_pr_ops;
+extern const struct mpath_pr_ops nvme_mpath_pr_ops;
 
 extern unsigned int nvme_io_timeout;
 #define NVME_IO_TIMEOUT	(nvme_io_timeout * HZ)
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index ad2ecc2f49a97..fd5a9f309a56f 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -116,6 +116,51 @@ static int nvme_send_pr_command(struct block_device *bdev, u32 cdw10, u32 cdw11,
 	return ret < 0 ? ret : nvme_status_to_pr_err(ret);
 }
 
+static int __nvme_send_pr_command_ns(struct nvme_ns *ns, u32 cdw10,
+		u32 cdw11, u8 op, void *data, unsigned int data_len)
+{
+	struct nvme_command c = { 0 };
+
+	c.common.opcode = op;
+	c.common.cdw10 = cpu_to_le32(cdw10);
+	c.common.cdw11 = cpu_to_le32(cdw11);
+
+	return nvme_send_ns_pr_command(ns, &c, data, data_len);
+}
+
+static int nvme_send_pr_command_ns(struct nvme_ns *ns, u32 cdw10, u32 cdw11,
+		u8 op, void *data, unsigned int data_len)
+{
+	int ret;
+
+	ret = __nvme_send_pr_command_ns(ns, cdw10, cdw11, op, data, data_len);
+	return ret < 0 ? ret : nvme_status_to_pr_err(ret);
+}
+
+__maybe_unused
+static int nvme_pr_register_ns(struct nvme_ns *ns, u64 old_key, u64 new_key,
+			u32 flags)
+{
+	struct nvmet_pr_register_data data = { 0 };
+	u32 cdw10;
+	int ret;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	data.crkey = cpu_to_le64(old_key);
+	data.nrkey = cpu_to_le64(new_key);
+
+	cdw10 = old_key ? NVME_PR_REGISTER_ACT_REPLACE :
+		NVME_PR_REGISTER_ACT_REG;
+	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0;
+	cdw10 |= NVME_PR_CPTPL_PERSIST;
+
+	ret = nvme_send_pr_command_ns(ns, cdw10, 0, nvme_cmd_resv_register,
+			&data, sizeof(data));
+	return ret;
+}
+
 static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
 		unsigned int flags)
 {
@@ -137,6 +182,26 @@ static int nvme_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
 			&data, sizeof(data));
 }
 
+__maybe_unused
+static int nvme_pr_reserve_ns(struct nvme_ns *ns, u64 key, enum pr_type type,
+		u32 flags)
+{
+	struct nvmet_pr_acquire_data data = { 0 };
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	data.crkey = cpu_to_le64(key);
+
+	cdw10 = NVME_PR_ACQUIRE_ACT_ACQUIRE;
+	cdw10 |= nvme_pr_type_from_blk(type) << 8;
+	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? NVME_PR_IGNORE_KEY : 0;
+
+	return nvme_send_pr_command_ns(ns, cdw10, 0, nvme_cmd_resv_acquire,
+			&data, sizeof(data));
+}
+
 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 		enum pr_type type, unsigned flags)
 {
@@ -156,6 +221,24 @@ static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 			&data, sizeof(data));
 }
 
+__maybe_unused
+static int nvme_pr_preempt_ns(struct nvme_ns *ns, u64 old, u64 new,
+		enum pr_type type, bool abort)
+{
+	struct nvmet_pr_acquire_data data = { 0 };
+	u32 cdw10;
+
+	data.crkey = cpu_to_le64(old);
+	data.prkey = cpu_to_le64(new);
+
+	cdw10 = abort ? NVME_PR_ACQUIRE_ACT_PREEMPT_AND_ABORT :
+			NVME_PR_ACQUIRE_ACT_PREEMPT;
+	cdw10 |= nvme_pr_type_from_blk(type) << 8;
+
+	return nvme_send_pr_command_ns(ns, cdw10, 0, nvme_cmd_resv_acquire,
+			&data, sizeof(data));
+}
+
 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 		enum pr_type type, bool abort)
 {
@@ -173,6 +256,21 @@ static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
 			&data, sizeof(data));
 }
 
+__maybe_unused
+static int nvme_pr_clear_ns(struct nvme_ns *ns, u64 key)
+{
+	struct nvmet_pr_release_data data = { 0 };
+	u32 cdw10;
+
+	data.crkey = cpu_to_le64(key);
+
+	cdw10 = NVME_PR_RELEASE_ACT_CLEAR;
+	cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY;
+
+	return nvme_send_pr_command_ns(ns, cdw10, 0, nvme_cmd_resv_release,
+			&data, sizeof(data));
+}
+
 static int nvme_pr_clear(struct block_device *bdev, u64 key)
 {
 	struct nvmet_pr_release_data data = { 0 };
@@ -202,6 +300,45 @@ static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type
 			&data, sizeof(data));
 }
 
+__maybe_unused
+static int nvme_pr_release_ns(struct nvme_ns *ns, u64 key, enum pr_type type)
+{
+	struct nvmet_pr_release_data data = { 0 };
+	u32 cdw10;
+
+	data.crkey = cpu_to_le64(key);
+
+	cdw10 = NVME_PR_RELEASE_ACT_RELEASE;
+	cdw10 |= nvme_pr_type_from_blk(type) << 8;
+	cdw10 |= key ? 0 : NVME_PR_IGNORE_KEY;
+
+	return nvme_send_pr_command_ns(ns, cdw10, 0, nvme_cmd_resv_release,
+			&data, sizeof(data));
+}
+
+static int nvme_mpath_pr_resv_report_ns(struct nvme_ns *ns, void *data,
+		u32 data_len, bool *eds)
+{
+	u32 cdw10, cdw11;
+	int ret;
+
+	cdw10 = nvme_bytes_to_numd(data_len);
+	cdw11 = NVME_EXTENDED_DATA_STRUCT;
+	*eds = true;
+
+retry:
+	ret = __nvme_send_pr_command_ns(ns, cdw10, cdw11, nvme_cmd_resv_report,
+			data, data_len);
+	if (ret == NVME_SC_HOST_ID_INCONSIST &&
+	    cdw11 == NVME_EXTENDED_DATA_STRUCT) {
+		cdw11 = 0;
+		*eds = false;
+		goto retry;
+	}
+
+	return ret < 0 ? ret : nvme_status_to_pr_err(ret);
+}
+
 static int nvme_pr_resv_report(struct block_device *bdev, void *data,
 		u32 data_len, bool *eds)
 {
@@ -225,6 +362,52 @@ static int nvme_pr_resv_report(struct block_device *bdev, void *data,
 	return ret < 0 ? ret : nvme_status_to_pr_err(ret);
 }
 
+__maybe_unused
+static int nvme_pr_read_keys_ns(struct nvme_ns *ns, struct pr_keys *keys_info)
+{
+	size_t rse_len;
+	u32 num_keys = keys_info->num_keys;
+	struct nvme_reservation_status_ext *rse;
+	int ret, i;
+	bool eds;
+
+	/*
+	 * Assume we are using 128-bit host IDs and allocate a buffer large
+	 * enough to get enough keys to fill the return keys buffer.
+	 */
+	rse_len = struct_size(rse, regctl_eds, num_keys);
+	if (rse_len > U32_MAX)
+		return -EINVAL;
+
+	rse = kzalloc(rse_len, GFP_KERNEL);
+	if (!rse)
+		return -ENOMEM;
+
+	ret = nvme_mpath_pr_resv_report_ns(ns, rse, rse_len, &eds);
+	if (ret)
+		goto free_rse;
+
+	keys_info->generation = le32_to_cpu(rse->gen);
+	keys_info->num_keys = get_unaligned_le16(&rse->regctl);
+
+	num_keys = min(num_keys, keys_info->num_keys);
+	for (i = 0; i < num_keys; i++) {
+		if (eds) {
+			keys_info->keys[i] =
+					le64_to_cpu(rse->regctl_eds[i].rkey);
+		} else {
+			struct nvme_reservation_status *rs;
+
+			rs = (struct nvme_reservation_status *)rse;
+			keys_info->keys[i] = le64_to_cpu(rs->regctl_ds[i].rkey);
+		}
+	}
+
+free_rse:
+	kfree(rse);
+	return ret;
+}
+
 static int nvme_pr_read_keys(struct block_device *bdev,
 		struct pr_keys *keys_info)
 {
@@ -271,6 +454,70 @@ static int nvme_pr_read_keys(struct block_device *bdev,
 	return ret;
 }
 
+__maybe_unused
+static int nvme_pr_read_reservation_ns(struct nvme_ns *ns,
+				  struct pr_held_reservation *resv)
+{
+	struct nvme_reservation_status_ext tmp_rse, *rse;
+	int ret, i, num_regs;
+	u32 rse_len;
+	bool eds;
+
+get_num_regs:
+	/*
+	 * Get the number of registrations so we know how big to allocate
+	 * the response buffer.
+	 */
+	ret = nvme_mpath_pr_resv_report_ns(ns, &tmp_rse, sizeof(tmp_rse),
+					&eds);
+	if (ret)
+		return ret;
+
+	num_regs = get_unaligned_le16(&tmp_rse.regctl);
+	if (!num_regs) {
+		resv->generation = le32_to_cpu(tmp_rse.gen);
+		return 0;
+	}
+
+	rse_len = struct_size(rse, regctl_eds, num_regs);
+	rse = kzalloc(rse_len, GFP_KERNEL);
+	if (!rse)
+		return -ENOMEM;
+
+	ret = nvme_mpath_pr_resv_report_ns(ns, rse, rse_len, &eds);
+	if (ret)
+		goto free_rse;
+
+	if (num_regs != get_unaligned_le16(&rse->regctl)) {
+		kfree(rse);
+		goto get_num_regs;
+	}
+
+	resv->generation = le32_to_cpu(rse->gen);
+	resv->type = block_pr_type_from_nvme(rse->rtype);
+
+	for (i = 0; i < num_regs; i++) {
+		if (eds) {
+			if (rse->regctl_eds[i].rcsts) {
+				resv->key = le64_to_cpu(rse->regctl_eds[i].rkey);
+				break;
+			}
+		} else {
+			struct nvme_reservation_status *rs;
+
+			rs = (struct nvme_reservation_status *)rse;
+			if (rs->regctl_ds[i].rcsts) {
+				resv->key = le64_to_cpu(rs->regctl_ds[i].rkey);
+				break;
+			}
+		}
+	}
+
+free_rse:
+	kfree(rse);
+	return ret;
+}
+
 static int nvme_pr_read_reservation(struct block_device *bdev,
 		struct pr_held_reservation *resv)
 {
@@ -333,6 +580,73 @@ static int nvme_pr_read_reservation(struct block_device *bdev,
 	return ret;
 }
 
+#if defined(CONFIG_NVME_MULTIPATH)
+static int nvme_mpath_pr_register(struct mpath_device *mpath_device,
+		u64 old_key, u64 new_key, unsigned int flags)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_register_ns(ns, old_key, new_key, flags);
+}
+
+static int nvme_mpath_pr_reserve(struct mpath_device *mpath_device, u64 key,
+		enum pr_type type, unsigned flags)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_reserve_ns(ns, key, type, flags);
+}
+
+static int nvme_mpath_pr_release(struct mpath_device *mpath_device, u64 key,
+		enum pr_type type)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_release_ns(ns, key, type);
+}
+
+static int nvme_mpath_pr_preempt(struct mpath_device *mpath_device, u64 old,
+		u64 new, enum pr_type type, bool abort)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_preempt_ns(ns, old, new, type, abort);
+}
+
+static int nvme_mpath_pr_clear(struct mpath_device *mpath_device, u64 key)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_clear_ns(ns, key);
+}
+
+static int nvme_mpath_pr_read_keys(struct mpath_device *mpath_device,
+		struct pr_keys *keys_info)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_read_keys_ns(ns, keys_info);
+}
+
+static int nvme_mpath_pr_read_reservation(struct mpath_device *mpath_device,
+		struct pr_held_reservation *resv)
+{
+	struct nvme_ns *ns = nvme_mpath_to_ns(mpath_device);
+
+	return nvme_pr_read_reservation_ns(ns, resv);
+}
+
+const struct mpath_pr_ops nvme_mpath_pr_ops = {
+	.pr_register	= nvme_mpath_pr_register,
+	.pr_reserve	= nvme_mpath_pr_reserve,
+	.pr_release	= nvme_mpath_pr_release,
+	.pr_preempt	= nvme_mpath_pr_preempt,
+	.pr_clear	= nvme_mpath_pr_clear,
+	.pr_read_keys	= nvme_mpath_pr_read_keys,
+	.pr_read_reservation = nvme_mpath_pr_read_reservation,
+};
+#endif
+
 const struct pr_ops nvme_pr_ops = {
 	.pr_register	= nvme_pr_register,
 	.pr_reserve	= nvme_pr_reserve,
-- 
2.43.5




More information about the Linux-nvme mailing list