[PATCH v3 12/19] block,nvme,scsi,dm: Add blk_status to pr_ops callouts

Mike Christie michael.christie at oracle.com
Sat Nov 5 11:36:18 PDT 2022


On 11/1/22 5:15 AM, Christoph Hellwig wrote:
> On Sun, Oct 30, 2022 at 06:05:35PM -0500, Mike Christie wrote:
>> The problem I hit is that in the ioctl code I then have to do:
>>
>> @@ -269,7 +270,14 @@ static int blkdev_pr_register(struct block_device *bdev,
>>  
>>  	if (reg.flags & ~PR_FL_IGNORE_KEY)
>>  		return -EOPNOTSUPP;
>> -	return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
>> +	ret = ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
>> +	if (ret == -EBADE) {
>> +		if (bdev_is_nvme(bdev))
>> +			ret = NVME_SC_RESERVATION_CONFLICT;
>> +		else if (bdev_is_scsi(bdev)
>> +			ret = SAM_STAT_RESERVATION_CONFLICT;
>> +	}
>> +	return ret;
> 
> Eww.  We should have never leaked protocol specific values out to
> userspace.  This is an original bug I introduceѕ, so all blame is on me.
> 
> I suspect the right way to fix is is to keep the numeric value of
> SAM_STAT_RESERVATION_CONFLICT and give it a new constant exposed in
> the uapi header, assuming that SCSI is the thing people actually
> used the PR API for, and nvme was just an nice little add-on.
> 

Do you mean just doing this:

diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index 727123c611e6..6ac70514170d 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -72,12 +72,17 @@ static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
 static int nvme_send_pr_command(struct block_device *bdev,
 		struct nvme_command *c, void *data, unsigned int data_len)
 {
+	int ret;
+
 	if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
 	    bdev->bd_disk->fops == &nvme_ns_head_ops)
-		return nvme_send_ns_head_pr_command(bdev, c, data, data_len);
+		ret = nvme_send_ns_head_pr_command(bdev, c, data, data_len);
 	else
-		return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c,
-					       data, data_len);
+		ret = nvme_send_ns_pr_command(bdev->bd_disk->private_data, c,
+					      data, data_len);
+	if (ret == NVME_SC_RESERVATION_CONFLICT)
+		ret = PR_STS_RESERVATION_CONFLICT;
+	return ret;
 }
 
 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c0a614efcfce..c7621d8ffa51 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1840,7 +1840,8 @@ static int sd_pr_out_command(struct block_device *bdev, u8 sa,
 	    scsi_sense_valid(&sshdr)) {
 		sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result);
 		scsi_print_sense_hdr(sdev, NULL, &sshdr);
-	}
+	} else if (__get_status_byte(result) == SAM_STAT_RESERVATION_CONFLICT)
+		result = PR_STS_RESERVATION_CONFLICT;
 
 	return result;
 }
diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h
index ccc78cbf1221..7a637f9e5b49 100644
--- a/include/uapi/linux/pr.h
+++ b/include/uapi/linux/pr.h
@@ -13,6 +13,15 @@ enum pr_type {
 	PR_EXCLUSIVE_ACCESS_ALL_REGS	= 6,
 };
 
+enum pr_status {
+	PR_STS_SUCCESS			= 0x0,
+	/*
+	 * The error codes are based on SCSI, because it was the primary user
+	 * and had userspace users.
+	 */
+	PR_STS_RESERVATION_CONFLICT	= 0x18,
+};
+
 struct pr_reservation {
 	__u64	key;
 	__u32	type;


---------------------------------------------------------------------------


Or we could add a new flag and make it nicer for the user in the future,
but also uglier for the drivers but a little less uglier than adding the
blk_sts field to the calls:



diff --git a/block/ioctl.c b/block/ioctl.c
index 60121e89052b..cae58f57ea13 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -269,7 +269,8 @@ static int blkdev_pr_register(struct block_device *bdev,
 
 	if (reg.flags & ~PR_FL_IGNORE_KEY)
 		return -EOPNOTSUPP;
-	return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
+	return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags,
+				reg.flags & PR_FL_PR_STAT);
 }
 
 static int blkdev_pr_reserve(struct block_device *bdev,
@@ -287,7 +288,8 @@ static int blkdev_pr_reserve(struct block_device *bdev,
 
 	if (rsv.flags & ~PR_FL_IGNORE_KEY)
 		return -EOPNOTSUPP;
-	return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags);
+	return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags,
+			       rsv.flags & PR_FL_PR_STAT)
 }
 
 static int blkdev_pr_release(struct block_device *bdev,
@@ -305,7 +307,8 @@ static int blkdev_pr_release(struct block_device *bdev,
 
 	if (rsv.flags)
 		return -EOPNOTSUPP;
-	return ops->pr_release(bdev, rsv.key, rsv.type);
+	return ops->pr_release(bdev, rsv.key, rsv.type,
+			       rsv.flags & PR_FL_PR_STAT);
 }
 
 static int blkdev_pr_preempt(struct block_device *bdev,
@@ -323,7 +326,8 @@ static int blkdev_pr_preempt(struct block_device *bdev,
 
 	if (p.flags)
 		return -EOPNOTSUPP;
-	return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort);
+	return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort,
+			       p.flags & PR_FL_PR_STAT)
 }
 
 static int blkdev_pr_clear(struct block_device *bdev,
@@ -341,7 +345,7 @@ static int blkdev_pr_clear(struct block_device *bdev,
 
 	if (c.flags)
 		return -EOPNOTSUPP;
-	return ops->pr_clear(bdev, c.key);
+	return ops->pr_clear(bdev, c.key, c.flags & PR_FL_PR_STAT);
 }
 
 static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
diff --git a/drivers/nvme/host/pr.c b/drivers/nvme/host/pr.c
index 727123c611e6..7c317b646cde 100644
--- a/drivers/nvme/host/pr.c
+++ b/drivers/nvme/host/pr.c
@@ -70,18 +70,44 @@ static int nvme_send_ns_pr_command(struct nvme_ns *ns, struct nvme_command *c,
 }
 
 static int nvme_send_pr_command(struct block_device *bdev,
-		struct nvme_command *c, void *data, unsigned int data_len)
+		struct nvme_command *c, void *data, unsigned int data_len,
+		bool use_pr_sts)
 {
+	int ret;
+
 	if (IS_ENABLED(CONFIG_NVME_MULTIPATH) &&
 	    bdev->bd_disk->fops == &nvme_ns_head_ops)
-		return nvme_send_ns_head_pr_command(bdev, c, data, data_len);
+		ret = nvme_send_ns_head_pr_command(bdev, c, data, data_len);
 	else
-		return nvme_send_ns_pr_command(bdev->bd_disk->private_data, c,
-					       data, data_len);
+		ret = nvme_send_ns_pr_command(bdev->bd_disk->private_data, c,
+					      data, data_len);
+	if (!ret || ret < 0 || (ret > 0 && !use_pr_sts))
+		return ret;
+
+	switch (ret) {
+	case NVME_SC_RESERVATION_CONFLICT:
+		ret = PR_STS_RESERVATION_CONFLICT;
+		break;
+	case NVME_SC_HOST_PATH_ERROR:
+		ret = PR_STS_PATH_FAILURE;
+		break
+	case NVME_SC_BAD_ATTRIBUTES:
+	case NVME_SC_ONCS_NOT_SUPPORTED:
+	case NVME_SC_INVALID_OPCODE:
+	case NVME_SC_INVALID_FIELD:
+	case NVME_SC_INVALID_NS:
+		ret = PR_STS_INVALID_OP;
+		break;
+	default:
+		ret = PR_STS_IOERR;
+		break;
+	}
+
+	return ret;
 }
 
 static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
-				u64 key, u64 sa_key, u8 op)
+				u64 key, u64 sa_key, u8 op, bool use_pr_sts)
 {
 	struct nvme_command c = { };
 	u8 data[16] = { 0, };
@@ -92,11 +118,11 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
 	c.common.opcode = op;
 	c.common.cdw10 = cpu_to_le32(cdw10);
 
-	return nvme_send_pr_command(bdev, &c, data, sizeof(data));
+	return nvme_send_pr_command(bdev, &c, data, sizeof(data), use_pr_sts);
 }
 
 static int nvme_pr_register(struct block_device *bdev, u64 old,
-		u64 new, unsigned flags)
+		u64 new, unsigned flags, bool use_pr_sts)
 {
 	u32 cdw10;
 
@@ -106,11 +132,12 @@ static int nvme_pr_register(struct block_device *bdev, u64 old,
 	cdw10 = old ? 2 : 0;
 	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
 	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
-	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register,
+			       use_pr_sts);
 }
 
 static int nvme_pr_reserve(struct block_device *bdev, u64 key,
-		enum pr_type type, unsigned flags)
+		enum pr_type type, unsigned flags, bool use_pr_sts)
 {
 	u32 cdw10;
 
@@ -119,29 +146,34 @@ static int nvme_pr_reserve(struct block_device *bdev, u64 key,
 
 	cdw10 = nvme_pr_type_from_blk(type) << 8;
 	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
-	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire,
+			       use_pr_sts);
 }
 
 static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
-		enum pr_type type, bool abort)
+		enum pr_type type, bool abort, bool use_pr_sts)
 {
 	u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (abort ? 2 : 1);
 
-	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire,
+			       use_pr_sts);
 }
 
-static int nvme_pr_clear(struct block_device *bdev, u64 key)
+static int nvme_pr_clear(struct block_device *bdev, u64 key, bool use_pr_sts)
 {
 	u32 cdw10 = 1 | (key ? 0 : 1 << 3);
 
-	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release,
+			       use_pr_sts);
 }
 
-static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+static int nvme_pr_release(struct block_device *bdev, u64 key,
+		enum pr_type type, bool use_pr_sts)
 {
 	u32 cdw10 = nvme_pr_type_from_blk(type) << 8 | (key ? 0 : 1 << 3);
 
-	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release,
+			       use_pr_sts);
 }
 
 static int nvme_pr_resv_report(struct block_device *bdev, void *data,
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index c0a614efcfce..46393bdd7427 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1797,7 +1797,8 @@ static int sd_pr_read_reservation(struct block_device *bdev,
 }
 
 static int sd_pr_out_command(struct block_device *bdev, u8 sa,
-		u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags)
+		u64 key, u64 sa_key, enum scsi_pr_type type, u8 flags,
+		bool use_pr_sts)
 {
 	struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
 	struct scsi_device *sdev = sdkp->device;
@@ -1842,44 +1843,74 @@ static int sd_pr_out_command(struct block_device *bdev, u8 sa,
 		scsi_print_sense_hdr(sdev, NULL, &sshdr);
 	}
 
+	if (!result || result < 0 || (result > 0 && !use_pr_sts))
+		return result;
+
+	result = PR_STS_IOERR;
+
+	switch host_byte(result) {
+	case DID_TRANSPORT_FAILFAST:
+	case DID_TRANSPORT_MARGINAL:
+	case DID_TRANSPORT_DISRUPTED:
+		result = PR_STS_PATH_FAILURE;
+		goto done;
+	}
+
+	switch (__get_status_byte(result)) {
+	case SAM_STAT_RESERVATION_CONFLICT:
+		result = PR_STS_RESERVATION_CONFLICT;
+		goto done;
+	case SAM_STAT_CHECK_CONDITION:
+		if (!scsi_sense_valid(&sshdr))
+			goto done;
+
+		if (sshdr.sense_key == ILLEGAL_REQUEST &&
+		    (sshdr.asc == 0x26 || sshdr.asc == 0x24)) {
+			result = PR_STS_INVALID_OP;
+			goto done;
+		}
+	}
+
+done:
 	return result;
 }
 
 static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
-		u32 flags)
+		u32 flags, bool use_pr_sts)
 {
 	if (flags & ~PR_FL_IGNORE_KEY)
 		return -EOPNOTSUPP;
 	return sd_pr_out_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00,
-			old_key, new_key, 0,
-			(1 << 0) /* APTPL */);
+				 old_key, new_key, 0, (1 << 0) /* APTPL */,
+				 use_pr_sts);
 }
 
 static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
-		u32 flags)
+		u32 flags, bool use_pr_sts)
 {
 	if (flags)
 		return -EOPNOTSUPP;
 	return sd_pr_out_command(bdev, 0x01, key, 0,
-				 block_pr_type_to_scsi(type), 0);
+				 block_pr_type_to_scsi(type), 0, use_pr_sts)
 }
 
-static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type,
+			 bool use_pr_sts)
 {
 	return sd_pr_out_command(bdev, 0x02, key, 0,
-				 block_pr_type_to_scsi(type), 0);
+				 block_pr_type_to_scsi(type), 0, use_pr_sts);
 }
 
 static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
-		enum pr_type type, bool abort)
+		enum pr_type type, bool abort, bool use_pr_sts)
 {
 	return sd_pr_out_command(bdev, abort ? 0x05 : 0x04, old_key, new_key,
-				 block_pr_type_to_scsi(type), 0);
+				 block_pr_type_to_scsi(type), 0, use_pr_sts);
 }
 
-static int sd_pr_clear(struct block_device *bdev, u64 key)
+static int sd_pr_clear(struct block_device *bdev, u64 key, bool use_pr_sts)
 {
-	return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0);
+	return sd_pr_out_command(bdev, 0x03, key, 0, 0, 0, use_pr_sts);
 }
 
 static const struct pr_ops sd_pr_ops = {
diff --git a/include/linux/pr.h b/include/linux/pr.h
index 3003daec28a5..51e03e73a87f 100644
--- a/include/linux/pr.h
+++ b/include/linux/pr.h
@@ -18,14 +18,14 @@ struct pr_held_reservation {
 
 struct pr_ops {
 	int (*pr_register)(struct block_device *bdev, u64 old_key, u64 new_key,
-			u32 flags);
+			u32 flags, bool use_pr_sts);
 	int (*pr_reserve)(struct block_device *bdev, u64 key,
-			enum pr_type type, u32 flags);
+			enum pr_type type, u32 flags, bool use_pr_sts);
 	int (*pr_release)(struct block_device *bdev, u64 key,
-			enum pr_type type);
+			enum pr_type type, bool use_pr_sts);
 	int (*pr_preempt)(struct block_device *bdev, u64 old_key, u64 new_key,
-			enum pr_type type, bool abort);
-	int (*pr_clear)(struct block_device *bdev, u64 key);
+			enum pr_type type, bool abort, bool use_pr_sts);
+	int (*pr_clear)(struct block_device *bdev, u64 key, bool use_pr_sts);
 	/*
 	 * pr_read_keys - Read the registered keys and return them in the
 	 * pr_keys->keys array. The keys array will have been allocated at the
diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h
index ccc78cbf1221..ac8f7fc404ff 100644
--- a/include/uapi/linux/pr.h
+++ b/include/uapi/linux/pr.h
@@ -13,6 +13,14 @@ enum pr_type {
 	PR_EXCLUSIVE_ACCESS_ALL_REGS	= 6,
 };
 
+enum pr_status {
+	PR_STS_SUCCESS,
+	PR_STS_IOERR,
+	PR_STS_RESERVATION_CONFLICT,
+	PR_STS_PATH_FAILURE,
+	PR_STS_INVALID_OP,
+};
+
 struct pr_reservation {
 	__u64	key;
 	__u32	type;
@@ -40,6 +48,7 @@ struct pr_clear {
 };
 
 #define PR_FL_IGNORE_KEY	(1 << 0)	/* ignore existing key */
+#define PR_FL_PR_STAT		(1 << 1)	/* Convert device errors to pr_status */
 
 #define IOC_PR_REGISTER		_IOW('p', 200, struct pr_registration)
 #define IOC_PR_RESERVE		_IOW('p', 201, struct pr_reservation)


Bart, if this is what you were suggesting I misunderstood you. I thought you wanted to
only pass the new code in the kernel so that's why I was saying we still have the problem
of converting the error back when passing it back to userspace.



More information about the Linux-nvme mailing list