[PATCH 03/13] mpt3sas: Added support for nvme encapsulated request message.

Suganath Prabu S suganath-prabu.subramani at broadcom.com
Tue Oct 31 05:32:29 PDT 2017


* Mpt3sas driver uses the NVMe Encapsulated Request message to
send an NVMe command to an NVMe device attached to the IOC.

* Normal I/O commands like reads and writes are passed to the
controller as SCSI commands and the controller has the ability
to translate the commands to NVMe equivalent.

* This encapsulated NVMe command is used by applications to send
direct NVMe commands to NVMe drives.

Signed-off-by: Chaitra P B <chaitra.basappa at broadcom.com>
Signed-off-by: Suganath Prabu S <suganath-prabu.subramani at broadcom.com>
---
 drivers/scsi/mpt3sas/mpt3sas_base.c | 276 +++++++++++++++++++++++++++++++++++-
 drivers/scsi/mpt3sas/mpt3sas_base.h |   4 +
 drivers/scsi/mpt3sas/mpt3sas_ctl.c  |  69 ++++++++-
 3 files changed, 342 insertions(+), 7 deletions(-)

diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.c b/drivers/scsi/mpt3sas/mpt3sas_base.c
index 1ad3cbb..b0a75c6 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.c
@@ -557,6 +557,11 @@ _base_sas_ioc_info(struct MPT3SAS_ADAPTER *ioc, MPI2DefaultReply_t *mpi_reply,
 		frame_sz = sizeof(Mpi2SmpPassthroughRequest_t) + ioc->sge_size;
 		func_str = "smp_passthru";
 		break;
+	case MPI2_FUNCTION_NVME_ENCAPSULATED:
+		frame_sz = sizeof(Mpi26NVMeEncapsulatedRequest_t) +
+		    ioc->sge_size;
+		func_str = "nvme_encapsulated";
+		break;
 	default:
 		frame_sz = 32;
 		func_str = "unknown";
@@ -985,7 +990,9 @@ _base_interrupt(int irq, void *bus_id)
 		if (request_desript_type ==
 		    MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS ||
 		    request_desript_type ==
-		    MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS) {
+		    MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS ||
+		    request_desript_type ==
+		    MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS) {
 			cb_idx = _base_get_cb_idx(ioc, smid);
 			if ((likely(cb_idx < MPT_MAX_CALLBACKS)) &&
 			    (likely(mpt_callbacks[cb_idx] != NULL))) {
@@ -1345,6 +1352,225 @@ _base_build_sg(struct MPT3SAS_ADAPTER *ioc, void *psge,
 	}
 }
 
+/* IEEE format sgls */
+
+/**
+ * _base_build_nvme_prp - This function is called for NVMe end devices to build
+ * a native SGL (NVMe PRP). The native SGL is built starting in the first PRP
+ * entry of the NVMe message (PRP1).  If the data buffer is small enough to be
+ * described entirely using PRP1, then PRP2 is not used.  If needed, PRP2 is
+ * used to describe a larger data buffer.  If the data buffer is too large to
+ * describe using the two PRP entriess inside the NVMe message, then PRP1
+ * describes the first data memory segment, and PRP2 contains a pointer to a PRP
+ * list located elsewhere in memory to describe the remaining data memory
+ * segments.  The PRP list will be contiguous.
+
+ * The native SGL for NVMe devices is a Physical Region Page (PRP).  A PRP
+ * consists of a list of PRP entries to describe a number of noncontigous
+ * physical memory segments as a single memory buffer, just as a SGL does.  Note
+ * however, that this function is only used by the IOCTL call, so the memory
+ * given will be guaranteed to be contiguous.  There is no need to translate
+ * non-contiguous SGL into a PRP in this case.  All PRPs will describe
+ * contiguous space that is one page size each.
+ *
+ * Each NVMe message contains two PRP entries.  The first (PRP1) either contains
+ * a PRP list pointer or a PRP element, depending upon the command.  PRP2
+ * contains the second PRP element if the memory being described fits within 2
+ * PRP entries, or a PRP list pointer if the PRP spans more than two entries.
+ *
+ * A PRP list pointer contains the address of a PRP list, structured as a linear
+ * array of PRP entries.  Each PRP entry in this list describes a segment of
+ * physical memory.
+ *
+ * Each 64-bit PRP entry comprises an address and an offset field.  The address
+ * always points at the beginning of a 4KB physical memory page, and the offset
+ * describes where within that 4KB page the memory segment begins.  Only the
+ * first element in a PRP list may contain a non-zero offest, implying that all
+ * memory segments following the first begin at the start of a 4KB page.
+ *
+ * Each PRP element normally describes 4KB of physical memory, with exceptions
+ * for the first and last elements in the list.  If the memory being described
+ * by the list begins at a non-zero offset within the first 4KB page, then the
+ * first PRP element will contain a non-zero offset indicating where the region
+ * begins within the 4KB page.  The last memory segment may end before the end
+ * of the 4KB segment, depending upon the overall size of the memory being
+ * described by the PRP list.
+ *
+ * Since PRP entries lack any indication of size, the overall data buffer length
+ * is used to determine where the end of the data memory buffer is located, and
+ * how many PRP entries are required to describe it.
+ *
+ * @ioc: per adapter object
+ * @smid: system request message index for getting asscociated SGL
+ * @nvme_encap_request: the NVMe request msg frame pointer
+ * @data_out_dma: physical address for WRITES
+ * @data_out_sz: data xfer size for WRITES
+ * @data_in_dma: physical address for READS
+ * @data_in_sz: data xfer size for READS
+ *
+ * Returns nothing.
+ */
+static void
+_base_build_nvme_prp(struct MPT3SAS_ADAPTER *ioc, u16 smid,
+	Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request,
+	dma_addr_t data_out_dma, size_t data_out_sz, dma_addr_t data_in_dma,
+	size_t data_in_sz)
+{
+	int		prp_size = NVME_PRP_SIZE;
+	u64		*prp_entry, *prp1_entry, *prp2_entry, *prp_entry_phys;
+	u64		*prp_page, *prp_page_phys;
+	u32		offset, entry_len;
+	u32		page_mask_result, page_mask;
+	dma_addr_t	paddr;
+	size_t		length;
+
+	/*
+	 * Not all commands require a data transfer. If no data, just return
+	 * without constructing any PRP.
+	 */
+	if (!data_in_sz && !data_out_sz)
+		return;
+	/*
+	 * Set pointers to PRP1 and PRP2, which are in the NVMe command.
+	 * PRP1 is located at a 24 byte offset from the start of the NVMe
+	 * command.  Then set the current PRP entry pointer to PRP1.
+	 */
+	prp1_entry = (u64 *)(nvme_encap_request->NVMe_Command +
+	    NVME_CMD_PRP1_OFFSET);
+	prp2_entry = (u64 *)(nvme_encap_request->NVMe_Command +
+	    NVME_CMD_PRP2_OFFSET);
+	prp_entry = prp1_entry;
+	/*
+	 * For the PRP entries, use the specially allocated buffer of
+	 * contiguous memory.
+	 */
+	prp_page = (u64 *)mpt3sas_base_get_pcie_sgl(ioc, smid);
+	prp_page_phys = (u64 *)mpt3sas_base_get_pcie_sgl_dma(ioc, smid);
+
+	/*
+	 * Check if we are within 1 entry of a page boundary we don't
+	 * want our first entry to be a PRP List entry.
+	 */
+	page_mask = ioc->page_size - 1;
+	page_mask_result = (uintptr_t)((u8 *)prp_page + prp_size) & page_mask;
+	if (!page_mask_result) {
+		/* Bump up to next page boundary. */
+		prp_page = (u64 *)((u8 *)prp_page + prp_size);
+		prp_page_phys = (u64 *)((u8 *)prp_page_phys + prp_size);
+	}
+
+	/*
+	 * Set PRP physical pointer, which initially points to the current PRP
+	 * DMA memory page.
+	 */
+	prp_entry_phys = prp_page_phys;
+
+	/* Get physical address and length of the data buffer. */
+	if (data_in_sz) {
+		paddr = data_in_dma;
+		length = data_in_sz;
+	} else {
+		paddr = data_out_dma;
+		length = data_out_sz;
+	}
+
+	/* Loop while the length is not zero. */
+	while (length) {
+		/*
+		 * Check if we need to put a list pointer here if we are at
+		 * page boundary - prp_size (8 bytes).
+		 */
+		page_mask_result =
+		    (uintptr_t)((u8 *)prp_entry_phys + prp_size) & page_mask;
+		if (!page_mask_result) {
+			/*
+			 * This is the last entry in a PRP List, so we need to
+			 * put a PRP list pointer here.  What this does is:
+			 *   - bump the current memory pointer to the next
+			 *     address, which will be the next full page.
+			 *   - set the PRP Entry to point to that page.  This
+			 *     is now the PRP List pointer.
+			 *   - bump the PRP Entry pointer the start of the
+			 *     next page.  Since all of this PRP memory is
+			 *     contiguous, no need to get a new page - it's
+			 *     just the next address.
+			 */
+			prp_entry_phys++;
+			*prp_entry = cpu_to_le64((uintptr_t)prp_entry_phys);
+			prp_entry++;
+		}
+
+		/* Need to handle if entry will be part of a page. */
+		offset = (u32)paddr & page_mask;
+		entry_len = ioc->page_size - offset;
+
+		if (prp_entry == prp1_entry) {
+			/*
+			 * Must fill in the first PRP pointer (PRP1) before
+			 * moving on.
+			 */
+			*prp1_entry = cpu_to_le64((u64)paddr);
+
+			/*
+			 * Now point to the second PRP entry within the
+			 * command (PRP2).
+			 */
+			prp_entry = prp2_entry;
+		} else if (prp_entry == prp2_entry) {
+			/*
+			 * Should the PRP2 entry be a PRP List pointer or just
+			 * a regular PRP pointer?  If there is more than one
+			 * more page of data, must use a PRP List pointer.
+			 */
+			if (length > ioc->page_size) {
+				/*
+				 * PRP2 will contain a PRP List pointer because
+				 * more PRP's are needed with this command. The
+				 * list will start at the beginning of the
+				 * contiguous buffer.
+				 */
+				*prp2_entry =
+				    cpu_to_le64((uintptr_t)prp_entry_phys);
+
+				/*
+				 * The next PRP Entry will be the start of the
+				 * first PRP List.
+				 */
+				prp_entry = prp_page;
+			} else {
+				/*
+				 * After this, the PRP Entries are complete.
+				 * This command uses 2 PRP's and no PRP list.
+				 */
+				*prp2_entry = cpu_to_le64((u64)paddr);
+			}
+		} else {
+			/*
+			 * Put entry in list and bump the addresses.
+			 *
+			 * After PRP1 and PRP2 are filled in, this will fill in
+			 * all remaining PRP entries in a PRP List, one per
+			 * each time through the loop.
+			 */
+			*prp_entry = cpu_to_le64((u64)paddr);
+			prp_entry++;
+			prp_entry_phys++;
+		}
+
+		/*
+		 * Bump the phys address of the command's data buffer by the
+		 * entry_len.
+		 */
+		paddr += entry_len;
+
+		/* Decrement length accounting for last partial page. */
+		if (entry_len > length)
+			length = 0;
+		else
+			length -= entry_len;
+	}
+}
+
 /**
  * base_make_prp_nvme -
  * Prepare PRPs(Physical Region Page)- SGLs specific to NVMe drives only
@@ -2794,6 +3020,30 @@ _base_put_smid_hi_priority(struct MPT3SAS_ADAPTER *ioc, u16 smid,
 }
 
 /**
+ * _base_put_smid_nvme_encap - send NVMe encapsulated request to
+ *  firmware
+ * @ioc: per adapter object
+ * @smid: system request message index
+ *
+ * Return nothing.
+ */
+static void
+_base_put_smid_nvme_encap(struct MPT3SAS_ADAPTER *ioc, u16 smid)
+{
+	Mpi2RequestDescriptorUnion_t descriptor;
+	u64 *request = (u64 *)&descriptor;
+
+	descriptor.Default.RequestFlags =
+		MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED;
+	descriptor.Default.MSIxIndex =  _base_get_msix_index(ioc);
+	descriptor.Default.SMID = cpu_to_le16(smid);
+	descriptor.Default.LMID = 0;
+	descriptor.Default.DescriptorTypeDependent = 0;
+	_base_writeq(*request, &ioc->chip->RequestDescriptorPostLow,
+	    &ioc->scsi_lookup_lock);
+}
+
+/**
  * _base_put_smid_default - Default, primarily used for config pages
  * @ioc: per adapter object
  * @smid: system request message index
@@ -2884,6 +3134,27 @@ _base_put_smid_hi_priority_atomic(struct MPT3SAS_ADAPTER *ioc, u16 smid,
 }
 
 /**
+ * _base_put_smid_nvme_encap_atomic - send NVMe encapsulated request to
+ *   firmware using Atomic Request Descriptor
+ * @ioc: per adapter object
+ * @smid: system request message index
+ *
+ * Return nothing.
+ */
+static void
+_base_put_smid_nvme_encap_atomic(struct MPT3SAS_ADAPTER *ioc, u16 smid)
+{
+	Mpi26AtomicRequestDescriptor_t descriptor;
+	u32 *request = (u32 *)&descriptor;
+
+	descriptor.RequestFlags = MPI26_REQ_DESCRIPT_FLAGS_PCIE_ENCAPSULATED;
+	descriptor.MSIxIndex = _base_get_msix_index(ioc);
+	descriptor.SMID = cpu_to_le16(smid);
+
+	writel(cpu_to_le32(*request), &ioc->chip->AtomicRequestDescriptorPost);
+}
+
+/**
  * _base_put_smid_default - Default, primarily used for config pages
  * use Atomic Request Descriptor
  * @ioc: per adapter object
@@ -5707,6 +5978,7 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
 		 */
 		ioc->build_sg_scmd = &_base_build_sg_scmd_ieee;
 		ioc->build_sg = &_base_build_sg_ieee;
+		ioc->build_nvme_prp = &_base_build_nvme_prp;
 		ioc->build_zero_len_sge = &_base_build_zero_len_sge_ieee;
 		ioc->sge_size_ieee = sizeof(Mpi2IeeeSgeSimple64_t);
 
@@ -5718,11 +5990,13 @@ mpt3sas_base_attach(struct MPT3SAS_ADAPTER *ioc)
 		ioc->put_smid_scsi_io = &_base_put_smid_scsi_io_atomic;
 		ioc->put_smid_fast_path = &_base_put_smid_fast_path_atomic;
 		ioc->put_smid_hi_priority = &_base_put_smid_hi_priority_atomic;
+		ioc->put_smid_nvme_encap = &_base_put_smid_nvme_encap_atomic;
 	} else {
 		ioc->put_smid_default = &_base_put_smid_default;
 		ioc->put_smid_scsi_io = &_base_put_smid_scsi_io;
 		ioc->put_smid_fast_path = &_base_put_smid_fast_path;
 		ioc->put_smid_hi_priority = &_base_put_smid_hi_priority;
+		ioc->put_smid_nvme_encap = &_base_put_smid_nvme_encap;
 	}
 
 
diff --git a/drivers/scsi/mpt3sas/mpt3sas_base.h b/drivers/scsi/mpt3sas/mpt3sas_base.h
index 4758729..034b34d 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_base.h
+++ b/drivers/scsi/mpt3sas/mpt3sas_base.h
@@ -1184,6 +1184,9 @@ struct MPT3SAS_ADAPTER {
 	MPT_BUILD_SG    build_sg_mpi;
 	MPT_BUILD_ZERO_LEN_SGE build_zero_len_sge_mpi;
 
+	/* function ptr for NVMe PRP elements only */
+	NVME_BUILD_PRP  build_nvme_prp;
+
 	/* event log */
 	u32		event_type[MPI2_EVENT_NOTIFY_EVENTMASK_WORDS];
 	u32		event_context;
@@ -1354,6 +1357,7 @@ struct MPT3SAS_ADAPTER {
 	PUT_SMID_IO_FP_HIP put_smid_fast_path;
 	PUT_SMID_IO_FP_HIP put_smid_hi_priority;
 	PUT_SMID_DEFAULT put_smid_default;
+	PUT_SMID_DEFAULT put_smid_nvme_encap;
 
 };
 
diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
index 67c7280..205c443 100644
--- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c
+++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c
@@ -272,6 +272,7 @@ mpt3sas_ctl_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 {
 	MPI2DefaultReply_t *mpi_reply;
 	Mpi2SCSIIOReply_t *scsiio_reply;
+	Mpi26NVMeEncapsulatedErrorReply_t *nvme_error_reply;
 	const void *sense_data;
 	u32 sz;
 
@@ -298,6 +299,18 @@ mpt3sas_ctl_done(struct MPT3SAS_ADAPTER *ioc, u16 smid, u8 msix_index,
 				memcpy(ioc->ctl_cmds.sense, sense_data, sz);
 			}
 		}
+		/*
+		 * Get Error Response data for NVMe device. The ctl_cmds.sense
+		 * buffer is used to store the Error Response data.
+		 */
+		if (mpi_reply->Function == MPI2_FUNCTION_NVME_ENCAPSULATED) {
+			nvme_error_reply =
+			    (Mpi26NVMeEncapsulatedErrorReply_t *)mpi_reply;
+			sz = min_t(u32, NVME_ERROR_RESPONSE_SIZE,
+			    le32_to_cpu(nvme_error_reply->ErrorResponseCount));
+			sense_data = mpt3sas_base_get_sense_buffer(ioc, smid);
+			memcpy(ioc->ctl_cmds.sense, sense_data, sz);
+		}
 	}
 
 	_ctl_display_some_debug(ioc, smid, "ctl_done", mpi_reply);
@@ -641,11 +654,12 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg,
 {
 	MPI2RequestHeader_t *mpi_request = NULL, *request;
 	MPI2DefaultReply_t *mpi_reply;
+	Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request = NULL;
 	u32 ioc_state;
 	u16 smid;
 	unsigned long timeout;
 	u8 issue_reset;
-	u32 sz;
+	u32 sz, sz_arg;
 	void *psge;
 	void *data_out = NULL;
 	dma_addr_t data_out_dma = 0;
@@ -742,7 +756,8 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg,
 	if (mpi_request->Function == MPI2_FUNCTION_SCSI_IO_REQUEST ||
 	    mpi_request->Function == MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH ||
 	    mpi_request->Function == MPI2_FUNCTION_SCSI_TASK_MGMT ||
-	    mpi_request->Function == MPI2_FUNCTION_SATA_PASSTHROUGH) {
+	    mpi_request->Function == MPI2_FUNCTION_SATA_PASSTHROUGH ||
+	    mpi_request->Function == MPI2_FUNCTION_NVME_ENCAPSULATED) {
 
 		device_handle = le16_to_cpu(mpi_request->FunctionDependent1);
 		if (!device_handle || (device_handle >
@@ -793,6 +808,38 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg,
 
 	init_completion(&ioc->ctl_cmds.done);
 	switch (mpi_request->Function) {
+	case MPI2_FUNCTION_NVME_ENCAPSULATED:
+	{
+		nvme_encap_request = (Mpi26NVMeEncapsulatedRequest_t *)request;
+		/*
+		 * Get the Physical Address of the sense buffer.
+		 * Use Error Response buffer address field to hold the sense
+		 * buffer address.
+		 * Clear the internal sense buffer, which will potentially hold
+		 * the Completion Queue Entry on return, or 0 if no Entry.
+		 * Build the PRPs and set direction bits.
+		 * Send the request.
+		 */
+		nvme_encap_request->ErrorResponseBaseAddress = ioc->sense_dma &
+		    0xFFFFFFFF00000000;
+		nvme_encap_request->ErrorResponseBaseAddress |=
+		    (U64)mpt3sas_base_get_sense_buffer_dma(ioc, smid);
+		nvme_encap_request->ErrorResponseAllocationLength =
+						NVME_ERROR_RESPONSE_SIZE;
+		memset(ioc->ctl_cmds.sense, 0, NVME_ERROR_RESPONSE_SIZE);
+		ioc->build_nvme_prp(ioc, smid, nvme_encap_request,
+		    data_out_dma, data_out_sz, data_in_dma, data_in_sz);
+		if (test_bit(device_handle, ioc->device_remove_in_progress)) {
+			dtmprintk(ioc, pr_info(MPT3SAS_FMT "handle(0x%04x) :"
+			    "ioctl failed due to device removal in progress\n",
+			    ioc->name, device_handle));
+			mpt3sas_base_free_smid(ioc, smid);
+			ret = -EINVAL;
+			goto out;
+		}
+		ioc->put_smid_nvme_encap(ioc, smid);
+		break;
+	}
 	case MPI2_FUNCTION_SCSI_IO_REQUEST:
 	case MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH:
 	{
@@ -1008,15 +1055,25 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg,
 		}
 	}
 
-	/* copy out sense to user */
+	/* copy out sense/NVMe Error Response to user */
 	if (karg.max_sense_bytes && (mpi_request->Function ==
 	    MPI2_FUNCTION_SCSI_IO_REQUEST || mpi_request->Function ==
-	    MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH)) {
-		sz = min_t(u32, karg.max_sense_bytes, SCSI_SENSE_BUFFERSIZE);
+	    MPI2_FUNCTION_RAID_SCSI_IO_PASSTHROUGH || mpi_request->Function ==
+	    MPI2_FUNCTION_NVME_ENCAPSULATED)) {
+		if (karg.sense_data_ptr == NULL) {
+			pr_info(MPT3SAS_FMT "Response buffer provided"
+			    " by application is NULL; Response data will"
+			    " not be returned.\n", ioc->name);
+			goto out;
+		}
+		sz_arg = (mpi_request->Function ==
+		MPI2_FUNCTION_NVME_ENCAPSULATED) ? NVME_ERROR_RESPONSE_SIZE :
+							SCSI_SENSE_BUFFERSIZE;
+		sz = min_t(u32, karg.max_sense_bytes, sz_arg);
 		if (copy_to_user(karg.sense_data_ptr, ioc->ctl_cmds.sense,
 		    sz)) {
 			pr_err("failure at %s:%d/%s()!\n", __FILE__,
-			    __LINE__, __func__);
+				__LINE__, __func__);
 			ret = -ENODATA;
 			goto out;
 		}
-- 
2.5.5




More information about the Linux-nvme mailing list