[PATCH 9/9] nvme: implement multipath access to nvme subsystems

Tony Yang yyt168 at gmail.com
Tue Sep 19 17:18:41 PDT 2017


Hi Christoph,

     I applied this patch series and recompiled the kernel, but the build
failed with the errors below. The compiler can't find the blk_steal_bios()
function. What could be the reason for that? Hope to get your help, thank you.


[root at scst1 nvme-nvme-4.13]# make
  CHK     include/config/kernel.release
  CHK     include/generated/uapi/linux/version.h
  CHK     include/generated/utsrelease.h
  CHK     include/generated/bounds.h
  CHK     include/generated/timeconst.h
  CHK     include/generated/asm-offsets.h
  CALL    scripts/checksyscalls.sh
  CHK     scripts/mod/devicetable-offsets.h
  CHK     include/generated/compile.h
  CC [M]  drivers/nvme/host/core.o
drivers/nvme/host/core.c: In function 'nvme_failover_req':
drivers/nvme/host/core.c:115:2: error: implicit declaration of
function 'blk_steal_bios' [-Werror=implicit-function-declaration]
  blk_steal_bios(&ns->head->requeue_list, req);
  ^
In file included from drivers/nvme/host/core.c:32:0:
drivers/nvme/host/core.c: In function 'nvme_req_needs_failover':
drivers/nvme/host/nvme.h:100:25: error: 'REQ_DRV' undeclared (first
use in this function)
 #define REQ_NVME_MPATH  REQ_DRV
                         ^
drivers/nvme/host/core.c:151:25: note: in expansion of macro 'REQ_NVME_MPATH'
  if (!(req->cmd_flags & REQ_NVME_MPATH))
                         ^
drivers/nvme/host/nvme.h:100:25: note: each undeclared identifier is
reported only once for each function it appears in
 #define REQ_NVME_MPATH  REQ_DRV
                         ^
drivers/nvme/host/core.c:151:25: note: in expansion of macro 'REQ_NVME_MPATH'
  if (!(req->cmd_flags & REQ_NVME_MPATH))
                         ^
In file included from ./include/linux/byteorder/little_endian.h:4:0,
                 from ./arch/x86/include/uapi/asm/byteorder.h:4,
                 from ./include/asm-generic/bitops/le.h:5,
                 from ./arch/x86/include/asm/bitops.h:517,
                 from ./include/linux/bitops.h:36,
                 from ./include/linux/kernel.h:10,
                 from ./arch/x86/include/asm/percpu.h:44,
                 from ./arch/x86/include/asm/current.h:5,
                 from ./include/linux/sched.h:11,
                 from ./include/linux/blkdev.h:4,
                 from drivers/nvme/host/core.c:15:
drivers/nvme/host/core.c: In function 'nvme_toggle_streams':
drivers/nvme/host/core.c:433:33: error: 'NVME_NSID_ALL' undeclared
(first use in this function)
  c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
                                 ^
./include/uapi/linux/byteorder/little_endian.h:32:51: note: in
definition of macro '__cpu_to_le32'
 #define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
                                                   ^
drivers/nvme/host/core.c:433:21: note: in expansion of macro 'cpu_to_le32'
  c.directive.nsid = cpu_to_le32(NVME_NSID_ALL);
                     ^
drivers/nvme/host/core.c: In function 'nvme_configure_directives':
drivers/nvme/host/core.c:483:41: error: 'NVME_NSID_ALL' undeclared
(first use in this function)
  ret = nvme_get_stream_params(ctrl, &s, NVME_NSID_ALL);
                                         ^
drivers/nvme/host/core.c: In function 'nvme_submit_user_cmd':
drivers/nvme/host/core.c:770:6: error: 'struct bio' has no member
named 'bi_disk'
   bio->bi_disk = disk;
      ^
drivers/nvme/host/core.c: In function 'nvme_enable_ctrl':
drivers/nvme/host/core.c:1598:23: error: 'NVME_CC_AMS_RR' undeclared
(first use in this function)
  ctrl->ctrl_config |= NVME_CC_AMS_RR | NVME_CC_SHN_NONE;
                       ^
drivers/nvme/host/core.c: In function 'nvme_configure_timestamp':
drivers/nvme/host/core.c:1665:21: error: 'NVME_CTRL_ONCS_TIMESTAMP'
undeclared (first use in this function)
  if (!(ctrl->oncs & NVME_CTRL_ONCS_TIMESTAMP))
                     ^
drivers/nvme/host/core.c:1669:32: error: 'NVME_FEAT_TIMESTAMP'
undeclared (first use in this function)
  ret = nvme_set_features(ctrl, NVME_FEAT_TIMESTAMP, 0, &ts, sizeof(ts),
                                ^
In file included from ./include/linux/byteorder/little_endian.h:4:0,
                 from ./arch/x86/include/uapi/asm/byteorder.h:4,
                 from ./include/asm-generic/bitops/le.h:5,
                 from ./arch/x86/include/asm/bitops.h:517,
                 from ./include/linux/bitops.h:36,
                 from ./include/linux/kernel.h:10,
                 from ./arch/x86/include/asm/percpu.h:44,
                 from ./arch/x86/include/asm/current.h:5,
                 from ./include/linux/sched.h:11,
                 from ./include/linux/blkdev.h:4,
                 from drivers/nvme/host/core.c:15:
drivers/nvme/host/core.c: In function 'nvme_init_identify':
drivers/nvme/host/core.c:2116:33: error: 'struct nvme_id_ctrl' has no
member named 'hmminds'
   ctrl->hmminds = le32_to_cpu(id->hmminds);
                                 ^
./include/uapi/linux/byteorder/little_endian.h:33:51: note: in
definition of macro '__le32_to_cpu'
 #define __le32_to_cpu(x) ((__force __u32)(__le32)(x))
                                                   ^
drivers/nvme/host/core.c:2116:19: note: in expansion of macro 'le32_to_cpu'
   ctrl->hmminds = le32_to_cpu(id->hmminds);
                   ^
drivers/nvme/host/core.c:2117:32: error: 'struct nvme_id_ctrl' has no
member named 'hmmaxd'
   ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
                                ^
./include/uapi/linux/byteorder/little_endian.h:35:51: note: in
definition of macro '__le16_to_cpu'
 #define __le16_to_cpu(x) ((__force __u16)(__le16)(x))
                                                   ^
drivers/nvme/host/core.c:2117:18: note: in expansion of macro 'le16_to_cpu'
   ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
                  ^
drivers/nvme/host/core.c: In function 'nvme_make_request':
drivers/nvme/host/core.c:2535:6: error: 'struct bio' has no member
named 'bi_disk'
   bio->bi_disk = ns->disk;
      ^
In file included from drivers/nvme/host/core.c:32:0:
drivers/nvme/host/nvme.h:100:25: error: 'REQ_DRV' undeclared (first
use in this function)
 #define REQ_NVME_MPATH  REQ_DRV
                         ^
drivers/nvme/host/core.c:2536:18: note: in expansion of macro 'REQ_NVME_MPATH'
   bio->bi_opf |= REQ_NVME_MPATH;
                  ^
drivers/nvme/host/core.c:2537:3: error: implicit declaration of
function 'direct_make_request' [-Werror=implicit-function-declaration]
   ret = direct_make_request(bio);
   ^
drivers/nvme/host/core.c: In function 'nvme_requeue_work':
drivers/nvme/host/core.c:2577:6: error: 'struct bio' has no member
named 'bi_disk'
   bio->bi_disk = head->disk;
      ^
drivers/nvme/host/core.c: In function 'nvme_ctrl_pp_status':
drivers/nvme/host/core.c:3078:58: error: 'NVME_CSTS_PP' undeclared
(first use in this function)
  return ((ctrl->ctrl_config & NVME_CC_ENABLE) && (csts & NVME_CSTS_PP));
                                                          ^
drivers/nvme/host/core.c: In function 'nvme_get_fw_slot_info':
drivers/nvme/host/core.c:3086:23: error: dereferencing pointer to
incomplete type
  log = kmalloc(sizeof(*log), GFP_KERNEL);
                       ^
In file included from ./include/linux/byteorder/little_endian.h:4:0,
                 from ./arch/x86/include/uapi/asm/byteorder.h:4,
                 from ./include/asm-generic/bitops/le.h:5,
                 from ./arch/x86/include/asm/bitops.h:517,
                 from ./include/linux/bitops.h:36,
                 from ./include/linux/kernel.h:10,
                 from ./arch/x86/include/asm/percpu.h:44,
                 from ./arch/x86/include/asm/current.h:5,
                 from ./include/linux/sched.h:11,
                 from ./include/linux/blkdev.h:4,
                 from drivers/nvme/host/core.c:15:
drivers/nvme/host/core.c:3091:30: error: 'NVME_NSID_ALL' undeclared
(first use in this function)
  c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
                              ^
./include/uapi/linux/byteorder/little_endian.h:32:51: note: in
definition of macro '__cpu_to_le32'
 #define __cpu_to_le32(x) ((__force __le32)(__u32)(x))
                                                   ^
drivers/nvme/host/core.c:3091:18: note: in expansion of macro 'cpu_to_le32'
  c.common.nsid = cpu_to_le32(NVME_NSID_ALL);
                  ^
drivers/nvme/host/core.c:3092:65: error: dereferencing pointer to
incomplete type
  c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log));
                                                                 ^
drivers/nvme/host/core.c:3094:59: error: dereferencing pointer to
incomplete type
  if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log)))
                                                           ^
drivers/nvme/host/core.c: In function 'nvme_complete_async_event':
drivers/nvme/host/core.c:3159:7: error:
'NVME_AER_NOTICE_FW_ACT_STARTING' undeclared (first use in this
function)
  case NVME_AER_NOTICE_FW_ACT_STARTING:
       ^
drivers/nvme/host/core.c: In function 'nvme_ctrl_pp_status':
drivers/nvme/host/core.c:3079:1: warning: control reaches end of
non-void function [-Wreturn-type]
 }
 ^
cc1: some warnings being treated as errors
make[3]: *** [drivers/nvme/host/core.o] Error 1
make[2]: *** [drivers/nvme/host] Error 2
make[1]: *** [drivers/nvme] Error 2
make: *** [drivers] Error 2
[root at scst1 nvme-nvme-4.13]#
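
All of the undefined symbols above (blk_steal_bios(), direct_make_request(),
bio->bi_disk, REQ_DRV, NVME_NSID_ALL, ...) look like block-layer and NVMe core
infrastructure that this series expects to already be in the tree, so I guess a
plain 4.13-based tree without those prerequisite patches fails like this.
From the way nvme_failover_req() uses it, blk_steal_bios() presumably detaches
every bio from the failed request and appends it to a caller-owned bio_list so
the I/O can be resubmitted on another path later. A minimal sketch of that
behaviour, for reference only (my assumption, not the upstream implementation):

    /*
     * Sketch of what blk_steal_bios() is expected to do, inferred from its
     * use in nvme_failover_req(): move the request's bio chain onto a
     * caller-owned list so the bios can be requeued, and leave the request
     * itself empty.  Illustrative only.
     */
    void blk_steal_bios(struct bio_list *list, struct request *rq)
    {
            if (rq->bio) {
                    /* splice the request's bio chain onto the tail of the list */
                    if (list->tail)
                            list->tail->bi_next = rq->bio;
                    else
                            list->head = rq->bio;
                    list->tail = rq->biotail;

                    /* the request no longer owns any bios or data */
                    rq->bio = NULL;
                    rq->biotail = NULL;
            }
            rq->__data_len = 0;
    }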

2017-09-19 7:14 GMT+08:00 Christoph Hellwig <hch at lst.de>:
> This patch adds initial multipath support to the nvme driver.  For each
> namespace we create a new block device node, which can be used to access
> that namespace through any of the controllers that refer to it.
>
> Currently we will always send I/O to the first available path; this will
> be changed once the NVMe Asymmetric Namespace Access (ANA) TP is
> ratified and implemented, at which point we will look at the ANA state
> for each namespace.  Another possibility that was prototyped is to
> use the path that is closest to the submitting NUMA node, which will be
> mostly interesting for PCI, but might also be useful for RDMA or FC
> transports in the future.  There is no plan to implement round-robin
> or I/O service time path selectors, as those are not scalable with
> the performance rates provided by NVMe.
>
> The multipath device will go away once all paths to it disappear,
> any delay to keep it alive needs to be implemented at the controller
> level.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  drivers/nvme/host/core.c | 264 ++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/nvme/host/nvme.h |  11 ++
>  2 files changed, 259 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 3e8405fd57a9..5449c83a9dc3 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -77,6 +77,8 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
>  static LIST_HEAD(nvme_ctrl_list);
>  static DEFINE_SPINLOCK(dev_list_lock);
>
> +static DEFINE_IDA(nvme_disk_ida);
> +
>  static struct class *nvme_class;
>
>  static __le32 nvme_get_log_dw10(u8 lid, size_t size)
> @@ -104,6 +106,19 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
>         return ret;
>  }
>
> +static void nvme_failover_req(struct request *req)
> +{
> +       struct nvme_ns *ns = req->q->queuedata;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&ns->head->requeue_lock, flags);
> +       blk_steal_bios(&ns->head->requeue_list, req);
> +       spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> +
> +       nvme_reset_ctrl(ns->ctrl);
> +       kblockd_schedule_work(&ns->head->requeue_work);
> +}
> +
>  static blk_status_t nvme_error_status(struct request *req)
>  {
>         switch (nvme_req(req)->status & 0x7ff) {
> @@ -131,6 +146,53 @@ static blk_status_t nvme_error_status(struct request *req)
>         }
>  }
>
> +static bool nvme_req_needs_failover(struct request *req)
> +{
> +       if (!(req->cmd_flags & REQ_NVME_MPATH))
> +               return false;
> +
> +       switch (nvme_req(req)->status & 0x7ff) {
> +       /*
> +        * Generic command status:
> +        */
> +       case NVME_SC_INVALID_OPCODE:
> +       case NVME_SC_INVALID_FIELD:
> +       case NVME_SC_INVALID_NS:
> +       case NVME_SC_LBA_RANGE:
> +       case NVME_SC_CAP_EXCEEDED:
> +       case NVME_SC_RESERVATION_CONFLICT:
> +               return false;
> +
> +       /*
> +        * I/O command set specific error.  Unfortunately these values are
> +        * reused for fabrics commands, but those should never get here.
> +        */
> +       case NVME_SC_BAD_ATTRIBUTES:
> +       case NVME_SC_INVALID_PI:
> +       case NVME_SC_READ_ONLY:
> +       case NVME_SC_ONCS_NOT_SUPPORTED:
> +               WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
> +                       nvme_fabrics_command);
> +               return false;
> +
> +       /*
> +        * Media and Data Integrity Errors:
> +        */
> +       case NVME_SC_WRITE_FAULT:
> +       case NVME_SC_READ_ERROR:
> +       case NVME_SC_GUARD_CHECK:
> +       case NVME_SC_APPTAG_CHECK:
> +       case NVME_SC_REFTAG_CHECK:
> +       case NVME_SC_COMPARE_FAILED:
> +       case NVME_SC_ACCESS_DENIED:
> +       case NVME_SC_UNWRITTEN_BLOCK:
> +               return false;
> +       }
> +
> +       /* Everything else could be a path failure, so should be retried */
> +       return true;
> +}
> +
>  static inline bool nvme_req_needs_retry(struct request *req)
>  {
>         if (blk_noretry_request(req))
> @@ -145,6 +207,11 @@ static inline bool nvme_req_needs_retry(struct request *req)
>  void nvme_complete_rq(struct request *req)
>  {
>         if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
> +               if (nvme_req_needs_failover(req)) {
> +                       nvme_failover_req(req);
> +                       return;
> +               }
> +
>                 nvme_req(req)->retries++;
>                 blk_mq_requeue_request(req, true);
>                 return;
> @@ -173,6 +240,18 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
>  }
>  EXPORT_SYMBOL_GPL(nvme_cancel_request);
>
> +static void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> +{
> +       struct nvme_ns *ns;
> +
> +       mutex_lock(&ctrl->namespaces_mutex);
> +       list_for_each_entry(ns, &ctrl->namespaces, list) {
> +               if (ns->head)
> +                       kblockd_schedule_work(&ns->head->requeue_work);
> +       }
> +       mutex_unlock(&ctrl->namespaces_mutex);
> +}
> +
>  bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
>                 enum nvme_ctrl_state new_state)
>  {
> @@ -240,9 +319,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
>
>         if (changed)
>                 ctrl->state = new_state;
> -
>         spin_unlock_irqrestore(&ctrl->lock, flags);
>
> +       if (changed && ctrl->state == NVME_CTRL_LIVE)
> +               nvme_kick_requeue_lists(ctrl);
>         return changed;
>  }
>  EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
> @@ -252,6 +332,15 @@ static void nvme_destroy_ns_head(struct kref *ref)
>         struct nvme_ns_head *head =
>                 container_of(ref, struct nvme_ns_head, ref);
>
> +       del_gendisk(head->disk);
> +       blk_set_queue_dying(head->disk->queue);
> +       /* make sure all pending bios are cleaned up */
> +       kblockd_schedule_work(&head->requeue_work);
> +       flush_work(&head->requeue_work);
> +       blk_cleanup_queue(head->disk->queue);
> +       put_disk(head->disk);
> +       ida_simple_remove(&nvme_disk_ida, head->instance);
> +
>         list_del_init(&head->entry);
>         cleanup_srcu_struct(&head->srcu);
>         kfree(head);
> @@ -1123,8 +1212,10 @@ static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
>         if (blk_get_integrity(disk) &&
>             (ns->pi_type != pi_type || ns->ms != old_ms ||
>              bs != queue_logical_block_size(disk->queue) ||
> -            (ns->ms && ns->ext)))
> +            (ns->ms && ns->ext))) {
>                 blk_integrity_unregister(disk);
> +               blk_integrity_unregister(ns->head->disk);
> +       }
>
>         ns->pi_type = pi_type;
>  }
> @@ -1152,7 +1243,9 @@ static void nvme_init_integrity(struct nvme_ns *ns)
>         }
>         integrity.tuple_size = ns->ms;
>         blk_integrity_register(ns->disk, &integrity);
> +       blk_integrity_register(ns->head->disk, &integrity);
>         blk_queue_max_integrity_segments(ns->queue, 1);
> +       blk_queue_max_integrity_segments(ns->head->disk->queue, 1);
>  }
>  #else
>  static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id,
> @@ -1170,7 +1263,7 @@ static void nvme_set_chunk_size(struct nvme_ns *ns)
>         blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size));
>  }
>
> -static void nvme_config_discard(struct nvme_ns *ns)
> +static void nvme_config_discard(struct nvme_ns *ns, struct request_queue *queue)
>  {
>         struct nvme_ctrl *ctrl = ns->ctrl;
>         u32 logical_block_size = queue_logical_block_size(ns->queue);
> @@ -1181,18 +1274,18 @@ static void nvme_config_discard(struct nvme_ns *ns)
>         if (ctrl->nr_streams && ns->sws && ns->sgs) {
>                 unsigned int sz = logical_block_size * ns->sws * ns->sgs;
>
> -               ns->queue->limits.discard_alignment = sz;
> -               ns->queue->limits.discard_granularity = sz;
> +               queue->limits.discard_alignment = sz;
> +               queue->limits.discard_granularity = sz;
>         } else {
>                 ns->queue->limits.discard_alignment = logical_block_size;
>                 ns->queue->limits.discard_granularity = logical_block_size;
>         }
> -       blk_queue_max_discard_sectors(ns->queue, UINT_MAX);
> -       blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES);
> -       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
> +       blk_queue_max_discard_sectors(queue, UINT_MAX);
> +       blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
> +       queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue);
>
>         if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
> -               blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX);
> +               blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
>  }
>
>  static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid,
> @@ -1249,17 +1342,25 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
>         if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)
>                 nvme_prep_integrity(disk, id, bs);
>         blk_queue_logical_block_size(ns->queue, bs);
> +       blk_queue_logical_block_size(ns->head->disk->queue, bs);
>         if (ns->noiob)
>                 nvme_set_chunk_size(ns);
>         if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
>                 nvme_init_integrity(ns);
> -       if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
> +       if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) {
>                 set_capacity(disk, 0);
> -       else
> +               if (ns->head)
> +                       set_capacity(ns->head->disk, 0);
> +       } else {
>                 set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
> +               if (ns->head)
> +                       set_capacity(ns->head->disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
> +       }
>
> -       if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
> -               nvme_config_discard(ns);
> +       if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
> +               nvme_config_discard(ns, ns->queue);
> +               nvme_config_discard(ns, ns->head->disk->queue);
> +       }
>         blk_mq_unfreeze_queue(disk->queue);
>  }
>
> @@ -2404,6 +2505,80 @@ static const struct attribute_group *nvme_dev_attr_groups[] = {
>         NULL,
>  };
>
> +static struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
> +{
> +       struct nvme_ns *ns;
> +
> +       list_for_each_entry_rcu(ns, &head->list, siblings) {
> +               if (ns->ctrl->state == NVME_CTRL_LIVE) {
> +                       rcu_assign_pointer(head->current_path, ns);
> +                       return ns;
> +               }
> +       }
> +
> +       return NULL;
> +}
> +
> +static blk_qc_t nvme_make_request(struct request_queue *q, struct bio *bio)
> +{
> +       struct nvme_ns_head *head = q->queuedata;
> +       struct device *dev = disk_to_dev(head->disk);
> +       struct nvme_ns *ns;
> +       blk_qc_t ret = BLK_QC_T_NONE;
> +       int srcu_idx;
> +
> +       srcu_idx = srcu_read_lock(&head->srcu);
> +       ns = srcu_dereference(head->current_path, &head->srcu);
> +       if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
> +               ns = nvme_find_path(head);
> +       if (likely(ns)) {
> +               bio->bi_disk = ns->disk;
> +               bio->bi_opf |= REQ_NVME_MPATH;
> +               ret = direct_make_request(bio);
> +       } else if (!list_empty_careful(&head->list)) {
> +               dev_warn_ratelimited(dev, "no path available - requeing I/O\n");
> +
> +               spin_lock_irq(&head->requeue_lock);
> +               bio_list_add(&head->requeue_list, bio);
> +               spin_unlock_irq(&head->requeue_lock);
> +       } else {
> +               dev_warn_ratelimited(dev, "no path - failing I/O\n");
> +
> +               bio->bi_status = BLK_STS_IOERR;
> +               bio_endio(bio);
> +       }
> +
> +       srcu_read_unlock(&head->srcu, srcu_idx);
> +       return ret;
> +}
> +
> +static const struct block_device_operations nvme_subsys_ops = {
> +       .owner          = THIS_MODULE,
> +};
> +
> +static void nvme_requeue_work(struct work_struct *work)
> +{
> +       struct nvme_ns_head *head =
> +               container_of(work, struct nvme_ns_head, requeue_work);
> +       struct bio *bio, *next;
> +
> +       spin_lock_irq(&head->requeue_lock);
> +       next = bio_list_get(&head->requeue_list);
> +       spin_unlock_irq(&head->requeue_lock);
> +
> +       while ((bio = next) != NULL) {
> +               next = bio->bi_next;
> +               bio->bi_next = NULL;
> +
> +               /*
> +                * Reset disk to the mpath node and resubmit to select a new
> +                * path.
> +                */
> +               bio->bi_disk = head->disk;
> +               direct_make_request(bio);
> +       }
> +}
> +
>  static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys,
>                 unsigned nsid)
>  {
> @@ -2439,6 +2614,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>                 unsigned nsid, struct nvme_id_ns *id)
>  {
>         struct nvme_ns_head *head;
> +       struct request_queue *q;
>         int ret = -ENOMEM;
>
>         head = kzalloc(sizeof(*head), GFP_KERNEL);
> @@ -2447,6 +2623,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>
>         INIT_LIST_HEAD(&head->list);
>         head->ns_id = nsid;
> +       bio_list_init(&head->requeue_list);
> +       spin_lock_init(&head->requeue_lock);
> +       INIT_WORK(&head->requeue_work, nvme_requeue_work);
>         init_srcu_struct(&head->srcu);
>         kref_init(&head->ref);
>
> @@ -2459,8 +2638,37 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>                 goto out_free_head;
>         }
>
> +       ret = -ENOMEM;
> +       q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
> +       if (!q)
> +               goto out_free_head;
> +       q->queuedata = head;
> +       blk_queue_make_request(q, nvme_make_request);
> +       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
> +       /* set to a default value for 512 until disk is validated */
> +       blk_queue_logical_block_size(q, 512);
> +       nvme_set_queue_limits(ctrl, q);
> +
> +       head->instance = ida_simple_get(&nvme_disk_ida, 1, 0, GFP_KERNEL);
> +       if (head->instance < 0)
> +               goto out_cleanup_queue;
> +
> +       head->disk = alloc_disk(0);
> +       if (!head->disk)
> +               goto out_ida_remove;
> +       head->disk->fops = &nvme_subsys_ops;
> +       head->disk->private_data = head;
> +       head->disk->queue = q;
> +       head->disk->flags = GENHD_FL_EXT_DEVT;
> +       sprintf(head->disk->disk_name, "nvme/ns%d", head->instance);
> +
>         list_add_tail(&head->entry, &ctrl->subsys->nsheads);
>         return head;
> +
> +out_ida_remove:
> +       ida_simple_remove(&nvme_disk_ida, head->instance);
> +out_cleanup_queue:
> +       blk_cleanup_queue(q);
>  out_free_head:
>         cleanup_srcu_struct(&head->srcu);
>         kfree(head);
> @@ -2469,7 +2677,7 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
>  }
>
>  static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
> -               struct nvme_id_ns *id)
> +               struct nvme_id_ns *id, bool *new)
>  {
>         struct nvme_ctrl *ctrl = ns->ctrl;
>         bool is_shared = id->nmic & (1 << 0);
> @@ -2485,6 +2693,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
>                         ret = PTR_ERR(head);
>                         goto out_unlock;
>                 }
> +
> +               *new = true;
>         } else {
>                 struct nvme_ns_ids ids;
>
> @@ -2496,6 +2706,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid,
>                         ret = -EINVAL;
>                         goto out_unlock;
>                 }
> +
> +               *new = false;
>         }
>
>         list_add_tail(&ns->siblings, &head->list);
> @@ -2565,6 +2777,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>         struct nvme_id_ns *id;
>         char disk_name[DISK_NAME_LEN];
>         int node = dev_to_node(ctrl->dev);
> +       bool new = true;
>
>         ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
>         if (!ns)
> @@ -2597,7 +2810,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>         if (id->ncap == 0)
>                 goto out_free_id;
>
> -       if (nvme_init_ns_head(ns, nsid, id))
> +       if (nvme_init_ns_head(ns, nsid, id, &new))
>                 goto out_free_id;
>
>         if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) {
> @@ -2636,6 +2849,19 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>         if (ns->ndev && nvme_nvm_register_sysfs(ns))
>                 pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
>                         ns->disk->disk_name);
> +
> +       if (new)
> +               add_disk(ns->head->disk);
> +
> +       if (sysfs_create_link(&disk_to_dev(ns->disk)->kobj,
> +                       &disk_to_dev(ns->head->disk)->kobj, "mpath"))
> +               pr_warn("%s: failed to create sysfs link to mpath device\n",
> +                       ns->disk->disk_name);
> +       if (sysfs_create_link(&disk_to_dev(ns->head->disk)->kobj,
> +                       &disk_to_dev(ns->disk)->kobj, ns->disk->disk_name))
> +               pr_warn("%s: failed to create sysfs link from mpath device\n",
> +                       ns->disk->disk_name);
> +
>         return;
>   out_unlink_ns:
>         mutex_lock(&ctrl->subsys->lock);
> @@ -2663,6 +2889,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>                         blk_integrity_unregister(ns->disk);
>                 sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
>                                         &nvme_ns_attr_group);
> +               sysfs_remove_link(&disk_to_dev(ns->disk)->kobj, "mpath");
> +               sysfs_remove_link(&disk_to_dev(ns->head->disk)->kobj,
> +                               ns->disk->disk_name);
>                 if (ns->ndev)
>                         nvme_nvm_unregister_sysfs(ns);
>                 del_gendisk(ns->disk);
> @@ -2670,8 +2899,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
>         }
>
>         mutex_lock(&ns->ctrl->subsys->lock);
> -       if (head)
> +       if (head) {
> +               rcu_assign_pointer(head->current_path, NULL);
>                 list_del_rcu(&ns->siblings);
> +       }
>         mutex_unlock(&ns->ctrl->subsys->lock);
>
>         mutex_lock(&ns->ctrl->namespaces_mutex);
> @@ -3222,6 +3453,7 @@ int __init nvme_core_init(void)
>
>  void nvme_core_exit(void)
>  {
> +       ida_destroy(&nvme_disk_ida);
>         class_destroy(nvme_class);
>         __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
>         destroy_workqueue(nvme_wq);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index a724d2597c4c..2062e62c9769 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -94,6 +94,11 @@ struct nvme_request {
>         u16                     status;
>  };
>
> +/*
> + * Mark a bio as coming in through the mpath node.
> + */
> +#define REQ_NVME_MPATH         REQ_DRV
> +
>  enum {
>         NVME_REQ_CANCELLED              = (1 << 0),
>  };
> @@ -225,12 +230,18 @@ struct nvme_ns_ids {
>   * only ever has a single entry for private namespaces.
>   */
>  struct nvme_ns_head {
> +       struct nvme_ns __rcu    *current_path;
> +       struct gendisk          *disk;
>         struct list_head        list;
>         struct srcu_struct      srcu;
> +       struct bio_list         requeue_list;
> +       spinlock_t              requeue_lock;
> +       struct work_struct      requeue_work;
>         unsigned                ns_id;
>         struct nvme_ns_ids      ids;
>         struct list_head        entry;
>         struct kref             ref;
> +       int                     instance;
>  };
>
>  struct nvme_ns {
> --
> 2.14.1
>
>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-nvme


