[PATCH 05/11] nvme: add ANA support
Christoph Hellwig
hch at lst.de
Mon May 14 00:56:40 PDT 2018
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
drivers/nvme/host/core.c | 28 +++++
drivers/nvme/host/multipath.c | 189 +++++++++++++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 27 +++++
3 files changed, 239 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index b4d55d2455ad..4c33daefd9e9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1443,6 +1443,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds;
if (ns->lba_shift == 0)
ns->lba_shift = 9;
+ ns->anagrpid = le32_to_cpu(id->anagrpid);
ns->noiob = le16_to_cpu(id->noiob);
ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms);
@@ -2355,6 +2356,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
nvme_set_queue_limits(ctrl, ctrl->admin_q);
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
+ ctrl->max_namespaces = le32_to_cpu(id->mnan);
+ ctrl->anacap = id->anacap;
+ ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
+ ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
if (id->rtd3e) {
/* us -> s */
@@ -2433,6 +2438,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
+ ret = nvme_configure_ana(ctrl);
+ if (ret < 0)
+ return ret;
+
ctrl->identified = true;
return 0;
@@ -2634,6 +2643,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_ana_grpid.attr,
+ &dev_attr_ana_state.attr,
+#endif
NULL,
};
@@ -2656,6 +2669,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
return 0;
}
+#ifdef CONFIG_NVME_MULTIPATH
+ if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
+ if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
+ return 0;
+ if (!nvme_ctrl_has_ana(nvme_get_ns_from_dev(dev)->ctrl))
+ return 0;
+ }
+#endif
return a->mode;
}
@@ -3326,6 +3347,11 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
case NVME_AER_NOTICE_FW_ACT_STARTING:
queue_work(nvme_wq, &ctrl->fw_act_work);
break;
+ case NVME_AER_NOTICE_ANA:
+ if (WARN_ON_ONCE(!ctrl->ana_log_buf))
+ break;
+ queue_work(nvme_wq, &ctrl->ana_work);
+ break;
default:
dev_warn(ctrl->device, "async event result %08x\n", result);
}
@@ -3361,6 +3387,7 @@ void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
flush_work(&ctrl->scan_work);
+ cancel_work_sync(&ctrl->ana_work);
cancel_work_sync(&ctrl->fw_act_work);
if (ctrl->ops->stop_ctrl)
ctrl->ops->stop_ctrl(ctrl);
@@ -3394,6 +3421,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
+ nvme_deconfigure_ana(ctrl);
if (subsys) {
mutex_lock(&subsys->lock);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index b5a00853fbe2..4f096e9a65b1 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -51,7 +51,29 @@ void nvme_failover_req(struct request *req)
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
blk_mq_end_request(req, 0);
- nvme_reset_ctrl(ns->ctrl);
+ /*
+ * Reset the controller for any non-ANA error as we don't know what
+ * caused the error:
+ */
+ switch (nvme_req(req)->status & 0x7ff) {
+ case NVME_SC_ANA_TRANSITION:
+ // XXX: kick off a transition timer
+ case NVME_SC_ANA_PERSISTENT_LOSS:
+ case NVME_SC_ANA_INACCESSIBLE:
+ /*
+ * We could try to update the ANA group state here instead of
+ * waiting for the AER and log page read. But concurrency would
+ * be nasty.
+ */
+ nvme_mpath_clear_current_path(ns);
+ if (ns->head->disk)
+ kblockd_schedule_work(&ns->head->requeue_work);
+ break;
+ default:
+ nvme_reset_ctrl(ns->ctrl);
+ break;
+ }
+
kblockd_schedule_work(&ns->head->requeue_work);
}
@@ -76,12 +98,32 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
up_read(&ctrl->namespaces_rwsem);
}
-static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+/*
+ * Look up the ANA state that currently applies to @ns.
+ *
+ * A namespace on a controller without ANA support is always reported as
+ * optimized.  A group ID above the controller's anagrpmax triggers a one-time
+ * warning and yields 0, which nvme_ana_state_names[] labels "invalid state".
+ */
+static inline enum nvme_ana_state nvme_ns_ana_state(struct nvme_ns *ns)
+{
+	struct nvme_ctrl *ctrl = ns->ctrl;
+
+	if (!nvme_ctrl_has_ana(ctrl))
+		return NVME_ANA_OPTIMIZED;
+
+	/* ana_state[] is indexed by group ID; refuse out-of-range IDs. */
+	if (WARN_ON_ONCE(ns->anagrpid > ctrl->anagrpmax))
+		return 0;
+
+	return ctrl->ana_state[ns->anagrpid];
+}
+
+/*
+ * Printable names for enum nvme_ana_state values, indexed by state.  Index 0
+ * is not a state the log parser accepts and is labelled accordingly.  Both
+ * the strings and the table itself are read-only.
+ */
+static const char * const nvme_ana_state_names[] = {
+	[0]				= "invalid state",
+	[NVME_ANA_OPTIMIZED]		= "optimized",
+	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
+	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
+	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
+	[NVME_ANA_CHANGE]		= "change",
+};
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head,
+ u8 ana_state)
{
struct nvme_ns *ns;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state == NVME_CTRL_LIVE) {
+ if (ns->ctrl->state == NVME_CTRL_LIVE &&
+ nvme_ns_ana_state(ns) == ana_state) {
rcu_assign_pointer(head->current_path, ns);
return ns;
}
@@ -94,8 +136,14 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
- if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
- ns = __nvme_find_path(head);
+ if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE &&
+ nvme_ns_ana_state(ns) == NVME_ANA_OPTIMIZED))
+ return ns;
+
+ ns = __nvme_find_path(head, NVME_ANA_OPTIMIZED);
+ if (!ns)
+ ns = __nvme_find_path(head, NVME_ANA_NONOPTIMIZED);
+ /* XXX: try an inaccessible path as last resort per 8.18.3.3 */
return ns;
}
@@ -248,3 +296,134 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
blk_cleanup_queue(head->disk->queue);
put_disk(head->disk);
}
+
+/*
+ * Read the ANA log page into ctrl->ana_log_buf and apply it.
+ *
+ * For every group descriptor in the log the group's state is stored in
+ * ctrl->ana_state[], and, unless only group information was requested, every
+ * namespace listed in the descriptor has its anagrpid updated.
+ *
+ * @groups_only: request the log without per-group namespace lists.  Forced to
+ *	true when bit 7 of ctrl->anacap is set.
+ *
+ * Returns 0 on success, the value returned by nvme_get_log_ext() if reading
+ * the log failed, or -EINVAL if the log contents are malformed or do not fit
+ * in the buffer.  Group states parsed before a malformed descriptor is found
+ * stay applied.
+ */
+static int nvme_process_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+{
+	void *base = ctrl->ana_log_buf;
+	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
+	int error, i;
+
+	/*
+	 * If anagrpid never changes we don't need to process the namespace
+	 * lists.
+	 */
+	if (ctrl->anacap & (1 << 7))
+		groups_only = true;
+
+	error = nvme_get_log_ext(ctrl, NULL, NVME_LOG_ANA,
+			groups_only ? NVME_ANA_LOG_RGO : 0,
+			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
+	if (error) {
+		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
+		return error;
+	}
+
+	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
+		struct nvme_ana_group_desc *desc = base + offset;
+		u32 grpid, nr_nsids, n = 0;
+		size_t nsid_buf_size;
+		struct nvme_ns *ns;
+
+		/*
+		 * Make sure the descriptor header lies inside the buffer
+		 * before reading any of its fields.  Checking here rather
+		 * than after the previous descriptor means a log that ends
+		 * exactly at the end of the buffer is accepted, and groups
+		 * without namespaces cannot walk past the buffer.
+		 */
+		if (WARN_ON_ONCE(offset + sizeof(*desc) > ctrl->ana_log_size))
+			return -EINVAL;
+
+		grpid = le32_to_cpu(desc->grpid);
+		nr_nsids = le32_to_cpu(desc->nnsids);
+
+		if (WARN_ON_ONCE(grpid == 0))
+			return -EINVAL;
+		if (WARN_ON_ONCE(grpid > ctrl->anagrpmax))
+			return -EINVAL;
+		if (WARN_ON_ONCE(desc->state == 0))
+			return -EINVAL;
+		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
+			return -EINVAL;
+
+		dev_info(ctrl->device, "ANA group %d: %s.\n",
+				grpid, nvme_ana_state_names[desc->state]);
+		ctrl->ana_state[grpid] = desc->state;
+		offset += sizeof(*desc);
+		if (!nr_nsids)
+			continue;
+
+		/* A namespace list is unexpected if we asked for groups only. */
+		if (WARN_ON_ONCE(groups_only))
+			return -EINVAL;
+
+		/*
+		 * nr_nsids comes from the device.  Compare element counts
+		 * instead of byte sizes so that neither the multiplication
+		 * nor the subtraction can wrap; offset <= ana_log_size holds
+		 * because of the header check above.
+		 */
+		if (WARN_ON_ONCE(nr_nsids >
+				(ctrl->ana_log_size - offset) / sizeof(__le32)))
+			return -EINVAL;
+		nsid_buf_size = nr_nsids * sizeof(__le32);
+
+		/*
+		 * Single pass over the controller's namespace list: a
+		 * namespace is matched against the n-th listed NSID, and n
+		 * only advances on a match.
+		 */
+		down_write(&ctrl->namespaces_rwsem);
+		list_for_each_entry(ns, &ctrl->namespaces, list) {
+			u32 nsid = le32_to_cpu(desc->nsids[n]);
+
+			if (ns->head->ns_id != nsid)
+				continue;
+			ns->anagrpid = grpid;
+			if (++n == nr_nsids)
+				break;
+		}
+		up_write(&ctrl->namespaces_rwsem);
+		WARN_ON_ONCE(n < nr_nsids);
+
+		offset += nsid_buf_size;
+	}
+
+	return 0;
+}
+
+/*
+ * Work handler for ctrl->ana_work: re-read the full ANA log (including the
+ * namespace lists) and then kick the requeue lists of the controller.  The
+ * result of the log processing is not checked; the requeue lists are kicked
+ * either way.
+ */
+static void nvme_ana_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl;
+
+	ctrl = container_of(work, struct nvme_ctrl, ana_work);
+
+	nvme_process_ana_log(ctrl, false);
+	nvme_kick_requeue_lists(ctrl);
+}
+
+/*
+ * Set up ANA handling for @ctrl: allocate the per-group state array and the
+ * log page buffer, then read the initial group states.
+ *
+ * Returns 0 on success or when the controller does not support ANA, and a
+ * negative errno on failure.  On failure both buffers are freed and their
+ * pointers cleared, so a later nvme_deconfigure_ana() does not free them a
+ * second time.
+ */
+int nvme_configure_ana(struct nvme_ctrl *ctrl)
+{
+	int error;
+
+	/*
+	 * Initialize the work item before the early return below:
+	 * nvme_stop_ctrl() cancels ana_work regardless of ANA support.
+	 */
+	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+
+	if (!nvme_ctrl_has_ana(ctrl))
+		return 0;
+
+	/*
+	 * The array is indexed directly by group ID, and IDs up to and
+	 * including anagrpmax are accepted, so anagrpmax + 1 slots are needed.
+	 */
+	ctrl->ana_state = kcalloc((size_t)ctrl->anagrpmax + 1,
+			sizeof(*ctrl->ana_state), GFP_KERNEL);
+	if (!ctrl->ana_state)
+		return -ENOMEM;
+
+	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc) +
+		ctrl->max_namespaces * sizeof(__le32);
+	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
+	if (!ctrl->ana_log_buf) {
+		error = -ENOMEM;
+		goto out_free_ana_state;
+	}
+
+	error = nvme_process_ana_log(ctrl, true);
+	if (error) {
+		/*
+		 * nvme_init_identify() only treats negative values as
+		 * failure.  Should the log read report a positive status,
+		 * turn it into an errno so that setup is not considered
+		 * successful with the buffers already freed.
+		 */
+		if (error > 0)
+			error = -EIO;
+		goto out_free_ana_log_buf;
+	}
+	return 0;
+
+out_free_ana_log_buf:
+	kfree(ctrl->ana_log_buf);
+	ctrl->ana_log_buf = NULL;
+out_free_ana_state:
+	kfree(ctrl->ana_state);
+	ctrl->ana_state = NULL;
+	return error;
+}
+
+/*
+ * Release the ANA log buffer and group state array of @ctrl.
+ *
+ * Safe to call when ANA was never configured, since kfree(NULL) is a no-op.
+ * The pointers are cleared afterwards so that a repeated call cannot free
+ * them twice, and so that the !ctrl->ana_log_buf check in the AEN handler
+ * sees an unconfigured controller.
+ */
+void nvme_deconfigure_ana(struct nvme_ctrl *ctrl)
+{
+	kfree(ctrl->ana_log_buf);
+	ctrl->ana_log_buf = NULL;
+	kfree(ctrl->ana_state);
+	ctrl->ana_state = NULL;
+}
+
+/*
+ * sysfs show handler for "ana_grpid": print the ANA group ID of the namespace
+ * behind @dev.  anagrpid is a u32, so it is printed as unsigned.
+ */
+static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sprintf(buf, "%u\n", nvme_get_ns_from_dev(dev)->anagrpid);
+}
+DEVICE_ATTR_RO(ana_grpid);
+
+/*
+ * sysfs show handler for "ana_state": print the name of the ANA state that
+ * nvme_ns_ana_state() reports for the namespace behind @dev.
+ */
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+	const char *name = nvme_ana_state_names[nvme_ns_ana_state(ns)];
+
+	return sprintf(buf, "%s\n", name);
+}
+DEVICE_ATTR_RO(ana_state);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0e48ad5eb159..84c6445a0f53 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -168,6 +168,7 @@ struct nvme_ctrl {
u16 oacs;
u16 nssa;
u16 nr_streams;
+ u32 max_namespaces;
atomic_t abort_limit;
u8 vwc;
u32 vs;
@@ -188,6 +189,15 @@ struct nvme_ctrl {
struct nvme_command ka_cmd;
struct work_struct fw_act_work;
+ /* asymmetric namespace access: */
+ u8 anacap;
+ u32 anagrpmax;
+ u32 nanagrpid;
+ enum nvme_ana_state *ana_state;
+ size_t ana_log_size;
+ struct nvme_ana_rsp_hdr *ana_log_buf;
+ struct work_struct ana_work;
+
/* Power saving configuration */
u64 ps_max_latency_us;
bool apst_enabled;
@@ -293,6 +303,7 @@ struct nvme_ns {
#define NVME_NS_REMOVING 0
#define NVME_NS_DEAD 1
u16 noiob;
+ u32 anagrpid;
#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
struct nvme_fault_inject fault_inject;
@@ -435,6 +446,11 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
extern const struct attribute_group nvme_ns_id_attr_group;
extern const struct block_device_operations nvme_ns_head_ops;
+/*
+ * Report whether bit 3 of the subsystem's cmic value is set, which the
+ * driver uses as the indication that the controller supports ANA.
+ */
+static inline bool nvme_ctrl_has_ana(struct nvme_ctrl *ctrl)
+{
+	return !!(ctrl->subsys->cmic & (1 << 3));
+}
+
#ifdef CONFIG_NVME_MULTIPATH
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
struct nvme_ctrl *ctrl, int *flags);
@@ -444,6 +460,8 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+int nvme_configure_ana(struct nvme_ctrl *ctrl);
+void nvme_deconfigure_ana(struct nvme_ctrl *ctrl);
static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -462,6 +480,9 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+extern struct device_attribute dev_attr_ana_grpid;
+extern struct device_attribute dev_attr_ana_state;
+
#else
/*
* Without the multipath code enabled, multiple controller per subsystems are
@@ -501,6 +522,12 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
+/*
+ * Stub used when CONFIG_NVME_MULTIPATH is disabled: there is nothing to set
+ * up, so report success.  The caller checks the return value, so the
+ * function must actually return one.
+ */
+static inline int nvme_configure_ana(struct nvme_ctrl *ctrl)
+{
+	return 0;
+}
+/* Stub used when CONFIG_NVME_MULTIPATH is disabled: nothing to release. */
+static inline void nvme_deconfigure_ana(struct nvme_ctrl *ctrl)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
--
2.17.0
More information about the Linux-nvme
mailing list