[PATCH v2] nvme-pci: add NVMe controller statistics

Thu May 29 10:29:02 PDT 2025

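Count controller warning events (timeouts, aborts, resets and disables)
and expose each counter as a read-write sysfs attribute in a new "stats"
subdirectory of the controller device.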

Signed-off-by: Tokunori Ikegami <ikegami.t at gmail.com>
---
Changes since v1:
- Split the sysfs stats attribute into 4 separate files.
- Create a stats subdirectory for the split attributes.
- Change the device attributes to the read-write version.

 drivers/nvme/host/nvme.h |   9 +++
 drivers/nvme/host/pci.c  | 127 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ad0c1f834f09..5a6d0aebc9f8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -277,6 +277,13 @@ enum nvme_ctrl_flags {
 	NVME_CTRL_FROZEN		= 6,
 };
 
+struct nvme_stats {
+	unsigned long timeouts;	/* timeouts completed by polling */
+	unsigned long aborts;	/* abort attempts for timed-out I/O */
+	unsigned long resets;	/* "reset controller" warnings */
+	unsigned long disables;	/* disables after reset failure */
+};
+
 struct nvme_ctrl {
 	bool comp_seen;
 	bool identified;
@@ -411,6 +418,8 @@ struct nvme_ctrl {
 	enum nvme_ctrl_type cntrltype;
 	enum nvme_dctype dctype;
 	u16 awupf; /* 0's based value. */
+
+	struct nvme_stats stats;
 };
 
 static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e0bfe04a2bc2..632b222b51ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1467,6 +1467,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 		dev_warn(dev->ctrl.device,
 			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
 			 csts, result);
+	dev->ctrl.stats.resets++;
 
 	if (csts != ~0)
 		return;
@@ -1528,6 +1529,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 		dev_warn(dev->ctrl.device,
 			 "I/O tag %d (%04x) QID %d timeout, completion polled\n",
 			 req->tag, nvme_cid(req), nvmeq->qid);
+		dev->ctrl.stats.timeouts++;
 		return BLK_EH_DONE;
 	}
 
@@ -1565,6 +1567,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 			 "I/O tag %d (%04x) opcode %#x (%s) QID %d timeout, reset controller\n",
 			 req->tag, nvme_cid(req), opcode,
 			 nvme_opcode_str(nvmeq->qid, opcode), nvmeq->qid);
+		dev->ctrl.stats.resets++;
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
 		goto disable;
 	}
@@ -1584,6 +1587,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 		 req->tag, nvme_cid(req), opcode, nvme_get_opcode_str(opcode),
 		 nvmeq->qid, blk_op_str(req_op(req)), req_op(req),
 		 blk_rq_bytes(req));
+	dev->ctrl.stats.aborts++;
 
 	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, nvme_req_op(&cmd),
 					 BLK_MQ_REQ_NOWAIT);
@@ -2424,9 +2428,130 @@ static const struct attribute_group nvme_pci_dev_attrs_group = {
 	.is_visible	= nvme_pci_attrs_are_visible,
 };
 
+static ssize_t timeouts_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	const struct nvme_ctrl *nctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", nctrl->stats.timeouts);
+}
+
+static ssize_t timeouts_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long timeouts;
+	int err;
+
+	err = kstrtoul(buf, 10, &timeouts);
+	if (err)
+		return err;	/* keep -ERANGE distinct from -EINVAL */
+
+	ctrl->stats.timeouts = timeouts;
+
+	return count;
+}
+static DEVICE_ATTR_RW(timeouts);
+
+static ssize_t aborts_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	const struct nvme_ctrl *nctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", nctrl->stats.aborts);
+}
+
+static ssize_t aborts_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long aborts;
+	int err;
+
+	err = kstrtoul(buf, 10, &aborts);
+	if (err)
+		return err;	/* keep -ERANGE distinct from -EINVAL */
+
+	ctrl->stats.aborts = aborts;
+
+	return count;
+}
+static DEVICE_ATTR_RW(aborts);
+
+static ssize_t resets_show(struct device *dev, struct device_attribute *attr,
+			   char *buf)
+{
+	const struct nvme_ctrl *nctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", nctrl->stats.resets);
+}
+
+static ssize_t resets_store(struct device *dev, struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long resets;
+	int err;
+
+	err = kstrtoul(buf, 10, &resets);
+	if (err)
+		return err;	/* keep -ERANGE distinct from -EINVAL */
+
+	ctrl->stats.resets = resets;
+
+	return count;
+}
+static DEVICE_ATTR_RW(resets);
+
+static ssize_t disables_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	const struct nvme_ctrl *nctrl = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", nctrl->stats.disables);
+}
+
+static ssize_t disables_store(struct device *dev, struct device_attribute *attr,
+			      const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	unsigned long disables;
+	int err;
+
+	err = kstrtoul(buf, 10, &disables);
+	if (err)
+		return err;	/* keep -ERANGE distinct from -EINVAL */
+
+	ctrl->stats.disables = disables;
+
+	return count;
+}
+static DEVICE_ATTR_RW(disables);
+
+static umode_t nvme_stats_attrs_are_visible(struct kobject *kobj,
+					    struct attribute *a, int n)
+{
+	return a->mode;	/* unconditional; NOTE(review): maybe redundant */
+}
+
+static struct attribute *nvme_stats_attrs[] = {
+	&dev_attr_timeouts.attr,
+	&dev_attr_aborts.attr,
+	&dev_attr_resets.attr,
+	&dev_attr_disables.attr,
+	NULL,	/* end of list */
+};
+
+static const struct attribute_group nvme_stats_attrs_group = {
+	.name		= "stats",	/* sysfs subdirectory */
+	.attrs		= nvme_stats_attrs,
+	.is_visible	= nvme_stats_attrs_are_visible,
+};
+
 static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
 	&nvme_dev_attrs_group,
 	&nvme_pci_dev_attrs_group,
+	&nvme_stats_attrs_group,	/* "stats" counters */
 	NULL,
 };
 
@@ -3057,6 +3182,7 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
 		 result);
+	dev->ctrl.stats.disables++;
 	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
 	nvme_dev_disable(dev, true);
 	nvme_sync_queues(&dev->ctrl);
@@ -3593,6 +3719,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
 	case pci_channel_io_frozen:
 		dev_warn(dev->ctrl.device,
 			"frozen state error detected, reset controller\n");
+		dev->ctrl.stats.resets++;
 		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
 			nvme_dev_disable(dev, true);
 			return PCI_ERS_RESULT_DISCONNECT;
-- 
2.48.1