[PATCH 4/5] nvmet_tcp: activate new poll group

Wunderlich, Mark mark.wunderlich at intel.com
Thu Aug 27 21:01:00 EDT 2020


nvmet_tcp: activate new poll group

Shift the focus of nvmet_tcp_io_work() from an individual
queue to the poll group.  The worker now processes every queue
on the group's active work_list for a bounded polling time
period, then decides whether to re-queue itself.

Early (idle) exits from the polling loop are tracked across
successive invocations of the worker.  The worker stops
re-queuing itself once a complete polling period has elapsed
with no recorded activity.
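
In outline, each invocation of the reworked io_work() behaves
roughly as follows (a simplified sketch of the diff below;
error handling and the cond_resched() call are omitted):

    deadline = jiffies + usecs_to_jiffies(bp_usec);
    do {
            ops = 0;
            /* give every queue in the group one recv/send budget */
            list_for_each_entry_safe(queue, next, &group->work_list,
                                     glist_entry) {
                    nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
                    nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
            }
            grp_ops += ops;
    } while (ops && !time_after(jiffies, deadline));

    if (grp_ops) {
            /* activity seen: reset idle tracking and re-queue */
            group->deadline = 0;
            pending = true;
    } else {
            /* idle: only re-queue until a full idle period has elapsed */
            if (!group->deadline)
                    group->deadline = deadline;
            pending = !time_after(jiffies, group->deadline);
    }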

A new module parameter, io_work_poll_budget, allows an
alternative poll group processing time period to be set.
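
The budget is given in microseconds, and because the parameter is
created with mode 0644 it can be set at load time or adjusted at
runtime through sysfs, for example (illustrative values only):

    # load-time setting (20 ms budget)
    modprobe nvmet-tcp io_work_poll_budget=20000

    # runtime adjustment
    echo 20000 > /sys/module/nvmet_tcp/parameters/io_work_poll_budget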

Signed-off-by: Mark Wunderlich <mark.wunderlich at intel.com>
---
 drivers/nvme/target/tcp.c |   94 ++++++++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 84dd5b300a1d..3955dbe38f0f 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -29,9 +29,17 @@ static int so_priority;
 module_param(so_priority, int, 0644);
 MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
 
+/*
+ * Define an alternate time period (in usecs) that io_work() shall use
+ * instead of the default value.  Tuning the value may benefit
+ * configurations with a high queue count per poll group.
+ */
+static int io_work_poll_budget;
+module_param(io_work_poll_budget, int, 0644);
+MODULE_PARM_DESC(io_work_poll_budget, "nvmet tcp io_work poll time budget");
+
 #define NVMET_TCP_RECV_BUDGET		8
 #define NVMET_TCP_SEND_BUDGET		8
-#define NVMET_TCP_IO_WORK_BUDGET	64
 
 enum nvmet_tcp_send_state {
 	NVMET_TCP_SEND_DATA_PDU,
@@ -93,7 +101,6 @@ enum nvmet_tcp_queue_state {
 struct nvmet_tcp_queue {
 	struct socket		*sock;
 	struct nvmet_tcp_port	*port;
-	struct work_struct	io_work;
 	struct nvmet_cq		nvme_cq;
 	struct nvmet_sq		nvme_sq;
 
@@ -128,7 +135,10 @@ struct nvmet_tcp_queue {
 
 	int			idx;
 	struct list_head	queue_list;
+
 	struct nvmet_tcp_queue_group *group;
+	struct list_head	glist_entry;
+	struct mutex		activate_mutex;
 
 	struct nvmet_tcp_cmd	connect;
 
@@ -157,6 +167,7 @@ struct nvmet_tcp_queue_group {
 	struct nvmet_tcp_group_napi napi[NVMET_TCP_GROUP_NAPI_LIMIT];
 	struct work_struct	io_work;
 	int			cpu;
+	unsigned long		deadline;
 	struct list_head	work_list;
 	struct list_head	release_list;
 };
@@ -202,6 +213,16 @@ static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
 		!cmd->rbytes_done;
 }
 
+static inline void nvmet_tcp_queue_work(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_queue_group *group;
+
+	group = queue->group;
+	if (likely(group)) {
+		queue_work_on(group->cpu, nvmet_tcp_wq, &group->io_work);
+	}
+}
+
 static inline struct nvmet_tcp_cmd *
 nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
 {
@@ -521,7 +542,7 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
 	struct nvmet_tcp_queue	*queue = cmd->queue;
 
 	llist_add(&cmd->lentry, &queue->resp_list);
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	nvmet_tcp_queue_work(queue);
 }
 
 static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -1209,33 +1230,50 @@ static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_io_work(struct work_struct *w)
 {
-	struct nvmet_tcp_queue *queue =
-		container_of(w, struct nvmet_tcp_queue, io_work);
-	bool pending;
-	int ret, ops = 0;
+	struct nvmet_tcp_queue_group *group =
+		container_of(w, struct nvmet_tcp_queue_group, io_work);
+	struct nvmet_tcp_queue *queue, *next;
+	bool pending = false;
+	unsigned long deadline, bp_usec = 10000;
+	int ret, ops, grp_ops = 0;
 
-	do {
-		pending = false;
+	if (io_work_poll_budget > 0)
+		bp_usec = io_work_poll_budget;
+	deadline = jiffies + usecs_to_jiffies(bp_usec);
 
-		ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
-		if (ret > 0)
-			pending = true;
-		else if (ret < 0)
-			return;
+	do {
+		ops = 0;
+		list_for_each_entry_safe(queue, next, &group->work_list, glist_entry) {
+			ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
+			if (ret < 0)
+				return;
+
+			ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
+			if (ret < 0)
+				return;
+		}
+		grp_ops += ops;
+		if (!ops)
+			break;
+	} while (!time_after(jiffies, deadline));
 
-		ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
-		if (ret > 0)
+	if (grp_ops > 0) {
+		pending = true;
+		group->deadline = 0;
+	} else {
+		if (!group->deadline)
+			group->deadline = deadline;
+		if (!time_after(jiffies, group->deadline))
 			pending = true;
-		else if (ret < 0)
-			return;
+	}
 
-	} while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
+	cond_resched();
 
 	/*
-	 * We exahusted our budget, requeue our selves
+	 * We exhausted our budget, re-queue ourselves if pending activity
 	 */
 	if (pending)
-		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		queue_work_on(group->cpu, nvmet_tcp_wq, &group->io_work);
 }
 
 static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
@@ -1376,11 +1414,9 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 
 	nvmet_tcp_restore_socket_callbacks(queue);
-	flush_work(&queue->io_work);
 
 	nvmet_tcp_uninit_data_in_cmds(queue);
 	nvmet_sq_destroy(&queue->nvme_sq);
-	cancel_work_sync(&queue->io_work);
 	sock_release(queue->sock);
 	nvmet_tcp_free_cmds(queue);
 	if (queue->hdr_digest || queue->data_digest)
@@ -1397,7 +1433,7 @@ static void nvmet_tcp_data_ready(struct sock *sk)
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
 	if (likely(queue))
-		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		nvmet_tcp_queue_work(queue);
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -1417,7 +1453,7 @@ static void nvmet_tcp_write_space(struct sock *sk)
 
 	if (sk_stream_is_writeable(sk)) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+		nvmet_tcp_queue_work(queue);
 	}
 out:
 	read_unlock_bh(&sk->sk_callback_lock);
@@ -1569,7 +1605,6 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 		return -ENOMEM;
 
 	INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
-	INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
 	queue->sock = newsock;
 	queue->port = port;
 	queue->nr_cmds = 0;
@@ -1599,6 +1634,8 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
 	mutex_unlock(&nvmet_tcp_queue_mutex);
 
+	mutex_init(&queue->activate_mutex);
+	INIT_LIST_HEAD(&queue->glist_entry);
 	if (!nvmet_tcp_add_to_group(queue))
 		goto out_destroy_sq;
 
@@ -1606,7 +1643,10 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
 	if (ret)
 		goto out_remove_from_group;
 
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work);
+	if (!mutex_trylock(&queue->activate_mutex))
+		goto out_remove_from_group;
+	list_add_tail(&queue->glist_entry, &queue->group->work_list);
+	nvmet_tcp_queue_work(queue);
 
 	return 0;
 out_remove_from_group:


