[RFC 4/5] io_uring: add support for big-cqe

Kanchan Joshi joshi.k at samsung.com
Fri Apr 1 04:03:09 PDT 2022


Add the IORING_SETUP_CQE32 flag to allow setting up a ring with big CQEs,
which are 32 bytes in size. Also modify the uring-cmd completion
infrastructure to accept an additional result and fill it into the big
CQE.
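For instance, a driver that consumes io_uring_cmd and completes
asynchronously could forward a second result roughly as below; the handler
name and its status/result values are purely illustrative, only the
extended io_uring_cmd_done() signature comes from this patch:

	#include <linux/io_uring.h>

	/* hypothetical driver completion callback */
	static void foo_uring_cmd_end_io(struct io_uring_cmd *ioucmd,
					 int status, u64 result)
	{
		/* status lands in cqe->res, result in the big-cqe res2 */
		io_uring_cmd_done(ioucmd, status, result);
	}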

Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
 fs/io_uring.c                 | 82 +++++++++++++++++++++++++++++------
 include/linux/io_uring.h      | 10 +++--
 include/uapi/linux/io_uring.h | 11 +++++
 3 files changed, 87 insertions(+), 16 deletions(-)
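
The userspace side is not part of this patch, but for clarity, a consumer
of a ring created with IORING_SETUP_CQE32 could reap completions by
treating each pair of regular CQE slots as one struct io_uring_cqe32. The
sketch below is only an illustration; the head/tail/mask pointers are
assumed to come from the usual IORING_OFF_CQ_RING mmap, which this patch
does not change:

	#include <stdatomic.h>
	#include <linux/io_uring.h>

	/* sketch: drain one big CQE, given pointers from the CQ ring mmap */
	static int reap_one_cqe32(_Atomic unsigned *khead, _Atomic unsigned *ktail,
				  unsigned ring_mask, struct io_uring_cqe *cqes,
				  struct io_uring_cqe32 *out)
	{
		unsigned head = atomic_load_explicit(khead, memory_order_relaxed);

		if (head == atomic_load_explicit(ktail, memory_order_acquire))
			return 0;	/* ring is empty */

		/* each completion occupies two regular CQE slots */
		*out = *(struct io_uring_cqe32 *)&cqes[(head & ring_mask) << 1];
		atomic_store_explicit(khead, head + 1, memory_order_release);
		return 1;
	}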

diff --git a/fs/io_uring.c b/fs/io_uring.c
index bd0e6b102a7b..b819c0ad47fc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -211,8 +211,8 @@ struct io_mapped_ubuf {
 struct io_ring_ctx;
 
 struct io_overflow_cqe {
-	struct io_uring_cqe cqe;
 	struct list_head list;
+	struct io_uring_cqe cqe; /* this must be kept at end */
 };
 
 struct io_fixed_file {
@@ -1713,6 +1713,13 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
 		return NULL;
 
 	tail = ctx->cached_cq_tail++;
+
+	/* double index for large CQE */
+	if (ctx->flags & IORING_SETUP_CQE32) {
+		mask = 2 * ctx->cq_entries - 1;
+		tail <<= 1;
+	}
+
 	return &rings->cqes[tail & mask];
 }
 
@@ -1792,13 +1799,16 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe(ctx);
 		struct io_overflow_cqe *ocqe;
+		int cqeshift = 0;
 
 		if (!cqe && !force)
 			break;
+		/* copy more for big-cqe */
+		cqeshift = ctx->flags & IORING_SETUP_CQE32 ? 1 : 0;
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
 		if (cqe)
-			memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+			memcpy(cqe, &ocqe->cqe, sizeof(*cqe) << cqeshift);
 		else
 			io_account_cq_overflow(ctx);
 
@@ -1884,11 +1894,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
 }
 
 static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
-				     s32 res, u32 cflags)
+				     s32 res, u32 cflags, u64 res2,
+				     int bigcqe)
 {
 	struct io_overflow_cqe *ocqe;
+	int size = sizeof(*ocqe);
+
+	/* allocate more for big-cqe */
+	if (bigcqe)
+		size += sizeof(struct io_uring_cqe);
 
-	ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+	ocqe = kmalloc(size, GFP_ATOMIC | __GFP_ACCOUNT);
 	if (!ocqe) {
 		/*
 		 * If we're in ring overflow flush mode, or in task cancel mode,
@@ -1907,6 +1923,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 	ocqe->cqe.user_data = user_data;
 	ocqe->cqe.res = res;
 	ocqe->cqe.flags = cflags;
+	if (bigcqe) {
+		struct io_uring_cqe32 *bcqe = (struct io_uring_cqe32 *)&ocqe->cqe;
+
+		bcqe->res2 = res2;
+	}
 	list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
 	return true;
 }
@@ -1928,13 +1949,38 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
 		WRITE_ONCE(cqe->flags, cflags);
 		return true;
 	}
-	return io_cqring_event_overflow(ctx, user_data, res, cflags);
+	return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, false);
 }
 
+static inline bool __fill_big_cqe(struct io_ring_ctx *ctx, u64 user_data,
+				 s32 res, u32 cflags, u64 res2)
+{
+	struct io_uring_cqe32 *bcqe;
+
+	/*
+	 * If we can't get a cq entry, userspace overflowed the
+	 * submission (by quite a lot). Increment the overflow count in
+	 * the ring.
+	 */
+	bcqe = (struct io_uring_cqe32 *) io_get_cqe(ctx);
+	if (likely(bcqe)) {
+		WRITE_ONCE(bcqe->cqe.user_data, user_data);
+		WRITE_ONCE(bcqe->cqe.res, res);
+		WRITE_ONCE(bcqe->cqe.flags, cflags);
+		WRITE_ONCE(bcqe->res2, res2);
+		return true;
+	}
+	return io_cqring_event_overflow(ctx, user_data, res, cflags, res2,
+		       true);
+}
 static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags)
 {
 	trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
-	return __fill_cqe(req->ctx, req->user_data, res, cflags);
+	if (!(req->ctx->flags & IORING_SETUP_CQE32))
+		return __fill_cqe(req->ctx, req->user_data, res, cflags);
+	else
+		return __fill_big_cqe(req->ctx, req->user_data, res, cflags,
+				req->uring_cmd.res2);
 }
 
 static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
@@ -4126,10 +4172,12 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
  * Called by consumers of io_uring_cmd, if they originally returned
  * -EIOCBQUEUED upon receiving the command.
  */
-void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret)
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
 {
 	struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
 
+	/* store secondary result in res2 */
+	req->uring_cmd.res2 = res2;
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_complete(req, ret);
@@ -4163,7 +4211,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	/* queued async, consumer will call io_uring_cmd_done() when complete */
 	if (ret == -EIOCBQUEUED)
 		return 0;
-	io_uring_cmd_done(ioucmd, ret);
+	io_uring_cmd_done(ioucmd, ret, 0);
 	return 0;
 }
 
@@ -9026,13 +9074,20 @@ static void *io_mem_alloc(size_t size)
 	return (void *) __get_free_pages(gfp_flags, get_order(size));
 }
 
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
-				size_t *sq_offset)
+static unsigned long rings_size(struct io_uring_params *p,
+		size_t *sq_offset)
 {
+	unsigned sq_entries, cq_entries;
 	struct io_rings *rings;
 	size_t off, sq_array_size;
 
-	off = struct_size(rings, cqes, cq_entries);
+	sq_entries = p->sq_entries;
+	cq_entries = p->cq_entries;
+
+	if (p->flags & IORING_SETUP_CQE32)
+		off = struct_size(rings, cqes, 2 * cq_entries);
+	else
+		off = struct_size(rings, cqes, cq_entries);
 	if (off == SIZE_MAX)
 		return SIZE_MAX;
 
@@ -10483,7 +10538,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
 	ctx->sq_entries = p->sq_entries;
 	ctx->cq_entries = p->cq_entries;
 
-	size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+	size = rings_size(p, &sq_array_offset);
 	if (size == SIZE_MAX)
 		return -EOVERFLOW;
 
@@ -10713,7 +10768,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
 			IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
 			IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
-			IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128))
+			IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128 |
+			IORING_SETUP_CQE32))
 		return -EINVAL;
 
 	return  io_uring_create(entries, &p, params);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index cedc68201469..0aba7b50cde6 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,7 +14,10 @@ enum io_uring_cmd_flags {
 
 struct io_uring_cmd {
 	struct file     *file;
-	void            *cmd;
+	union {
+		void            *cmd; /* used on submission */
+		u64		res2; /* used on completion */
+	};
 	/* for irq-completion - if driver requires doing stuff in task-context*/
 	void (*driver_cb)(struct io_uring_cmd *cmd);
 	u32             flags;
@@ -25,7 +28,7 @@ struct io_uring_cmd {
 };
 
 #if defined(CONFIG_IO_URING)
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
 void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 			void (*driver_cb)(struct io_uring_cmd *));
 struct sock *io_uring_get_socket(struct file *file);
@@ -48,7 +51,8 @@ static inline void io_uring_free(struct task_struct *tsk)
 		__io_uring_free(tsk);
 }
 #else
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret)
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+		ssize_t res2)
 {
 }
 static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7a4bdb9bf3b..85b8ff046496 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -113,6 +113,7 @@ enum {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SQE128	(1U << 7)	/* SQEs are 128b */
+#define IORING_SETUP_CQE32	(1U << 8)	/* CQEs are 32b */
 
 enum {
 	IORING_OP_NOP,
@@ -207,6 +208,16 @@ struct io_uring_cqe {
 	__u32	flags;
 };
 
+/*
+ * If the ring is initialized with IORING_SETUP_CQE32, we set up large CQEs.
+ * A large CQE is created by combining two adjacent regular CQEs.
+ */
+struct io_uring_cqe32 {
+	struct io_uring_cqe	cqe;
+	__u64	res2;
+	__u64	unused;
+};
+
 /*
  * cqe->flags
  *
-- 
2.25.1