[RFC 4/5] io_uring: add support for big-cqe
Kanchan Joshi
joshi.k at samsung.com
Fri Apr 1 04:03:09 PDT 2022
Add the IORING_SETUP_CQE32 flag to allow setting up a ring with big-cqe,
which is 32 bytes in size. Also modify the uring-cmd completion
infrastructure to accept an additional result and fill it into the big-cqe.
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
fs/io_uring.c | 82 +++++++++++++++++++++++++++++------
include/linux/io_uring.h | 10 +++--
include/uapi/linux/io_uring.h | 11 +++++
3 files changed, 87 insertions(+), 16 deletions(-)
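Note (not part of the patch): for the userspace view, here is a minimal
sketch of creating a ring with the new flag through the raw
io_uring_setup(2) syscall. The helper name and the omitted error handling
are illustrative only; IORING_SETUP_CQE32 is the flag this patch adds.

/*
 * Hedged sketch, not kernel code: set up a ring whose CQEs are 32 bytes.
 * On kernels without this patch the flag is rejected with -EINVAL by the
 * flags check below.
 */
#include <linux/io_uring.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int setup_cqe32_ring(unsigned int entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_CQE32;	/* CQEs are 32b instead of 16b */
	/* returns a ring fd on success, -1 with errno set on failure */
	return syscall(__NR_io_uring_setup, entries, &p);
}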
diff --git a/fs/io_uring.c b/fs/io_uring.c
index bd0e6b102a7b..b819c0ad47fc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -211,8 +211,8 @@ struct io_mapped_ubuf {
struct io_ring_ctx;
struct io_overflow_cqe {
- struct io_uring_cqe cqe;
struct list_head list;
+ struct io_uring_cqe cqe; /* this must be kept at end */
};
struct io_fixed_file {
@@ -1713,6 +1713,13 @@ static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx)
return NULL;
tail = ctx->cached_cq_tail++;
+
+ /* double index for large CQE */
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ mask = 2 * ctx->cq_entries - 1;
+ tail <<= 1;
+ }
+
return &rings->cqes[tail & mask];
}
@@ -1792,13 +1799,16 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
while (!list_empty(&ctx->cq_overflow_list)) {
struct io_uring_cqe *cqe = io_get_cqe(ctx);
struct io_overflow_cqe *ocqe;
+ int cqeshift = 0;
if (!cqe && !force)
break;
+ /* copy more for big-cqe */
+ cqeshift = ctx->flags & IORING_SETUP_CQE32 ? 1 : 0;
ocqe = list_first_entry(&ctx->cq_overflow_list,
struct io_overflow_cqe, list);
if (cqe)
- memcpy(cqe, &ocqe->cqe, sizeof(*cqe));
+ memcpy(cqe, &ocqe->cqe, sizeof(*cqe) << cqeshift);
else
io_account_cq_overflow(ctx);
@@ -1884,11 +1894,17 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
}
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
- s32 res, u32 cflags)
+ s32 res, u32 cflags, u64 res2,
+ int bigcqe)
{
struct io_overflow_cqe *ocqe;
+ int size = sizeof(*ocqe);
+
+ /* allocate more for big-cqe */
+ if (bigcqe)
+ size += sizeof(struct io_uring_cqe);
- ocqe = kmalloc(sizeof(*ocqe), GFP_ATOMIC | __GFP_ACCOUNT);
+ ocqe = kmalloc(size, GFP_ATOMIC | __GFP_ACCOUNT);
if (!ocqe) {
/*
* If we're in ring overflow flush mode, or in task cancel mode,
@@ -1907,6 +1923,11 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
ocqe->cqe.user_data = user_data;
ocqe->cqe.res = res;
ocqe->cqe.flags = cflags;
+ if (bigcqe) {
+ struct io_uring_cqe32 *bcqe = (struct io_uring_cqe32 *)&ocqe->cqe;
+
+ bcqe->res2 = res2;
+ }
list_add_tail(&ocqe->list, &ctx->cq_overflow_list);
return true;
}
@@ -1928,13 +1949,38 @@ static inline bool __fill_cqe(struct io_ring_ctx *ctx, u64 user_data,
WRITE_ONCE(cqe->flags, cflags);
return true;
}
- return io_cqring_event_overflow(ctx, user_data, res, cflags);
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, false);
}
+static inline bool __fill_big_cqe(struct io_ring_ctx *ctx, u64 user_data,
+ s32 res, u32 cflags, u64 res2)
+{
+ struct io_uring_cqe32 *bcqe;
+
+ /*
+ * If we can't get a cq entry, userspace overflowed the
+ * submission (by quite a lot). Increment the overflow count in
+ * the ring.
+ */
+ bcqe = (struct io_uring_cqe32 *) io_get_cqe(ctx);
+ if (likely(bcqe)) {
+ WRITE_ONCE(bcqe->cqe.user_data, user_data);
+ WRITE_ONCE(bcqe->cqe.res, res);
+ WRITE_ONCE(bcqe->cqe.flags, cflags);
+ WRITE_ONCE(bcqe->res2, res2);
+ return true;
+ }
+ return io_cqring_event_overflow(ctx, user_data, res, cflags, res2,
+ true);
+}
static inline bool __io_fill_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{
trace_io_uring_complete(req->ctx, req, req->user_data, res, cflags);
- return __fill_cqe(req->ctx, req->user_data, res, cflags);
+ if (!(req->ctx->flags & IORING_SETUP_CQE32))
+ return __fill_cqe(req->ctx, req->user_data, res, cflags);
+ else
+ return __fill_big_cqe(req->ctx, req->user_data, res, cflags,
+ req->uring_cmd.res2);
}
static noinline void io_fill_cqe_req(struct io_kiocb *req, s32 res, u32 cflags)
@@ -4126,10 +4172,12 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
* Called by consumers of io_uring_cmd, if they originally returned
* -EIOCBQUEUED upon receiving the command.
*/
-void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret)
+void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
{
struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
+ /* store secondary result in res2 */
+ req->uring_cmd.res2 = res2;
if (ret < 0)
req_set_fail(req);
io_req_complete(req, ret);
@@ -4163,7 +4211,7 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
/* queued async, consumer will call io_uring_cmd_done() when complete */
if (ret == -EIOCBQUEUED)
return 0;
- io_uring_cmd_done(ioucmd, ret);
+ io_uring_cmd_done(ioucmd, ret, 0);
return 0;
}
@@ -9026,13 +9074,20 @@ static void *io_mem_alloc(size_t size)
return (void *) __get_free_pages(gfp_flags, get_order(size));
}
-static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
- size_t *sq_offset)
+static unsigned long rings_size(struct io_uring_params *p,
+ size_t *sq_offset)
{
+ unsigned sq_entries, cq_entries;
struct io_rings *rings;
size_t off, sq_array_size;
- off = struct_size(rings, cqes, cq_entries);
+ sq_entries = p->sq_entries;
+ cq_entries = p->cq_entries;
+
+ if (p->flags & IORING_SETUP_CQE32)
+ off = struct_size(rings, cqes, 2 * cq_entries);
+ else
+ off = struct_size(rings, cqes, cq_entries);
if (off == SIZE_MAX)
return SIZE_MAX;
@@ -10483,7 +10538,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
- size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
+ size = rings_size(p, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -10713,7 +10768,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ |
- IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128))
+ IORING_SETUP_R_DISABLED | IORING_SETUP_SQE128 |
+ IORING_SETUP_CQE32))
return -EINVAL;
return io_uring_create(entries, &p, params);
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index cedc68201469..0aba7b50cde6 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,7 +14,10 @@ enum io_uring_cmd_flags {
struct io_uring_cmd {
struct file *file;
- void *cmd;
+ union {
+ void *cmd; /* used on submission */
+ u64 res2; /* used on completion */
+ };
/* for irq-completion - if driver requires doing stuff in task-context*/
void (*driver_cb)(struct io_uring_cmd *cmd);
u32 flags;
@@ -25,7 +28,7 @@ struct io_uring_cmd {
};
#if defined(CONFIG_IO_URING)
-void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret);
+void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
void (*driver_cb)(struct io_uring_cmd *));
struct sock *io_uring_get_socket(struct file *file);
@@ -48,7 +51,8 @@ static inline void io_uring_free(struct task_struct *tsk)
__io_uring_free(tsk);
}
#else
-static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret)
+static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
+ ssize_t ret2)
{
}
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
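Note (not part of the patch): with the extended prototype, a driver
providing ->uring_cmd would pass both results explicitly when completing
asynchronously. Everything named below except struct io_uring_cmd and
io_uring_cmd_done() is made up for the sketch.

/* Hedged sketch: 'my_drv_cmd_complete' and its arguments are illustrative. */
#include <linux/io_uring.h>
#include <linux/types.h>

static void my_drv_cmd_complete(struct io_uring_cmd *ioucmd, int status,
				u64 result)
{
	/*
	 * 'status' reaches cqe->res as before; 'result' is stashed in
	 * req->uring_cmd.res2 and copied into the big-cqe res2 field,
	 * but only when the ring was set up with IORING_SETUP_CQE32.
	 */
	io_uring_cmd_done(ioucmd, status, result);
}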
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index d7a4bdb9bf3b..85b8ff046496 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -113,6 +113,7 @@ enum {
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#define IORING_SETUP_SQE128 (1U << 7) /* SQEs are 128b */
+#define IORING_SETUP_CQE32 (1U << 8) /* CQEs are 32b */
enum {
IORING_OP_NOP,
@@ -207,6 +208,16 @@ struct io_uring_cqe {
__u32 flags;
};
+/*
+ * If the ring is initialized with IORING_SETUP_CQE32, we set up a large CQE.
+ * A large CQE is created by combining two adjacent regular CQEs.
+ */
+struct io_uring_cqe32 {
+ struct io_uring_cqe cqe;
+ __u64 res2;
+ __u64 unused;
+};
+
/*
* cqe->flags
*
--
2.25.1
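Note (not part of the patch): on the consumer side, reading a completion
from a CQE32 ring mirrors the io_get_cqe() change above: the logical head
index is doubled and masked with 2 * cq_entries - 1, and a single big-cqe
spans two regular CQE slots. 'struct app_cq' and read_big_cqe() below are
illustrative, not a real liburing or kernel interface; they assume the CQ
ring has already been mmap'd using the offsets from io_uring_params.

/*
 * Hedged userspace sketch: 'app_cq' simply caches pointers into the
 * mmap'd CQ ring; wrap handling works because cq_entries is a power
 * of two.
 */
#include <errno.h>
#include <linux/io_uring.h>
#include <string.h>

struct app_cq {
	unsigned int *khead;		/* CQ ring + cq_off.head */
	unsigned int *ktail;		/* CQ ring + cq_off.tail */
	unsigned int cq_entries;	/* io_uring_params.cq_entries */
	struct io_uring_cqe *cqes;	/* CQ ring + cq_off.cqes */
};

static int read_big_cqe(struct app_cq *cq, struct io_uring_cqe32 *out)
{
	unsigned int head = *cq->khead;
	unsigned int tail = __atomic_load_n(cq->ktail, __ATOMIC_ACQUIRE);
	unsigned int mask = 2 * cq->cq_entries - 1;

	if (head == tail)
		return -EAGAIN;		/* nothing completed yet */

	/* a big-cqe occupies two regular slots, so the index doubles */
	memcpy(out, &cq->cqes[(head << 1) & mask], sizeof(*out));

	/* the head counter still advances by one per completion */
	__atomic_store_n(cq->khead, head + 1, __ATOMIC_RELEASE);
	return 0;
}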