Commit f88262e6 authored by Dylan Yudaken, committed by Jens Axboe

io_uring: lockless task list

With networking use cases we see contention on the spinlock used to
protect the task_list when multiple threads try to add completions at once.
Instead we can use a lockless list, and assume that the first caller to
add to the list is responsible for kicking off task work.
Signed-off-by: Dylan Yudaken <dylany@fb.com>
Link: https://lore.kernel.org/r/20220622134028.2013417-4-dylany@fb.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent c34398a8
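
The pattern the commit message describes can be sketched outside the kernel with C11 atomics: a push onto a lock-free singly linked list reports whether the list was empty beforehand, so only the first producer needs to schedule the consumer, and the consumer detaches the whole chain with a single atomic exchange. This mirrors the semantics of llist_add() and llist_del_all() from <linux/llist.h>, but the sketch below is purely illustrative; the names list_add, list_del_all and struct node are stand-ins, not the kernel API.

/*
 * Standalone sketch (not kernel code) of the pattern adopted by this patch:
 * the producer that makes the list non-empty is the one responsible for
 * scheduling the consumer; the consumer drains everything in one shot.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	int value;
};

static _Atomic(struct node *) list_head = NULL;

/* Push a node; returns true if the list was empty before (caller must "kick"). */
static bool list_add(struct node *n)
{
	struct node *first = atomic_load(&list_head);

	do {
		n->next = first;
	} while (!atomic_compare_exchange_weak(&list_head, &first, n));

	return first == NULL;
}

/* Detach the whole chain at once; entries come back newest-first. */
static struct node *list_del_all(void)
{
	return atomic_exchange(&list_head, (struct node *)NULL);
}

int main(void)
{
	struct node a = { .value = 1 }, b = { .value = 2 };

	if (list_add(&a))
		puts("first add: would schedule task_work here");
	if (!list_add(&b))
		puts("second add: work already pending, nothing to do");

	for (struct node *n = list_del_all(); n; n = n->next)
		printf("draining node %d\n", n->value);

	return 0;
}

One consequence worth noting: llist_add() pushes at the head, so the drained chain is in reverse insertion order, whereas the old wq_list, filled with wq_list_add_tail(), was FIFO.
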
@@ -428,7 +428,7 @@ typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
 struct io_task_work {
 	union {
-		struct io_wq_work_node	node;
+		struct llist_node	node;
 		struct llist_node	fallback_node;
 	};
 	io_req_tw_func_t		func;
...
@@ -986,11 +986,12 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked)
 	percpu_ref_put(&ctx->refs);
 }

-static void handle_tw_list(struct io_wq_work_node *node,
+static void handle_tw_list(struct llist_node *node,
 			   struct io_ring_ctx **ctx, bool *locked)
 {
 	do {
-		struct io_wq_work_node *next = node->next;
+		struct llist_node *next = node->next;
 		struct io_kiocb *req = container_of(node, struct io_kiocb,
 						    io_task_work.node);

@@ -1014,23 +1015,11 @@ void tctx_task_work(struct callback_head *cb)
 	struct io_ring_ctx *ctx = NULL;
 	struct io_uring_task *tctx = container_of(cb, struct io_uring_task,
 						  task_work);
+	struct llist_node *node = llist_del_all(&tctx->task_list);

-	while (1) {
-		struct io_wq_work_node *node;
-
-		spin_lock_irq(&tctx->task_lock);
-		node = tctx->task_list.first;
-		INIT_WQ_LIST(&tctx->task_list);
-		if (!node)
-			tctx->task_running = false;
-		spin_unlock_irq(&tctx->task_lock);
-		if (!node)
-			break;
+	if (node) {
 		handle_tw_list(node, &ctx, &uring_locked);
 		cond_resched();
-
-		if (data_race(!tctx->task_list.first) && uring_locked)
-			io_submit_flush_completions(ctx);
 	}

 	ctx_flush_and_put(ctx, &uring_locked);
@@ -1044,16 +1033,10 @@ void io_req_task_work_add(struct io_kiocb *req)
 {
 	struct io_uring_task *tctx = req->task->io_uring;
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_wq_work_node *node;
-	unsigned long flags;
+	struct llist_node *node;
 	bool running;

-	spin_lock_irqsave(&tctx->task_lock, flags);
-	wq_list_add_tail(&req->io_task_work.node, &tctx->task_list);
-	running = tctx->task_running;
-	if (!running)
-		tctx->task_running = true;
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+	running = !llist_add(&req->io_task_work.node, &tctx->task_list);

 	/* task_work already pending, we're done */
 	if (running)
@@ -1065,11 +1048,8 @@ void io_req_task_work_add(struct io_kiocb *req)
 	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
 		return;

-	spin_lock_irqsave(&tctx->task_lock, flags);
-	tctx->task_running = false;
-	node = tctx->task_list.first;
-	INIT_WQ_LIST(&tctx->task_list);
-	spin_unlock_irqrestore(&tctx->task_lock, flags);
+	node = llist_del_all(&tctx->task_list);

 	while (node) {
 		req = container_of(node, struct io_kiocb, io_task_work.node);
...
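
The hunk above also shows the fallback path: if task_work_add() fails, the adder itself reclaims everything queued so far and disposes of each entry. Under the same assumptions as the earlier sketch, the shape of that path is roughly the following; dispose() is a hypothetical per-entry cleanup hook, not an io_uring function.

/*
 * Illustrative sketch only (not kernel code): if the "kick" cannot be
 * delivered, the producer atomically detaches the whole list and handles
 * each entry itself, as io_req_task_work_add() now does with
 * llist_del_all() when task_work_add() fails.
 */
#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
	void (*dispose)(struct node *n);	/* hypothetical per-entry cleanup */
};

static _Atomic(struct node *) list_head;

static void drain_on_kick_failure(void)
{
	/* One atomic exchange empties the list, like llist_del_all(). */
	struct node *n = atomic_exchange(&list_head, (struct node *)NULL);

	while (n) {
		struct node *next = n->next;	/* read before dispose() may free n */

		n->dispose(n);
		n = next;
	}
}
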
@@ -86,8 +86,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	atomic_set(&tctx->in_idle, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
-	spin_lock_init(&tctx->task_lock);
-	INIT_WQ_LIST(&tctx->task_list);
+	init_llist_head(&tctx->task_list);
 	init_task_work(&tctx->task_work, tctx_task_work);
 	return 0;
 }
...
 // SPDX-License-Identifier: GPL-2.0
+#include <linux/llist.h>
 /*
  * Arbitrary limit, can be raised if need be
  */
@@ -19,9 +21,7 @@ struct io_uring_task {
 	struct percpu_counter		inflight;

 	struct { /* task_work */
-		spinlock_t		task_lock;
-		bool			task_running;
-		struct io_wq_work_list	task_list;
+		struct llist_head	task_list;
 		struct callback_head	task_work;
 	} ____cacheline_aligned_in_smp;
 };
...