Commit 60f8fbaa authored by Linus Torvalds

Merge tag 'for-5.15/io_uring-2021-09-04' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
 "As sometimes happens, two reports came in around the merge window open
  that led to some fixes. Hence this one is a bit bigger than usual
  followup fixes, but most of it will be going towards stable, outside
  of the fixes that are addressing regressions from this merge window.

  In detail:

   - postgres is a heavy user of signals between tasks, and if we're
     unlucky this can interfere with io-wq worker creation. Make sure
     we're resilient against unrelated signal handling. This set of
     changes also includes hardening against allocation failures, which
     could previously have led to stalls.

   - Some use cases that end up having a mix of bounded and unbounded
     work would run into starvation issues between the two classes.
     Split the pending work lists to handle that better (a conceptual
     sketch of the split follows further below).

   - Completion trace int -> unsigned -> long fix

   - Fix issue with REGISTER_IOWQ_MAX_WORKERS and SQPOLL (a usage sketch
     follows this list)

   - Fix regression with hash wait lock in this merge window

   - Fix retry issued on block devices (Ming)

   - Fix regression with links in this merge window (Pavel)

   - Fix race with multi-shot poll and completions (Xiaoguang)

   - Ensure regular file IO doesn't inadvertently skip completion
     batching (Pavel)

   - Ensure submissions are flushed after running task_work (Pavel)"
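
As a usage sketch for the REGISTER_IOWQ_MAX_WORKERS item above: the snippet below assumes a recent liburing (2.1 or newer) that exposes io_uring_register_iowq_max_workers(); the ring parameters and the error handling are illustrative only, not taken from this merge. Passing zeroes just reports the current per-class limits; non-zero values set new ones.

	/* Sketch: query/cap io-wq worker counts on an SQPOLL ring.
	 * Assumes liburing >= 2.1 for io_uring_register_iowq_max_workers(). */
	#include <liburing.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_params p = {
			.flags = IORING_SETUP_SQPOLL,
			.sq_thread_idle = 2000,	/* ms before the SQ thread idles */
		};
		/* values[0] = bounded workers, values[1] = unbounded workers;
		 * zeroes mean "just report the current limits". */
		unsigned int values[2] = { 0, 0 };
		int ret;

		ret = io_uring_queue_init_params(8, &ring, &p);
		if (ret < 0) {
			fprintf(stderr, "queue_init: %d\n", ret);
			return 1;
		}

		ret = io_uring_register_iowq_max_workers(&ring, values);
		if (ret < 0)
			fprintf(stderr, "register_iowq_max_workers: %d\n", ret);
		else
			printf("bounded=%u unbounded=%u\n", values[0], values[1]);

		io_uring_queue_exit(&ring);
		return 0;
	}

With SQPOLL, submissions are issued by the SQ poll thread rather than by the registering task, which is why the fs/io_uring.c hunk further below routes the register call to sqd->thread->io_uring instead of current->io_uring.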

* tag 'for-5.15/io_uring-2021-09-04' of git://git.kernel.dk/linux-block:
  io_uring: io_uring_complete() trace should take an integer
  io_uring: fix possible poll event lost in multi shot mode
  io_uring: prolong tctx_task_work() with flushing
  io_uring: don't disable kiocb_done() CQE batching
  io_uring: ensure IORING_REGISTER_IOWQ_MAX_WORKERS works with SQPOLL
  io-wq: make worker creation resilient against signals
  io-wq: get rid of FIXED worker flag
  io-wq: only exit on fatal signals
  io-wq: split bounded and unbounded work into separate lists
  io-wq: fix queue stalling race
  io_uring: don't submit half-prepared drain request
  io_uring: fix queueing half-created requests
  io-wq: ensure that hash wait lock is IRQ disabling
  io_uring: retry in case of short read on block device
  io_uring: IORING_OP_WRITE needs hash_reg_file set
  io-wq: fix race between adding work and activating a free worker
parents 20fbb11f 2fc2a7a6
This diff is collapsed.
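
The collapsed diff above presumably carries the io-wq side of the series (worker-creation resilience, the queue-stall fix, and the bounded/unbounded split). As a rough conceptual sketch of the split described in the pull message, with invented names and simplified types rather than the actual fs/io-wq.c structures: each work class gets its own pending list, so a backlog of unbounded work can no longer sit in front of bounded work.

	/* Conceptual sketch only: invented names, not the kernel's fs/io-wq.c
	 * structures. The idea is the same: one pending list per work class. */
	#include <stdio.h>
	#include <stddef.h>

	struct work_item {
		struct work_item *next;
		int bounded;	/* e.g. bounded block I/O vs. unbounded buffered work */
	};

	struct work_list {
		struct work_item *head;
		struct work_item *tail;
	};

	enum { ACCT_BOUND, ACCT_UNBOUND, ACCT_NR };

	struct wq {
		/* one account (and pending list) per work class */
		struct work_list pending[ACCT_NR];
	};

	static void wq_enqueue(struct wq *wq, struct work_item *w)
	{
		struct work_list *list =
			&wq->pending[w->bounded ? ACCT_BOUND : ACCT_UNBOUND];

		w->next = NULL;
		if (list->tail)
			list->tail->next = w;
		else
			list->head = w;
		list->tail = w;
	}

	/* A worker serving a given class only ever scans its own list. */
	static struct work_item *wq_dequeue(struct wq *wq, int acct)
	{
		struct work_list *list = &wq->pending[acct];
		struct work_item *w = list->head;

		if (w) {
			list->head = w->next;
			if (!list->head)
				list->tail = NULL;
		}
		return w;
	}

	int main(void)
	{
		static struct wq wq;
		struct work_item unbounded = { .bounded = 0 };
		struct work_item bounded = { .bounded = 1 };

		wq_enqueue(&wq, &unbounded);
		wq_enqueue(&wq, &bounded);
		/* The bounded item is immediately visible to bounded workers even
		 * though unbounded work was queued first. */
		printf("bounded pending: %p\n", (void *)wq_dequeue(&wq, ACCT_BOUND));
		return 0;
	}
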
fs/io_uring.c:

@@ -1021,6 +1021,7 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_WRITE] = {
 		.needs_file		= 1,
+		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
 		.plug			= 1,
@@ -1851,6 +1852,17 @@ static void io_req_complete_failed(struct io_kiocb *req, long res)
 	io_req_complete_post(req, res, 0);
 }
 
+static void io_req_complete_fail_submit(struct io_kiocb *req)
+{
+	/*
+	 * We don't submit, fail them all, for that replace hardlinks with
+	 * normal links. Extra REQ_F_LINK is tolerated.
+	 */
+	req->flags &= ~REQ_F_HARDLINK;
+	req->flags |= REQ_F_LINK;
+	io_req_complete_failed(req, req->result);
+}
+
 /*
  * Don't initialise the fields below on every allocation, but do that in
  * advance and keep them valid across allocations.
@@ -2119,6 +2131,9 @@ static void tctx_task_work(struct callback_head *cb)
 	while (1) {
 		struct io_wq_work_node *node;
 
+		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+			io_submit_flush_completions(ctx);
+
 		spin_lock_irq(&tctx->task_lock);
 		node = tctx->task_list.first;
 		INIT_WQ_LIST(&tctx->task_list);
@@ -2673,7 +2688,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 {
 	if (__io_complete_rw_common(req, res))
 		return;
-	__io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
+	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -3410,6 +3425,12 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 		return -EINVAL;
 }
 
+static bool need_read_all(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_ISREG ||
+		S_ISBLK(file_inode(req->file)->i_mode);
+}
+
 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3464,7 +3485,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	} else if (ret == -EIOCBQUEUED) {
 		goto out_free;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -5249,7 +5270,7 @@ static void io_poll_remove_double(struct io_kiocb *req)
 	}
 }
 
-static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5271,10 +5292,19 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
 
-	io_commit_cqring(ctx);
 	return !(flags & IORING_CQE_F_MORE);
 }
 
+static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+	__must_hold(&req->ctx->completion_lock)
+{
+	bool done;
+
+	done = __io_poll_complete(req, mask);
+	io_commit_cqring(req->ctx);
+	return done;
+}
+
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5285,7 +5315,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	} else {
 		bool done;
 
-		done = io_poll_complete(req, req->result);
+		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
 			hash_del(&req->hash_node);
@@ -5293,6 +5323,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 			req->result = 0;
 			add_wait_queue(req->poll.head, &req->poll.wait);
 		}
+		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
 
@@ -6398,6 +6429,11 @@ static bool io_drain_req(struct io_kiocb *req)
 	int ret;
 	u32 seq;
 
+	if (req->flags & REQ_F_FAIL) {
+		io_req_complete_fail_submit(req);
+		return true;
+	}
+
 	/*
 	 * If we need to drain a request in the middle of a link, drain the
 	 * head request and the next request/link after the current link.
@@ -6914,7 +6950,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
 	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
 		__io_queue_sqe(req);
 	} else if (req->flags & REQ_F_FAIL) {
-		io_req_complete_failed(req, req->result);
+		io_req_complete_fail_submit(req);
 	} else {
 		int ret = io_req_prep_async(req);
 
@@ -10498,26 +10534,46 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 					void __user *arg)
 {
-	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_task *tctx = NULL;
+	struct io_sq_data *sqd = NULL;
 	__u32 new_count[2];
 	int i, ret;
 
-	if (!tctx || !tctx->io_wq)
-		return -EINVAL;
 	if (copy_from_user(new_count, arg, sizeof(new_count)))
 		return -EFAULT;
 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
 		if (new_count[i] > INT_MAX)
 			return -EINVAL;
 
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		sqd = ctx->sq_data;
+		if (sqd) {
+			mutex_lock(&sqd->lock);
+			tctx = sqd->thread->io_uring;
+		}
+	} else {
+		tctx = current->io_uring;
+	}
+
+	ret = -EINVAL;
+	if (!tctx || !tctx->io_wq)
+		goto err;
+
 	ret = io_wq_max_workers(tctx->io_wq, new_count);
 	if (ret)
-		return ret;
+		goto err;
+
+	if (sqd)
+		mutex_unlock(&sqd->lock);
 
 	if (copy_to_user(arg, new_count, sizeof(new_count)))
 		return -EFAULT;
 
 	return 0;
+err:
+	if (sqd)
+		mutex_unlock(&sqd->lock);
+	return ret;
 }
 
 static bool io_register_op_must_quiesce(int op)
include/trace/events/io_uring.h:

@@ -295,14 +295,14 @@ TRACE_EVENT(io_uring_fail_link,
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, u64 user_data, long res, unsigned cflags),
+	TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
 
 	TP_ARGS(ctx, user_data, res, cflags),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
 		__field(  u64,		user_data	)
-		__field(  long,		res		)
+		__field(  int,		res		)
 		__field(  unsigned,	cflags		)
 	),
 
@@ -313,7 +313,7 @@ TRACE_EVENT(io_uring_complete,
 		__entry->cflags = cflags;
 	),
 
-	TP_printk("ring %p, user_data 0x%llx, result %ld, cflags %x",
+	TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
 		  __entry->ctx, (unsigned long long)__entry->user_data,
 		  __entry->res, __entry->cflags)
 );
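
For context on the trace change above: the completion result that lands in the CQE is a signed 32-bit field (cqe->res), so formatting it as a long in the tracepoint was a type mismatch, and the hunks above switch it to int. A minimal liburing sketch (error handling trimmed; not part of this merge) that reads the same field:

	/* Sketch: submit a NOP and read cqe->res, the int-sized (__s32) value
	 * that the io_uring_complete tracepoint also logs. */
	#include <liburing.h>
	#include <stdio.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;

		if (io_uring_queue_init(8, &ring, 0) < 0)
			return 1;

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_nop(sqe);
		io_uring_submit(&ring);

		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("res=%d cflags=%u\n", cqe->res, cqe->flags);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}

Enabling the io_uring:io_uring_complete tracepoint while such a loop runs shows the same res value, now printed with %d.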