Commit 60f8fbaa authored by Linus Torvalds

Merge tag 'for-5.15/io_uring-2021-09-04' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
 "As sometimes happens, two reports came in around the merge window open
  that led to some fixes. Hence this one is a bit bigger than usual
  followup fixes, but most of it will be going towards stable, outside
  of the fixes that are addressing regressions from this merge window.

  In detail:

   - postgres is a heavy user of signals between tasks, and if we're
     unlucky this can interfere with io-wq worker creation. Make sure
     we're resilient against unrelated signal handling. This set of
     changes also includes hardening against allocation failures, which
     could previously have led to stalls.

   - Some use cases that end up with a mix of bounded and unbounded work
     would see starvation between the two classes. Split the pending
     work lists per class to handle that better (a minimal illustration
     follows the shortlog below)

   - Completion trace int -> unsigned -> long fix

   - Fix issue with REGISTER_IOWQ_MAX_WORKERS and SQPOLL (see the
     userspace sketch after this message)

   - Fix regression with hash wait lock in this merge window

   - Fix retry issued on block devices (Ming)

   - Fix regression with links in this merge window (Pavel)

   - Fix race with multi-shot poll and completions (Xiaoguang)

   - Ensure regular file IO doesn't inadvertently skip completion
     batching (Pavel)

   - Ensure submissions are flushed after running task_work (Pavel)"
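
The IORING_REGISTER_IOWQ_MAX_WORKERS fix above makes the register opcode act
on the io-wq that actually serves the ring when IORING_SETUP_SQPOLL is used
(the SQ poll thread's, not the submitting task's). Below is a hedged
userspace sketch of the call, not taken from the patch: it assumes UAPI
headers that already define IORING_REGISTER_IOWQ_MAX_WORKERS (5.15 or newer)
and wraps the raw syscalls locally rather than depending on liburing.

/*
 * Hedged sketch: set/query io-wq worker limits on an SQPOLL ring.
 * counts[0] = bounded workers, counts[1] = unbounded workers; a zero
 * entry leaves that limit untouched. On success the kernel writes the
 * previous limits back into counts[].
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

static int io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	return (int) syscall(__NR_io_uring_setup, entries, p);
}

static int io_uring_register(int fd, unsigned op, void *arg, unsigned nr_args)
{
	return (int) syscall(__NR_io_uring_register, fd, op, arg, nr_args);
}

int main(void)
{
	struct io_uring_params p;
	unsigned int counts[2] = { 8, 16 };
	int fd;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL;	/* limits now apply to the SQPOLL thread's io-wq */
	p.sq_thread_idle = 1000;

	fd = io_uring_setup(8, &p);
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}
	if (io_uring_register(fd, IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2) < 0) {
		perror("IORING_REGISTER_IOWQ_MAX_WORKERS");
		close(fd);
		return 1;
	}
	printf("previous limits: bounded=%u unbounded=%u\n", counts[0], counts[1]);
	close(fd);
	return 0;
}

A zero in either slot leaves that limit unchanged, which also makes the call
usable as a pure query; recent liburing versions expose a helper for the same
opcode.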

* tag 'for-5.15/io_uring-2021-09-04' of git://git.kernel.dk/linux-block:
  io_uring: io_uring_complete() trace should take an integer
  io_uring: fix possible poll event lost in multi shot mode
  io_uring: prolong tctx_task_work() with flushing
  io_uring: don't disable kiocb_done() CQE batching
  io_uring: ensure IORING_REGISTER_IOWQ_MAX_WORKERS works with SQPOLL
  io-wq: make worker creation resilient against signals
  io-wq: get rid of FIXED worker flag
  io-wq: only exit on fatal signals
  io-wq: split bounded and unbounded work into separate lists
  io-wq: fix queue stalling race
  io_uring: don't submit half-prepared drain request
  io_uring: fix queueing half-created requests
  io-wq: ensure that hash wait lock is IRQ disabling
  io_uring: retry in case of short read on block device
  io_uring: IORING_OP_WRITE needs hash_reg_file set
  io-wq: fix race between adding work and activating a free worker
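
On the bounded/unbounded starvation fix referenced in the pull message above:
the core of the change is that each io-wq accounting class now owns its own
pending list (see the io_wqe_acct changes in the diff below). The following
toy C program is a hedged illustration of that idea under invented names; it
is not derived from the kernel sources.

/*
 * Illustrative sketch only -- not kernel code. It mimics the idea of the
 * io-wq change: one pending FIFO per accounting class instead of a single
 * shared list, so a worker of one class never wades through the other
 * class's backlog. All names here (acct, work, ACCT_*) are stand-ins.
 */
#include <stdio.h>

enum { ACCT_BOUND, ACCT_UNBOUND, ACCT_NR };

struct work {
	int type;			/* ACCT_BOUND or ACCT_UNBOUND */
	struct work *next;
};

struct acct {
	struct work *head, *tail;	/* per-class pending FIFO */
};

static struct acct accts[ACCT_NR];

static void enqueue(struct work *w)
{
	struct acct *a = &accts[w->type];

	w->next = NULL;
	if (a->tail)
		a->tail->next = w;
	else
		a->head = w;
	a->tail = w;
}

/* A worker serving one class pops only from that class's list. */
static struct work *dequeue(int type)
{
	struct acct *a = &accts[type];
	struct work *w = a->head;

	if (w) {
		a->head = w->next;
		if (!a->head)
			a->tail = NULL;
	}
	return w;
}

int main(void)
{
	struct work unbound = { .type = ACCT_UNBOUND };
	struct work bound = { .type = ACCT_BOUND };

	enqueue(&unbound);
	enqueue(&bound);
	/* Bounded work is picked up immediately, no matter how much
	 * unbounded work was queued ahead of it. */
	printf("bounded worker got type %d\n", dequeue(ACCT_BOUND)->type);
	return 0;
}

In the real patch the same effect is achieved by moving work_list (and the
stalled bit) from struct io_wqe into struct io_wqe_acct.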
parents 20fbb11f 2fc2a7a6
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -23,8 +23,7 @@ enum {
 	IO_WORKER_F_UP		= 1,	/* up and active */
 	IO_WORKER_F_RUNNING	= 2,	/* account as running */
 	IO_WORKER_F_FREE	= 4,	/* worker on free list */
-	IO_WORKER_F_FIXED	= 8,	/* static idle worker */
-	IO_WORKER_F_BOUND	= 16,	/* is doing bounded work */
+	IO_WORKER_F_BOUND	= 8,	/* is doing bounded work */
 };
 
 enum {
@@ -32,7 +31,7 @@ enum {
 };
 
 enum {
-	IO_WQE_FLAG_STALLED	= 1,	/* stalled on hash */
+	IO_ACCT_STALLED_BIT	= 0,	/* stalled on hash */
 };
 
 /*
@@ -55,7 +54,10 @@ struct io_worker {
 	struct callback_head create_work;
 	int create_index;
 
+	union {
 		struct rcu_head rcu;
+		struct work_struct work;
+	};
 };
 
 #if BITS_PER_LONG == 64
@@ -71,25 +73,24 @@ struct io_wqe_acct {
 	unsigned max_workers;
 	int index;
 	atomic_t nr_running;
+	struct io_wq_work_list work_list;
+	unsigned long flags;
 };
 
 enum {
 	IO_WQ_ACCT_BOUND,
 	IO_WQ_ACCT_UNBOUND,
+	IO_WQ_ACCT_NR,
 };
 
 /*
  * Per-node worker thread pool
  */
 struct io_wqe {
-	struct {
 	raw_spinlock_t lock;
-	struct io_wq_work_list work_list;
-	unsigned flags;
-	} ____cacheline_aligned_in_smp;
+	struct io_wqe_acct acct[2];
 
 	int node;
 
-	struct io_wqe_acct acct[2];
-
 	struct hlist_nulls_head free_list;
 	struct list_head all_list;
@@ -133,8 +134,11 @@ struct io_cb_cancel_data {
 	bool cancel_all;
 };
 
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first);
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index);
 static void io_wqe_dec_running(struct io_worker *worker);
+static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
+					struct io_wqe_acct *acct,
+					struct io_cb_cancel_data *match);
 
 static bool io_worker_get(struct io_worker *worker)
 {
@@ -195,11 +199,10 @@ static void io_worker_exit(struct io_worker *worker)
 	do_exit(0);
 }
 
-static inline bool io_wqe_run_queue(struct io_wqe *wqe)
-	__must_hold(wqe->lock)
+static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
 {
-	if (!wq_list_empty(&wqe->work_list) &&
-	    !(wqe->flags & IO_WQE_FLAG_STALLED))
+	if (!wq_list_empty(&acct->work_list) &&
+	    !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
 		return true;
 	return false;
 }
@@ -208,7 +211,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
  * Check head of free list for an available worker. If one isn't available,
  * caller must create one.
  */
-static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
+					struct io_wqe_acct *acct)
 	__must_hold(RCU)
 {
 	struct hlist_nulls_node *n;
@@ -222,6 +226,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
 	hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
 		if (!io_worker_get(worker))
 			continue;
+		if (io_wqe_get_acct(worker) != acct) {
+			io_worker_release(worker);
+			continue;
+		}
 		if (wake_up_process(worker->task)) {
 			io_worker_release(worker);
 			return true;
@@ -236,9 +244,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
  * We need a worker. If we find a free one, we're good. If not, and we're
  * below the max number of workers, create one.
  */
-static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
+static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 {
-	bool ret;
+	bool do_create = false;
 
 	/*
 	 * Most likely an attempt to queue unbounded work on an io_wq that
@@ -247,17 +255,8 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 	if (unlikely(!acct->max_workers))
 		pr_warn_once("io-wq is not configured for unbound workers");
 
-	rcu_read_lock();
-	ret = io_wqe_activate_free_worker(wqe);
-	rcu_read_unlock();
-
-	if (!ret) {
-		bool do_create = false, first = false;
-
 	raw_spin_lock(&wqe->lock);
 	if (acct->nr_workers < acct->max_workers) {
-		if (!acct->nr_workers)
-			first = true;
 		acct->nr_workers++;
 		do_create = true;
 	}
@@ -265,9 +264,10 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct)
 	if (do_create) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		create_io_worker(wqe->wq, wqe, acct->index, first);
-		}
+		return create_io_worker(wqe->wq, wqe, acct->index);
 	}
+
+	return true;
 }
 
 static void io_wqe_inc_running(struct io_worker *worker)
@@ -283,7 +283,7 @@ static void create_worker_cb(struct callback_head *cb)
 	struct io_wq *wq;
 	struct io_wqe *wqe;
 	struct io_wqe_acct *acct;
-	bool do_create = false, first = false;
+	bool do_create = false;
 
 	worker = container_of(cb, struct io_worker, create_work);
 	wqe = worker->wqe;
@@ -291,14 +291,12 @@ static void create_worker_cb(struct callback_head *cb)
 	acct = &wqe->acct[worker->create_index];
 	raw_spin_lock(&wqe->lock);
 	if (acct->nr_workers < acct->max_workers) {
-		if (!acct->nr_workers)
-			first = true;
 		acct->nr_workers++;
 		do_create = true;
 	}
 	raw_spin_unlock(&wqe->lock);
 	if (do_create) {
-		create_io_worker(wq, wqe, worker->create_index, first);
+		create_io_worker(wq, wqe, worker->create_index);
 	} else {
 		atomic_dec(&acct->nr_running);
 		io_worker_ref_put(wq);
@@ -307,9 +305,11 @@ static void create_worker_cb(struct callback_head *cb)
 	io_worker_release(worker);
 }
 
-static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
-				   struct io_wqe_acct *acct)
+static bool io_queue_worker_create(struct io_worker *worker,
+				   struct io_wqe_acct *acct,
+				   task_work_func_t func)
 {
+	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 
 	/* raced with exit, just ignore create call */
@@ -327,16 +327,17 @@ static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker,
 	    test_and_set_bit_lock(0, &worker->create_state))
 		goto fail_release;
 
-	init_task_work(&worker->create_work, create_worker_cb);
+	init_task_work(&worker->create_work, func);
 	worker->create_index = acct->index;
 	if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL))
-		return;
+		return true;
 
 	clear_bit_unlock(0, &worker->create_state);
 fail_release:
 	io_worker_release(worker);
 fail:
 	atomic_dec(&acct->nr_running);
 	io_worker_ref_put(wq);
+	return false;
 }
 
 static void io_wqe_dec_running(struct io_worker *worker)
@@ -348,10 +349,10 @@ static void io_wqe_dec_running(struct io_worker *worker)
 	if (!(worker->flags & IO_WORKER_F_UP))
 		return;
 
-	if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
+	if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
 		atomic_inc(&acct->nr_running);
 		atomic_inc(&wqe->wq->worker_refs);
-		io_queue_worker_create(wqe, worker, acct);
+		io_queue_worker_create(worker, acct, create_worker_cb);
 	}
 }
@@ -363,29 +364,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
 			     struct io_wq_work *work)
 	__must_hold(wqe->lock)
 {
-	bool worker_bound, work_bound;
-
-	BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
-
 	if (worker->flags & IO_WORKER_F_FREE) {
 		worker->flags &= ~IO_WORKER_F_FREE;
 		hlist_nulls_del_init_rcu(&worker->nulls_node);
 	}
-
-	/*
-	 * If worker is moving from bound to unbound (or vice versa), then
-	 * ensure we update the running accounting.
-	 */
-	worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
-	work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
-	if (worker_bound != work_bound) {
-		int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
-		io_wqe_dec_running(worker);
-		worker->flags ^= IO_WORKER_F_BOUND;
-		wqe->acct[index].nr_workers--;
-		wqe->acct[index ^ 1].nr_workers++;
-		io_wqe_inc_running(worker);
-	}
 }
 
 /*
@@ -413,7 +395,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 {
 	struct io_wq *wq = wqe->wq;
 
-	spin_lock(&wq->hash->wait.lock);
+	spin_lock_irq(&wq->hash->wait.lock);
 	if (list_empty(&wqe->wait.entry)) {
 		__add_wait_queue(&wq->hash->wait, &wqe->wait);
 		if (!test_bit(hash, &wq->hash->map)) {
@@ -421,48 +403,26 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
 			list_del_init(&wqe->wait.entry);
 		}
 	}
-	spin_unlock(&wq->hash->wait.lock);
-}
-
-/*
- * We can always run the work if the worker is currently the same type as
- * the work (eg both are bound, or both are unbound). If they are not the
- * same, only allow it if incrementing the worker count would be allowed.
- */
-static bool io_worker_can_run_work(struct io_worker *worker,
-				   struct io_wq_work *work)
-{
-	struct io_wqe_acct *acct;
-
-	if (!(worker->flags & IO_WORKER_F_BOUND) !=
-	    !(work->flags & IO_WQ_WORK_UNBOUND))
-		return true;
-
-	/* not the same type, check if we'd go over the limit */
-	acct = io_work_get_acct(worker->wqe, work);
-	return acct->nr_workers < acct->max_workers;
+	spin_unlock_irq(&wq->hash->wait.lock);
 }
 
-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
-					   struct io_worker *worker,
-					   bool *stalled)
+static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
+					   struct io_worker *worker)
 	__must_hold(wqe->lock)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work, *tail;
 	unsigned int stall_hash = -1U;
+	struct io_wqe *wqe = worker->wqe;
 
-	wq_list_for_each(node, prev, &wqe->work_list) {
+	wq_list_for_each(node, prev, &acct->work_list) {
 		unsigned int hash;
 
 		work = container_of(node, struct io_wq_work, list);
 
-		if (!io_worker_can_run_work(worker, work))
-			break;
-
 		/* not hashed, can run anytime */
 		if (!io_wq_is_hashed(work)) {
-			wq_list_del(&wqe->work_list, node, prev);
+			wq_list_del(&acct->work_list, node, prev);
 			return work;
 		}
@@ -473,7 +433,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
 		/* hashed, can run if not already running */
 		if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
 			wqe->hash_tail[hash] = NULL;
-			wq_list_cut(&wqe->work_list, &tail->list, prev);
+			wq_list_cut(&acct->work_list, &tail->list, prev);
 			return work;
 		}
 		if (stall_hash == -1U)
@@ -483,10 +443,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
 	}
 
 	if (stall_hash != -1U) {
+		/*
+		 * Set this before dropping the lock to avoid racing with new
+		 * work being added and clearing the stalled bit.
+		 */
+		set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
 		raw_spin_unlock(&wqe->lock);
 		io_wait_on_hash(wqe, stall_hash);
 		raw_spin_lock(&wqe->lock);
-		*stalled = true;
 	}
 
 	return NULL;
@@ -520,13 +484,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
 	bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
 
 	do {
 		struct io_wq_work *work;
-		bool stalled;
get_next:
 		/*
 		 * If we got some work, mark us as busy. If we didn't, but
@@ -535,12 +499,9 @@ static void io_worker_handle_work(struct io_worker *worker)
 		 * can't make progress, any work completion or insertion will
 		 * clear the stalled flag.
 		 */
-		stalled = false;
-		work = io_get_next_work(wqe, worker, &stalled);
+		work = io_get_next_work(acct, worker);
 		if (work)
 			__io_worker_busy(wqe, worker, work);
-		else if (stalled)
-			wqe->flags |= IO_WQE_FLAG_STALLED;
 
 		raw_spin_unlock(&wqe->lock);
 		if (!work)
@@ -572,10 +533,10 @@ static void io_worker_handle_work(struct io_worker *worker)
 			if (hash != -1U && !next_hashed) {
 				clear_bit(hash, &wq->hash->map);
+				clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
 				if (wq_has_sleeper(&wq->hash->wait))
 					wake_up(&wq->hash->wait);
 				raw_spin_lock(&wqe->lock);
-				wqe->flags &= ~IO_WQE_FLAG_STALLED;
 				/* skip unnecessary unlock-lock wqe->lock */
 				if (!work)
 					goto get_next;
@@ -590,8 +551,10 @@ static void io_worker_handle_work(struct io_worker *worker)
 static int io_wqe_worker(void *data)
 {
 	struct io_worker *worker = data;
+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
 	struct io_wqe *wqe = worker->wqe;
 	struct io_wq *wq = wqe->wq;
+	bool last_timeout = false;
 	char buf[TASK_COMM_LEN];
 
 	worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
@@ -605,10 +568,17 @@ static int io_wqe_worker(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
loop:
 		raw_spin_lock(&wqe->lock);
-		if (io_wqe_run_queue(wqe)) {
+		if (io_acct_run_queue(acct)) {
 			io_worker_handle_work(worker);
 			goto loop;
 		}
+		/* timed out, exit unless we're the last worker */
+		if (last_timeout && acct->nr_workers > 1) {
+			raw_spin_unlock(&wqe->lock);
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		last_timeout = false;
 		__io_worker_idle(wqe, worker);
 		raw_spin_unlock(&wqe->lock);
 		if (io_flush_signals())
@@ -619,13 +589,11 @@ static int io_wqe_worker(void *data)
 			if (!get_signal(&ksig))
 				continue;
-			break;
-		}
-		if (ret)
-			continue;
-		/* timed out, exit unless we're the fixed worker */
-		if (!(worker->flags & IO_WORKER_F_FIXED))
-			break;
+			if (fatal_signal_pending(current))
+				break;
+			continue;
+		}
+		last_timeout = !ret;
 	}
 
 	if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
@@ -676,7 +644,91 @@ void io_wq_worker_sleeping(struct task_struct *tsk)
 	raw_spin_unlock(&worker->wqe->lock);
 }
 
-static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
+static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker,
+			       struct task_struct *tsk)
+{
+	tsk->pf_io_worker = worker;
+	worker->task = tsk;
+	set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
+	tsk->flags |= PF_NO_SETAFFINITY;
+
+	raw_spin_lock(&wqe->lock);
+	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
+	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
+	worker->flags |= IO_WORKER_F_FREE;
+	raw_spin_unlock(&wqe->lock);
+	wake_up_new_task(tsk);
+}
+
+static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
+{
+	return true;
+}
+
+static inline bool io_should_retry_thread(long err)
+{
+	switch (err) {
+	case -EAGAIN:
+	case -ERESTARTSYS:
+	case -ERESTARTNOINTR:
+	case -ERESTARTNOHAND:
+		return true;
+	default:
+		return false;
+	}
+}
+
+static void create_worker_cont(struct callback_head *cb)
+{
+	struct io_worker *worker;
+	struct task_struct *tsk;
+	struct io_wqe *wqe;
+
+	worker = container_of(cb, struct io_worker, create_work);
+	clear_bit_unlock(0, &worker->create_state);
+	wqe = worker->wqe;
+	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+	if (!IS_ERR(tsk)) {
+		io_init_new_worker(wqe, worker, tsk);
+		io_worker_release(worker);
+		return;
+	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+		struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+
+		atomic_dec(&acct->nr_running);
+		raw_spin_lock(&wqe->lock);
+		acct->nr_workers--;
+		if (!acct->nr_workers) {
+			struct io_cb_cancel_data match = {
+				.fn		= io_wq_work_match_all,
+				.cancel_all	= true,
+			};
+
+			while (io_acct_cancel_pending_work(wqe, acct, &match))
+				raw_spin_lock(&wqe->lock);
+		}
+		raw_spin_unlock(&wqe->lock);
+		io_worker_ref_put(wqe->wq);
+		return;
+	}
+
+	/* re-create attempts grab a new worker ref, drop the existing one */
+	io_worker_release(worker);
+	schedule_work(&worker->work);
+}
+
+static void io_workqueue_create(struct work_struct *work)
+{
+	struct io_worker *worker = container_of(work, struct io_worker, work);
+	struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+
+	if (!io_queue_worker_create(worker, acct, create_worker_cont)) {
+		clear_bit_unlock(0, &worker->create_state);
+		io_worker_release(worker);
+	}
+}
+
+static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index)
 {
 	struct io_wqe_acct *acct = &wqe->acct[index];
 	struct io_worker *worker;
@@ -685,42 +737,35 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first)
 	__set_current_state(TASK_RUNNING);
 
 	worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node);
-	if (!worker)
-		goto fail;
-
-	refcount_set(&worker->ref, 1);
-	worker->nulls_node.pprev = NULL;
-	worker->wqe = wqe;
-	spin_lock_init(&worker->lock);
-	init_completion(&worker->ref_done);
-
-	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
-	if (IS_ERR(tsk)) {
-		kfree(worker);
+	if (!worker) {
fail:
 		atomic_dec(&acct->nr_running);
 		raw_spin_lock(&wqe->lock);
 		acct->nr_workers--;
 		raw_spin_unlock(&wqe->lock);
 		io_worker_ref_put(wq);
-		return;
+		return false;
 	}
 
-	tsk->pf_io_worker = worker;
-	worker->task = tsk;
-	set_cpus_allowed_ptr(tsk, wqe->cpu_mask);
-	tsk->flags |= PF_NO_SETAFFINITY;
-
-	raw_spin_lock(&wqe->lock);
-	hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list);
-	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
-	worker->flags |= IO_WORKER_F_FREE;
+	refcount_set(&worker->ref, 1);
+	worker->wqe = wqe;
+	spin_lock_init(&worker->lock);
+	init_completion(&worker->ref_done);
+
 	if (index == IO_WQ_ACCT_BOUND)
 		worker->flags |= IO_WORKER_F_BOUND;
-	if (first && (worker->flags & IO_WORKER_F_BOUND))
-		worker->flags |= IO_WORKER_F_FIXED;
-	raw_spin_unlock(&wqe->lock);
-	wake_up_new_task(tsk);
+
+	tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+	if (!IS_ERR(tsk)) {
+		io_init_new_worker(wqe, worker, tsk);
+	} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
+		goto fail;
+	} else {
+		INIT_WORK(&worker->work, io_workqueue_create);
+		schedule_work(&worker->work);
+	}
+
+	return true;
 }
 
 /*
@@ -755,11 +800,6 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 	return false;
 }
 
-static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
-{
-	return true;
-}
-
 static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 {
 	struct io_wq *wq = wqe->wq;
@@ -773,12 +813,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 
 static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
 {
+	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
 	unsigned int hash;
 	struct io_wq_work *tail;
 
 	if (!io_wq_is_hashed(work)) {
append:
-		wq_list_add_tail(&work->list, &wqe->work_list);
+		wq_list_add_tail(&work->list, &acct->work_list);
 		return;
 	}
@@ -788,13 +829,14 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
 	if (!tail)
 		goto append;
 
-	wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
+	wq_list_add_after(&work->list, &tail->list, &acct->work_list);
 }
 
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
-	bool do_wake;
+	unsigned work_flags = work->flags;
+	bool do_create;
 
 	/*
 	 * If io-wq is exiting for this task, or if the request has explicitly
@@ -802,19 +844,36 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 	 */
 	if (test_bit(IO_WQ_BIT_EXIT, &wqe->wq->state) ||
 	    (work->flags & IO_WQ_WORK_CANCEL)) {
+run_cancel:
 		io_run_cancel(work, wqe);
 		return;
 	}
 
 	raw_spin_lock(&wqe->lock);
 	io_wqe_insert_work(wqe, work);
-	wqe->flags &= ~IO_WQE_FLAG_STALLED;
-	do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) ||
-		  !atomic_read(&acct->nr_running);
+	clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+
+	rcu_read_lock();
+	do_create = !io_wqe_activate_free_worker(wqe, acct);
+	rcu_read_unlock();
+
 	raw_spin_unlock(&wqe->lock);
 
-	if (do_wake)
-		io_wqe_wake_worker(wqe, acct);
+	if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+	    !atomic_read(&acct->nr_running))) {
+		bool did_create;
+
+		did_create = io_wqe_create_worker(wqe, acct);
+		if (unlikely(!did_create)) {
+			raw_spin_lock(&wqe->lock);
+			/* fatal condition, failed to create the first worker */
+			if (!acct->nr_workers) {
+				raw_spin_unlock(&wqe->lock);
+				goto run_cancel;
+			}
+			raw_spin_unlock(&wqe->lock);
+		}
+	}
 }
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
@@ -859,6 +918,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
 					 struct io_wq_work *work,
 					 struct io_wq_work_node *prev)
 {
+	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
 	unsigned int hash = io_get_work_hash(work);
 	struct io_wq_work *prev_work = NULL;
 
@@ -870,18 +930,18 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
 		else
 			wqe->hash_tail[hash] = NULL;
 	}
-	wq_list_del(&wqe->work_list, &work->list, prev);
+	wq_list_del(&acct->work_list, &work->list, prev);
 }
 
-static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+static bool io_acct_cancel_pending_work(struct io_wqe *wqe,
+					struct io_wqe_acct *acct,
 					struct io_cb_cancel_data *match)
+	__releases(wqe->lock)
 {
 	struct io_wq_work_node *node, *prev;
 	struct io_wq_work *work;
 
-retry:
-	raw_spin_lock(&wqe->lock);
-	wq_list_for_each(node, prev, &wqe->work_list) {
+	wq_list_for_each(node, prev, &acct->work_list) {
 		work = container_of(node, struct io_wq_work, list);
 		if (!match->fn(work, match->data))
 			continue;
@@ -889,11 +949,27 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
 		raw_spin_unlock(&wqe->lock);
 		io_run_cancel(work, wqe);
 		match->nr_pending++;
-		if (!match->cancel_all)
-			return;
-
 		/* not safe to continue after unlock */
-		goto retry;
+		return true;
+	}
+
+	return false;
+}
+
+static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+				       struct io_cb_cancel_data *match)
+{
+	int i;
retry:
+	raw_spin_lock(&wqe->lock);
+	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+		struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
+
+		if (io_acct_cancel_pending_work(wqe, acct, match)) {
+			if (match->cancel_all)
+				goto retry;
+			return;
+		}
 	}
 	raw_spin_unlock(&wqe->lock);
 }
@@ -954,18 +1030,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
 			    int sync, void *key)
 {
 	struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
+	int i;
 
 	list_del_init(&wait->entry);
 
 	rcu_read_lock();
-	io_wqe_activate_free_worker(wqe);
+	for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+		struct io_wqe_acct *acct = &wqe->acct[i];
+
+		if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
+			io_wqe_activate_free_worker(wqe, acct);
+	}
 	rcu_read_unlock();
 	return 1;
 }
 
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 {
-	int ret, node;
+	int ret, node, i;
 	struct io_wq *wq;
 
 	if (WARN_ON_ONCE(!data->free_work || !data->do_work))
@@ -1000,18 +1082,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		cpumask_copy(wqe->cpu_mask, cpumask_of_node(node));
 		wq->wqes[node] = wqe;
 		wqe->node = alloc_node;
-		wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
-		wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
 		wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
-		atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
 		wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 					task_rlimit(current, RLIMIT_NPROC);
-		atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
-		wqe->wait.func = io_wqe_hash_wake;
 		INIT_LIST_HEAD(&wqe->wait.entry);
+		wqe->wait.func = io_wqe_hash_wake;
+		for (i = 0; i < IO_WQ_ACCT_NR; i++) {
+			struct io_wqe_acct *acct = &wqe->acct[i];
+
+			acct->index = i;
+			atomic_set(&acct->nr_running, 0);
+			INIT_WQ_LIST(&acct->work_list);
+		}
 		wqe->wq = wq;
 		raw_spin_lock_init(&wqe->lock);
-		INIT_WQ_LIST(&wqe->work_list);
 		INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
 		INIT_LIST_HEAD(&wqe->all_list);
 	}
@@ -1038,7 +1122,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data)
 {
 	struct io_worker *worker;
 
-	if (cb->func != create_worker_cb)
+	if (cb->func != create_worker_cb || cb->func != create_worker_cont)
 		return false;
 	worker = container_of(cb, struct io_worker, create_work);
 	return worker->wqe->wq == data;
@@ -1193,7 +1277,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count)
 	for_each_node(node) {
 		struct io_wqe_acct *acct;
 
-		for (i = 0; i < 2; i++) {
+		for (i = 0; i < IO_WQ_ACCT_NR; i++) {
 			acct = &wq->wqes[node]->acct[i];
 			prev = max_t(int, acct->max_workers, prev);
 			if (new_count[i])
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1021,6 +1021,7 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_WRITE] = {
 		.needs_file		= 1,
+		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
 		.plug			= 1,
@@ -1851,6 +1852,17 @@ static void io_req_complete_failed(struct io_kiocb *req, long res)
 	io_req_complete_post(req, res, 0);
 }
 
+static void io_req_complete_fail_submit(struct io_kiocb *req)
+{
+	/*
+	 * We don't submit, fail them all, for that replace hardlinks with
+	 * normal links. Extra REQ_F_LINK is tolerated.
+	 */
+	req->flags &= ~REQ_F_HARDLINK;
+	req->flags |= REQ_F_LINK;
+	io_req_complete_failed(req, req->result);
+}
+
 /*
  * Don't initialise the fields below on every allocation, but do that in
  * advance and keep them valid across allocations.
@@ -2119,6 +2131,9 @@ static void tctx_task_work(struct callback_head *cb)
 	while (1) {
 		struct io_wq_work_node *node;
 
+		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+			io_submit_flush_completions(ctx);
+
 		spin_lock_irq(&tctx->task_lock);
 		node = tctx->task_list.first;
 		INIT_WQ_LIST(&tctx->task_list);
@@ -2673,7 +2688,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 {
 	if (__io_complete_rw_common(req, res))
 		return;
-	__io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
+	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
@@ -3410,6 +3425,12 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 	return -EINVAL;
 }
 
+static bool need_read_all(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_ISREG ||
+		S_ISBLK(file_inode(req->file)->i_mode);
+}
+
 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3464,7 +3485,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	} else if (ret == -EIOCBQUEUED) {
 		goto out_free;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
@@ -5249,7 +5270,7 @@ static void io_poll_remove_double(struct io_kiocb *req)
 	}
 }
 
-static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5271,10 +5292,19 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
 
-	io_commit_cqring(ctx);
 	return !(flags & IORING_CQE_F_MORE);
 }
 
+static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+	__must_hold(&req->ctx->completion_lock)
+{
+	bool done;
+
+	done = __io_poll_complete(req, mask);
+	io_commit_cqring(req->ctx);
+	return done;
+}
+
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5285,7 +5315,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	} else {
 		bool done;
 
-		done = io_poll_complete(req, req->result);
+		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
 			hash_del(&req->hash_node);
@@ -5293,6 +5323,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 			req->result = 0;
 			add_wait_queue(req->poll.head, &req->poll.wait);
 		}
+		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
@@ -6398,6 +6429,11 @@ static bool io_drain_req(struct io_kiocb *req)
 	int ret;
 	u32 seq;
 
+	if (req->flags & REQ_F_FAIL) {
+		io_req_complete_fail_submit(req);
+		return true;
+	}
+
 	/*
 	 * If we need to drain a request in the middle of a link, drain the
 	 * head request and the next request/link after the current link.
@@ -6914,7 +6950,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
 	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
 		__io_queue_sqe(req);
 	} else if (req->flags & REQ_F_FAIL) {
-		io_req_complete_failed(req, req->result);
+		io_req_complete_fail_submit(req);
 	} else {
 		int ret = io_req_prep_async(req);
 
@@ -10498,26 +10534,46 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 					void __user *arg)
 {
-	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_task *tctx = NULL;
+	struct io_sq_data *sqd = NULL;
 	__u32 new_count[2];
 	int i, ret;
 
-	if (!tctx || !tctx->io_wq)
-		return -EINVAL;
 	if (copy_from_user(new_count, arg, sizeof(new_count)))
 		return -EFAULT;
 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
 		if (new_count[i] > INT_MAX)
 			return -EINVAL;
 
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		sqd = ctx->sq_data;
+		if (sqd) {
+			mutex_lock(&sqd->lock);
+			tctx = sqd->thread->io_uring;
+		}
+	} else {
+		tctx = current->io_uring;
+	}
+
+	ret = -EINVAL;
+	if (!tctx || !tctx->io_wq)
+		goto err;
+
 	ret = io_wq_max_workers(tctx->io_wq, new_count);
 	if (ret)
-		return ret;
+		goto err;
+
+	if (sqd)
+		mutex_unlock(&sqd->lock);
 
 	if (copy_to_user(arg, new_count, sizeof(new_count)))
 		return -EFAULT;
 
 	return 0;
+err:
+	if (sqd)
+		mutex_unlock(&sqd->lock);
+	return ret;
 }
 
 static bool io_register_op_must_quiesce(int op)
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -295,14 +295,14 @@ TRACE_EVENT(io_uring_fail_link,
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, u64 user_data, long res, unsigned cflags),
+	TP_PROTO(void *ctx, u64 user_data, int res, unsigned cflags),
 
 	TP_ARGS(ctx, user_data, res, cflags),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
 		__field(  u64,		user_data	)
-		__field(  long,		res		)
+		__field(  int,		res		)
 		__field(  unsigned,	cflags		)
 	),
@@ -313,7 +313,7 @@ TRACE_EVENT(io_uring_complete,
 		__entry->cflags = cflags;
 	),
 
-	TP_printk("ring %p, user_data 0x%llx, result %ld, cflags %x",
+	TP_printk("ring %p, user_data 0x%llx, result %d, cflags %x",
 		  __entry->ctx, (unsigned long long)__entry->user_data,
 		  __entry->res, __entry->cflags)
 );