Commit 53ae7e11 authored by Linus Torvalds

Merge tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux

Pull more io_uring updates from Jens Axboe:
 "Here's a set of fixes/changes that didn't make the first cut, either
  because they got queued before I sent the early merge request, or
  fixes that came in afterwards. In detail:

   - Don't set MSG_NOSIGNAL on recv/recvmsg opcodes, as AF_PACKET will
     error out (David)

   - Fix for spurious poll wakeups (me)

   - Fix for a file leak for buffered reads in certain conditions
     (Joseph)

   - Don't allow registered buffers of mixed types (Pavel)

   - Improve handling of huge pages for registered buffers (Pavel)

   - Provided buffer ring size calculation fix (Wojciech)

   - Minor cleanups (me)"

* tag 'io_uring-6.3-2023-03-03' of git://git.kernel.dk/linux:
  io_uring/poll: don't pass in wake func to io_init_poll_iocb()
  io_uring: fix fget leak when fs don't support nowait buffered read
  io_uring/poll: allow some retries for poll triggering spuriously
  io_uring: remove MSG_NOSIGNAL from recvmsg
  io_uring/rsrc: always initialize 'folio' to NULL
  io_uring/rsrc: optimise registered huge pages
  io_uring/rsrc: optimise single entry advance
  io_uring/rsrc: disallow multi-source reg buffers
  io_uring: remove unused wq_list_merge
  io_uring: fix size calculation when registering buf ring
  io_uring/rsrc: fix a comment in io_import_fixed()
  io_uring: rename 'in_idle' to 'in_cancel'
  io_uring: consolidate the put_ref-and-return section of adding work
parents 9d0281b5 1947ddf9
@@ -58,7 +58,7 @@ struct io_uring_task {
 	struct xarray			xa;
 	struct wait_queue_head		wait;
-	atomic_t			in_idle;
+	atomic_t			in_cancel;
 	atomic_t			inflight_tracked;
 	struct percpu_counter		inflight;
...
@@ -719,7 +719,7 @@ static void io_put_task_remote(struct task_struct *task, int nr)
 	struct io_uring_task *tctx = task->io_uring;
 	percpu_counter_sub(&tctx->inflight, nr);
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		wake_up(&tctx->wait);
 	put_task_struct_many(task, nr);
 }
@@ -1258,8 +1258,8 @@ void tctx_task_work(struct callback_head *cb)
 		ctx_flush_and_put(ctx, &uring_locked);
-	/* relaxed read is enough as only the task itself sets ->in_idle */
-	if (unlikely(atomic_read(&tctx->in_idle)))
+	/* relaxed read is enough as only the task itself sets ->in_cancel */
+	if (unlikely(atomic_read(&tctx->in_cancel)))
 		io_uring_drop_tctx_refs(current);
 	trace_io_uring_task_work_run(tctx, count, loops);
@@ -1285,17 +1285,15 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	percpu_ref_get(&ctx->refs);
-	if (!llist_add(&req->io_task_work.node, &ctx->work_llist)) {
-		percpu_ref_put(&ctx->refs);
-		return;
-	}
+	if (!llist_add(&req->io_task_work.node, &ctx->work_llist))
+		goto put_ref;
 	/* needed for the following wake up */
 	smp_mb__after_atomic();
-	if (unlikely(atomic_read(&req->task->io_uring->in_idle))) {
+	if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) {
 		io_move_task_work_from_local(ctx);
-		percpu_ref_put(&ctx->refs);
-		return;
+		goto put_ref;
 	}
 	if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
@@ -1305,6 +1303,8 @@ static void io_req_local_work_add(struct io_kiocb *req)
 	if (READ_ONCE(ctx->cq_waiting))
 		wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE);
+put_ref:
 	percpu_ref_put(&ctx->refs);
 }
@@ -1777,7 +1777,7 @@ int io_req_prep_async(struct io_kiocb *req)
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
 	/* assign early for deferred execution for non-fixed file */
-	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE))
+	if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file)
 		req->file = io_file_get_normal(req, req->cqe.fd);
 	if (!cdef->prep_async)
 		return 0;
@@ -2937,12 +2937,12 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb)
 	work = container_of(cb, struct io_tctx_exit, task_work);
 	/*
-	 * When @in_idle, we're in cancellation and it's racy to remove the
+	 * When @in_cancel, we're in cancellation and it's racy to remove the
 	 * node. It'll be removed by the end of cancellation, just ignore it.
 	 * tctx can be NULL if the queueing of this task_work raced with
 	 * work cancelation off the exec path.
 	 */
-	if (tctx && !atomic_read(&tctx->in_idle))
+	if (tctx && !atomic_read(&tctx->in_cancel))
 		io_uring_del_tctx_node((unsigned long)work->ctx);
 	complete(&work->completion);
 }
@@ -3210,7 +3210,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (tctx->io_wq)
 		io_wq_exit_start(tctx->io_wq);
-	atomic_inc(&tctx->in_idle);
+	atomic_inc(&tctx->in_cancel);
 	do {
 		bool loop = false;
@@ -3261,9 +3261,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
 	if (cancel_all) {
 		/*
 		 * We shouldn't run task_works after cancel, so just leave
-		 * ->in_idle set for normal exit.
+		 * ->in_cancel set for normal exit.
 		 */
-		atomic_dec(&tctx->in_idle);
+		atomic_dec(&tctx->in_cancel);
 		/* for exec all current's requests should be gone, kill tctx */
 		__io_uring_free(current);
 	}
...
@@ -505,7 +505,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	}
 	pages = io_pin_pages(reg.ring_addr,
-			     struct_size(br, bufs, reg.ring_entries),
+			     flex_array_size(br, bufs, reg.ring_entries),
 			     &nr_pages);
 	if (IS_ERR(pages)) {
 		kfree(free_bl);
...
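
The size fix above matters because struct io_uring_buf_ring overlays its tail/reserved header on bufs[0], so the ring that userspace maps is exactly ring_entries * sizeof(struct io_uring_buf) bytes. struct_size() adds the struct header on top of the flexible array and over-counts by one element, which can push the range passed to io_pin_pages() one page past what was actually mapped. A small user-space sketch of the arithmetic (assumes uapi headers new enough to define these structs; not kernel code):

#include <stdio.h>
#include <linux/io_uring.h>

int main(void)
{
	unsigned int entries = 8;
	/* what struct_size(br, bufs, entries) computes: header + array */
	size_t with_header = sizeof(struct io_uring_buf_ring) +
			     entries * sizeof(struct io_uring_buf);
	/* what flex_array_size(br, bufs, entries) computes: array only,
	 * which matches the memory userspace really maps for the ring */
	size_t array_only = entries * sizeof(struct io_uring_buf);

	printf("struct_size=%zu flex_array_size=%zu\n",
	       with_header, array_only);
	return 0;
}
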
@@ -567,7 +567,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	sr->flags = READ_ONCE(sqe->ioprio);
 	if (sr->flags & ~(RECVMSG_FLAGS))
 		return -EINVAL;
-	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	sr->msg_flags = READ_ONCE(sqe->msg_flags);
 	if (sr->msg_flags & MSG_DONTWAIT)
 		req->flags |= REQ_F_NOWAIT;
 	if (sr->msg_flags & MSG_ERRQUEUE)
...
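
MSG_NOSIGNAL only suppresses SIGPIPE on the send path, and some protocols validate receive flags strictly: AF_PACKET's recvmsg rejects flags it does not recognise, so forcing MSG_NOSIGNAL onto every io_uring recvmsg broke packet sockets. A plain-socket illustration of that behaviour (a sketch only, needs CAP_NET_RAW to create the socket; not part of this patch):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>

int main(void)
{
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	char buf[2048];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };

	if (fd < 0) {
		perror("socket(AF_PACKET) (needs CAP_NET_RAW)");
		return 1;
	}
	/* MSG_NOSIGNAL is a send-side flag; packet sockets treat it as an
	 * unknown recvmsg flag and fail the receive with EINVAL. */
	if (recvmsg(fd, &msg, MSG_DONTWAIT | MSG_NOSIGNAL) < 0)
		printf("recvmsg: %s\n", strerror(errno));
	return 0;
}
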
@@ -51,6 +51,9 @@ struct io_poll_table {
 #define IO_WQE_F_DOUBLE		1
+static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
+			void *key);
 static inline struct io_kiocb *wqe_to_req(struct wait_queue_entry *wqe)
 {
 	unsigned long priv = (unsigned long)wqe->private;
@@ -164,15 +167,14 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked)
 	}
 }
-static void io_init_poll_iocb(struct io_poll *poll, __poll_t events,
-			      wait_queue_func_t wake_func)
+static void io_init_poll_iocb(struct io_poll *poll, __poll_t events)
 {
 	poll->head = NULL;
 #define IO_POLL_UNMASK	(EPOLLERR|EPOLLHUP|EPOLLNVAL|EPOLLRDHUP)
 	/* mask in events that we always want/need */
 	poll->events = events | IO_POLL_UNMASK;
 	INIT_LIST_HEAD(&poll->wait.entry);
-	init_waitqueue_func_entry(&poll->wait, wake_func);
+	init_waitqueue_func_entry(&poll->wait, io_poll_wake);
 }
 static inline void io_poll_remove_entry(struct io_poll *poll)
@@ -508,7 +510,7 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
 		/* mark as double wq entry */
 		wqe_private |= IO_WQE_F_DOUBLE;
-		io_init_poll_iocb(poll, first->events, first->wait.func);
+		io_init_poll_iocb(poll, first->events);
 		if (!io_poll_double_prepare(req)) {
 			/* the request is completing, just back off */
 			kfree(poll);
@@ -569,7 +571,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 	INIT_HLIST_NODE(&req->hash_node);
 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
-	io_init_poll_iocb(poll, mask, io_poll_wake);
+	io_init_poll_iocb(poll, mask);
 	poll->file = req->file;
 	req->apoll_events = poll->events;
@@ -650,6 +652,14 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll);
 }
+/*
+ * We can't reliably detect loops in repeated poll triggers and issue
+ * subsequently failing. But rather than fail these immediately, allow a
+ * certain amount of retries before we give up. Given that this condition
+ * should _rarely_ trigger even once, we should be fine with a larger value.
+ */
+#define APOLL_MAX_RETRY	128
 static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
					      unsigned issue_flags)
 {
@@ -665,14 +675,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 		if (entry == NULL)
 			goto alloc_apoll;
 		apoll = container_of(entry, struct async_poll, cache);
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	} else {
 alloc_apoll:
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
 			return NULL;
+		apoll->poll.retries = APOLL_MAX_RETRY;
 	}
 	apoll->double_poll = NULL;
 	req->apoll = apoll;
+	if (unlikely(!--apoll->poll.retries))
+		return NULL;
 	return apoll;
 }
@@ -694,8 +708,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
 		return IO_APOLL_ABORTED;
 	if (!file_can_poll(req->file))
 		return IO_APOLL_ABORTED;
-	if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED)
-		return IO_APOLL_ABORTED;
 	if (!(req->flags & REQ_F_APOLL_MULTISHOT))
 		mask |= EPOLLONESHOT;
...
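
The retries field added here bounds how often a request may be re-armed after its poll handler fires without the subsequent issue making progress; the removed REQ_F_POLLED check used to abort such requests on the very first spurious wakeup. A standalone sketch of the budget pattern (assumed names, simplified to user space; not the kernel code): the budget is set only when a fresh poll entry is attached, so re-arms of the same request keep spending it until the caller gives up and falls back to blocking work.

#include <stdbool.h>
#include <stdlib.h>

#define MAX_POLL_RETRY 128

struct apoll {
	int retries;
};

struct request {
	bool polled_before;	/* stands in for REQ_F_POLLED */
	struct apoll *apoll;
};

/* Returns the poll entry while the budget lasts, NULL once the request has
 * been spuriously re-armed too many times. */
static struct apoll *arm_poll(struct request *req)
{
	struct apoll *apoll;

	if (req->polled_before) {
		apoll = req->apoll;		/* reuse, keep the old budget */
	} else {
		apoll = malloc(sizeof(*apoll));
		if (!apoll)
			return NULL;
		apoll->retries = MAX_POLL_RETRY;	/* fresh budget */
	}
	req->apoll = apoll;
	req->polled_before = true;
	if (--apoll->retries == 0)
		return NULL;	/* give up, fall back to blocking work */
	return apoll;
}

int main(void)
{
	struct request req = { 0 };
	int armed = 0;

	while (arm_poll(&req))
		armed++;
	free(req.apoll);
	return armed == MAX_POLL_RETRY - 1 ? 0 : 1;
}
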
@@ -12,6 +12,7 @@ struct io_poll {
 	struct file			*file;
 	struct wait_queue_head		*head;
 	__poll_t			events;
+	int				retries;
 	struct wait_queue_entry		wait;
 };
...
@@ -1162,14 +1162,17 @@ struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages)
 	pret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM,
			      pages, vmas);
 	if (pret == nr_pages) {
+		struct file *file = vmas[0]->vm_file;
 		/* don't support file backed memory */
 		for (i = 0; i < nr_pages; i++) {
-			struct vm_area_struct *vma = vmas[i];
-			if (vma_is_shmem(vma))
+			if (vmas[i]->vm_file != file) {
+				ret = -EINVAL;
+				break;
+			}
+			if (!file)
 				continue;
-			if (vma->vm_file &&
-			    !is_file_hugepages(vma->vm_file)) {
+			if (!vma_is_shmem(vmas[i]) && !is_file_hugepages(file)) {
 				ret = -EOPNOTSUPP;
 				break;
 			}
@@ -1207,6 +1210,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
+	struct folio *folio = NULL;
 	*pimu = ctx->dummy_ubuf;
 	if (!iov->iov_base)
@@ -1221,6 +1225,21 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}
+	/* If it's a huge page, try to coalesce them into a single bvec entry */
+	if (nr_pages > 1) {
+		folio = page_folio(pages[0]);
+		for (i = 1; i < nr_pages; i++) {
+			if (page_folio(pages[i]) != folio) {
+				folio = NULL;
+				break;
+			}
+		}
+		if (folio) {
+			folio_put_refs(folio, nr_pages - 1);
+			nr_pages = 1;
+		}
+	}
 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
 		goto done;
@@ -1233,6 +1252,17 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
+	/* store original address for later verification */
+	imu->ubuf = (unsigned long) iov->iov_base;
+	imu->ubuf_end = imu->ubuf + iov->iov_len;
+	imu->nr_bvecs = nr_pages;
+	*pimu = imu;
+	ret = 0;
+	if (folio) {
+		bvec_set_page(&imu->bvec[0], pages[0], size, off);
+		goto done;
+	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;
@@ -1241,12 +1271,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		off = 0;
 		size -= vec_len;
 	}
-	/* store original address for later verification */
-	imu->ubuf = (unsigned long) iov->iov_base;
-	imu->ubuf_end = imu->ubuf + iov->iov_len;
-	imu->nr_bvecs = nr_pages;
-	*pimu = imu;
-	ret = 0;
 done:
 	if (ret)
 		kvfree(imu);
@@ -1335,7 +1359,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		return -EFAULT;
 	/*
-	 * May not be a start of buffer, set size appropriately
+	 * Might not be a start of buffer, set size appropriately
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
@@ -1361,7 +1385,15 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		const struct bio_vec *bvec = imu->bvec;
 		if (offset <= bvec->bv_len) {
-			iov_iter_advance(iter, offset);
+			/*
+			 * Note, huge pages buffers consists of one large
+			 * bvec entry and should always go this way. The other
+			 * branch doesn't expect non PAGE_SIZE'd chunks.
+			 */
+			iter->bvec = bvec;
+			iter->nr_segs = bvec->bv_len;
+			iter->count -= offset;
+			iter->iov_offset = offset;
 		} else {
 			unsigned long seg_skip;
...
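
The two rsrc changes above work together: registration collapses a buffer whose pinned pages all belong to one folio (for example a 2 MiB huge page) into a single large bvec and drops the surplus page references, and the import path then adjusts the offset within that one oversized segment instead of assuming PAGE_SIZE chunks. Rough arithmetic for the common case (a sketch with an assumed 4 KiB page size, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* assumed */
	unsigned long buf_len = 2UL << 20;	/* one 2 MiB huge page */
	unsigned long pinned_pages = buf_len / page_size;

	/* before: one bvec per pinned page; after: the whole folio as a
	 * single bvec, with the extra page references dropped */
	printf("bvec entries before: %lu\n", pinned_pages);	/* 512 */
	printf("bvec entries after:  %d\n", 1);
	printf("page refs dropped:   %lu\n", pinned_pages - 1);
	return 0;
}
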
@@ -27,28 +27,6 @@ static inline void wq_list_add_after(struct io_wq_work_node *node,
 	list->last = node;
 }
-/**
- * wq_list_merge - merge the second list to the first one.
- * @list0: the first list
- * @list1: the second list
- * Return the first node after mergence.
- */
-static inline struct io_wq_work_node *wq_list_merge(struct io_wq_work_list *list0,
-						    struct io_wq_work_list *list1)
-{
-	struct io_wq_work_node *ret;
-	if (!list0->first) {
-		ret = list1->first;
-	} else {
-		ret = list0->first;
-		list0->last->next = list1->first;
-	}
-	INIT_WQ_LIST(list0);
-	INIT_WQ_LIST(list1);
-	return ret;
-}
 static inline void wq_list_add_tail(struct io_wq_work_node *node,
				     struct io_wq_work_list *list)
 {
...
@@ -83,7 +83,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
-	atomic_set(&tctx->in_idle, 0);
+	atomic_set(&tctx->in_cancel, 0);
 	atomic_set(&tctx->inflight_tracked, 0);
 	task->io_uring = tctx;
 	init_llist_head(&tctx->task_list);
...