Commit 3a56e241 authored by Linus Torvalds

Merge tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:
 "Here are the io_uring updates queued up for 6.11.

  Nothing major this time around, various minor improvements and
  cleanups/fixes. This contains:

   - Add bind/listen opcodes. The main motivation is to support direct
     descriptors, avoiding the need for a regular fd just to perform these
     two operations; a usage sketch follows this summary (Gabriel)

   - Probe fixes (Gabriel)

   - Treat io-wq work flags as atomics. Not fixing a real issue, but may
     as well and it silences a KCSAN warning (me)

   - Cleanup of rsrc __set_current_state() usage (me)

   - Add 64-bit length support for {m,f}advise operations (me)

   - Improve performance of data ring messages (me)

   - Fix for ring message overflow posting (Pavel)

   - Fix for freezer interaction with TWA_NOTIFY_SIGNAL. Not strictly an
     io_uring thing, but since TWA_NOTIFY_SIGNAL was originally added
     for faster task_work signaling for io_uring, bundling it with this
     pull (Pavel)

   - Add Pavel as a co-maintainer

   - Various cleanups (me, Thorsten)"
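
A minimal userspace sketch of the bind/listen opcodes mentioned above, driving
raw SQE fields as laid out by io_bind_prep()/io_listen_prep() in this series.
The liburing ring and the socket are assumed to exist already (a direct
descriptor works the same way with IOSQE_FIXED_FILE); this is an illustration
under those assumptions, not the canonical API:

#include <liburing.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <string.h>
#include <stdint.h>

static int bind_listen_uring(struct io_uring *ring, int sockfd, uint16_t port)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
		.sin_port = htons(port),
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret = 0;

	/* assumes the SQ ring has room for two entries */
	sqe = io_uring_get_sqe(ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_BIND;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) &addr;	/* sockaddr pointer */
	sqe->addr2 = sizeof(addr);		/* addr_len, read by io_bind_prep() */
	sqe->flags = IOSQE_IO_LINK;		/* only listen if bind succeeded */

	sqe = io_uring_get_sqe(ring);
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_LISTEN;
	sqe->fd = sockfd;
	sqe->len = SOMAXCONN;			/* backlog, read by io_listen_prep() */

	io_uring_submit(ring);

	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(ring, &cqe))
			return -1;
		if (cqe->res < 0)
			ret = cqe->res;
		io_uring_cqe_seen(ring, cqe);
	}
	return ret;
}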

* tag 'for-6.11/io_uring-20240714' of git://git.kernel.dk/linux: (28 commits)
  io_uring/net: check socket is valid in io_bind()/io_listen()
  kernel: rerun task_work while freezing in get_signal()
  io_uring/io-wq: limit retrying worker initialisation
  io_uring/napi: Remove unnecessary s64 cast
  io_uring/net: cleanup io_recv_finish() bundle handling
  io_uring/msg_ring: fix overflow posting
  MAINTAINERS: change Pavel Begunkov from io_uring reviewer to maintainer
  io_uring/msg_ring: use kmem_cache_free() to free request
  io_uring/msg_ring: check for dead submitter task
  io_uring/msg_ring: add an alloc cache for io_kiocb entries
  io_uring/msg_ring: improve handling of target CQE posting
  io_uring: add io_add_aux_cqe() helper
  io_uring: add remote task_work execution helper
  io_uring/msg_ring: tighten requirement for remote posting
  io_uring: Allocate only necessary memory in io_probe
  io_uring: Fix probe of disabled operations
  io_uring: Introduce IORING_OP_LISTEN
  io_uring: Introduce IORING_OP_BIND
  net: Split a __sys_listen helper for io_uring
  net: Split a __sys_bind helper for io_uring
  ...
parents 4f5e249e ad00e629
......@@ -11551,7 +11551,7 @@ F: include/linux/iosys-map.h
IO_URING
M: Jens Axboe <axboe@kernel.dk>
R: Pavel Begunkov <asml.silence@gmail.com>
M: Pavel Begunkov <asml.silence@gmail.com>
L: io-uring@vger.kernel.org
S: Maintained
T: git git://git.kernel.dk/linux-block
......
......@@ -50,7 +50,7 @@ struct io_wq_work_list {
struct io_wq_work {
struct io_wq_work_node list;
unsigned flags;
atomic_t flags;
/* place it here instead of io_kiocb as it fills padding and saves 4B */
int cancel_seq;
};
......@@ -210,14 +210,6 @@ struct io_submit_state {
struct blk_plug plug;
};
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
atomic_t refs;
atomic_t ops;
};
struct io_alloc_cache {
void **entries;
unsigned int nr_cached;
......@@ -372,7 +364,6 @@ struct io_ring_ctx {
struct io_restriction restrictions;
/* slow path rsrc auxilary data, used by update/register */
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
......@@ -405,6 +396,9 @@ struct io_ring_ctx {
struct callback_head poll_wq_task_work;
struct list_head defer_list;
struct io_alloc_cache msg_cache;
spinlock_t msg_lock;
#ifdef CONFIG_NET_RX_BUSY_POLL
struct list_head napi_list; /* track busy poll napi_id */
spinlock_t napi_lock; /* napi_list lock */
......
......@@ -442,11 +442,14 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
extern int __sys_socket(int family, int type, int protocol);
extern struct file *__sys_socket_file(int family, int type, int protocol);
extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
extern int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
int addrlen);
extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
int addrlen, int file_flags);
extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
int addrlen);
extern int __sys_listen(int fd, int backlog);
extern int __sys_listen_socket(struct socket *sock, int backlog);
extern int __sys_getsockname(int fd, struct sockaddr __user *usockaddr,
int __user *usockaddr_len);
extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr,
......
......@@ -257,6 +257,8 @@ enum io_uring_op {
IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
IORING_OP_FTRUNCATE,
IORING_OP_BIND,
IORING_OP_LISTEN,
/* this goes last, obviously */
IORING_OP_LAST,
......
......@@ -4,9 +4,9 @@
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \
uring_cmd.o openclose.o sqpoll.o \
xattr.o nop.o fs.o splice.o sync.o \
msg_ring.o advise.o openclose.o \
eventfd.o uring_cmd.o openclose.o \
sqpoll.o xattr.o nop.o fs.o splice.o \
sync.o msg_ring.o advise.o openclose.o \
epoll.o statx.o timeout.o fdinfo.o \
cancel.o waitid.o register.o \
truncate.o memmap.o
......
......@@ -17,14 +17,14 @@
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u64 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u64 len;
u32 advice;
};
......@@ -33,11 +33,13 @@ int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = io_kiocb_to_cmd(req, struct io_madvise);
if (sqe->buf_index || sqe->off || sqe->splice_fd_in)
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
ma->addr = READ_ONCE(sqe->addr);
ma->len = READ_ONCE(sqe->len);
ma->len = READ_ONCE(sqe->off);
if (!ma->len)
ma->len = READ_ONCE(sqe->len);
ma->advice = READ_ONCE(sqe->fadvise_advice);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
......@@ -78,11 +80,13 @@ int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_fadvise *fa = io_kiocb_to_cmd(req, struct io_fadvise);
if (sqe->buf_index || sqe->addr || sqe->splice_fd_in)
if (sqe->buf_index || sqe->splice_fd_in)
return -EINVAL;
fa->offset = READ_ONCE(sqe->off);
fa->len = READ_ONCE(sqe->len);
fa->len = READ_ONCE(sqe->addr);
if (!fa->len)
fa->len = READ_ONCE(sqe->len);
fa->advice = READ_ONCE(sqe->fadvise_advice);
if (io_fadvise_force_async(fa))
req->flags |= REQ_F_FORCE_ASYNC;
......
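
The net effect of the hunk above: the 64-bit length is carried in sqe->off for
madvise and in sqe->addr for fadvise (both fields previously had to be zero),
while sqe->len is still honored when the wide field is unset, keeping old
binaries working. A hedged sketch of filling SQEs under the new layout (field
names as in the prep functions above; the ring and mappings are assumed):

#include <liburing.h>
#include <string.h>

/* madvise: address in sqe->addr, 64-bit length in sqe->off */
static void prep_madvise64(struct io_uring_sqe *sqe, void *base, __u64 len, int advice)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_MADVISE;
	sqe->addr = (unsigned long) base;
	sqe->off = len;				/* may exceed UINT32_MAX */
	sqe->fadvise_advice = advice;		/* e.g. MADV_DONTNEED */
}

/* fadvise: file offset in sqe->off, 64-bit length in sqe->addr */
static void prep_fadvise64(struct io_uring_sqe *sqe, int fd, __u64 offset, __u64 len, int advice)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_FADVISE;
	sqe->fd = fd;
	sqe->off = offset;
	sqe->addr = len;			/* may exceed UINT32_MAX */
	sqe->fadvise_advice = advice;		/* e.g. POSIX_FADV_DONTNEED */
}
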
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/eventfd.h>
#include <linux/eventpoll.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>
#include "io-wq.h"
#include "eventfd.h"
struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1;
struct rcu_head rcu;
atomic_t refs;
atomic_t ops;
};
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
};
static void io_eventfd_free(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
static void io_eventfd_do_signal(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
if (atomic_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu);
}
void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd = NULL;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
return;
guard(rcu)();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both the check
* and eventfd_signal()
*/
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (unlikely(!ev_fd))
return;
if (!atomic_inc_not_zero(&ev_fd->refs))
return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
if (likely(eventfd_signal_allowed())) {
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) {
call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal);
return;
}
}
out:
if (atomic_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
}
void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
bool skip;
spin_lock(&ctx->completion_lock);
/*
* Eventfd should only get triggered when at least one event has been
* posted. Some applications rely on the eventfd notification count
* only changing IFF a new CQE has been added to the CQ ring. There's
* no dependency on a 1:1 relationship between how many times this
* function is called (and hence the eventfd count) and number of CQEs
* posted to the CQ ring.
*/
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (skip)
return;
io_eventfd_signal(ctx);
}
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0;
}
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (atomic_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0;
}
return -ENXIO;
}
struct io_ring_ctx;
int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async);
int io_eventfd_unregister(struct io_ring_ctx *ctx);
void io_eventfd_flush_signal(struct io_ring_ctx *ctx);
void io_eventfd_signal(struct io_ring_ctx *ctx);
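
The two files above only carve the eventfd plumbing out of io_uring.c and
register.c; the registration interface is unchanged. For reference, a hedged
sketch of the userspace side that this code serves (standard eventfd(2) and
liburing register helpers; error handling kept minimal):

#include <liburing.h>
#include <sys/eventfd.h>
#include <unistd.h>

static int attach_cq_eventfd(struct io_uring *ring)
{
	int efd = eventfd(0, EFD_CLOEXEC);

	if (efd < 0)
		return -1;
	/* io_uring_register_eventfd_async() would set eventfd_async instead */
	if (io_uring_register_eventfd(ring, efd)) {
		close(efd);
		return -1;
	}
	return efd;	/* becomes readable whenever new CQEs are posted */
}
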
......@@ -23,6 +23,7 @@
#include "io_uring.h"
#define WORKER_IDLE_TIMEOUT (5 * HZ)
#define WORKER_INIT_LIMIT 3
enum {
IO_WORKER_F_UP = 0, /* up and active */
......@@ -58,6 +59,7 @@ struct io_worker {
unsigned long create_state;
struct callback_head create_work;
int init_retries;
union {
struct rcu_head rcu;
......@@ -159,7 +161,7 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound)
static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq,
struct io_wq_work *work)
{
return io_get_acct(wq, !(work->flags & IO_WQ_WORK_UNBOUND));
return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND));
}
static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker)
......@@ -451,7 +453,7 @@ static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker)
static inline unsigned int io_get_work_hash(struct io_wq_work *work)
{
return work->flags >> IO_WQ_HASH_SHIFT;
return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT;
}
static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash)
......@@ -592,8 +594,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
next_hashed = wq_next_work(work);
if (unlikely(do_kill) && (work->flags & IO_WQ_WORK_UNBOUND))
work->flags |= IO_WQ_WORK_CANCEL;
if (do_kill &&
(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND))
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
io_assign_current_work(worker, NULL);
......@@ -744,7 +747,7 @@ static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
return true;
}
static inline bool io_should_retry_thread(long err)
static inline bool io_should_retry_thread(struct io_worker *worker, long err)
{
/*
* Prevent perpetual task_work retry, if the task (or its group) is
......@@ -752,6 +755,8 @@ static inline bool io_should_retry_thread(long err)
*/
if (fatal_signal_pending(current))
return false;
if (worker->init_retries++ >= WORKER_INIT_LIMIT)
return false;
switch (err) {
case -EAGAIN:
......@@ -778,7 +783,7 @@ static void create_worker_cont(struct callback_head *cb)
io_init_new_worker(wq, worker, tsk);
io_worker_release(worker);
return;
} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
struct io_wq_acct *acct = io_wq_get_acct(worker);
atomic_dec(&acct->nr_running);
......@@ -845,7 +850,7 @@ static bool create_io_worker(struct io_wq *wq, int index)
tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE);
if (!IS_ERR(tsk)) {
io_init_new_worker(wq, worker, tsk);
} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
} else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) {
kfree(worker);
goto fail;
} else {
......@@ -891,7 +896,7 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data)
static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
{
do {
work->flags |= IO_WQ_WORK_CANCEL;
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
work = wq->free_work(work);
} while (work);
......@@ -926,7 +931,7 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data)
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
{
struct io_wq_acct *acct = io_work_get_acct(wq, work);
unsigned long work_flags = work->flags;
unsigned int work_flags = atomic_read(&work->flags);
struct io_cb_cancel_data match = {
.fn = io_wq_work_match_item,
.data = work,
......@@ -939,7 +944,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work)
* been marked as one that should not get executed, cancel it here.
*/
if (test_bit(IO_WQ_BIT_EXIT, &wq->state) ||
(work->flags & IO_WQ_WORK_CANCEL)) {
(work_flags & IO_WQ_WORK_CANCEL)) {
io_run_cancel(work, wq);
return;
}
......@@ -982,7 +987,7 @@ void io_wq_hash_work(struct io_wq_work *work, void *val)
unsigned int bit;
bit = hash_ptr(val, IO_WQ_HASH_ORDER);
work->flags |= (IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT));
atomic_or(IO_WQ_WORK_HASHED | (bit << IO_WQ_HASH_SHIFT), &work->flags);
}
static bool __io_wq_worker_cancel(struct io_worker *worker,
......@@ -990,7 +995,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
struct io_wq_work *work)
{
if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL;
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
__set_notify_signal(worker->task);
return true;
}
......
......@@ -56,7 +56,7 @@ bool io_wq_worker_stopped(void);
static inline bool io_wq_is_hashed(struct io_wq_work *work)
{
return work->flags & IO_WQ_WORK_HASHED;
return atomic_read(&work->flags) & IO_WQ_WORK_HASHED;
}
typedef bool (work_cancel_fn)(struct io_wq_work *, void *);
......
......@@ -95,12 +95,14 @@
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"
#include "eventfd.h"
#define IORING_MAX_ENTRIES 32768
#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES)
......@@ -314,6 +316,9 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
sizeof(struct io_async_rw));
ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct uring_cache));
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
sizeof(struct io_kiocb));
ret |= io_futex_cache_init(ctx);
if (ret)
goto err;
......@@ -350,6 +355,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
kfree(ctx->cancel_table.hbs);
kfree(ctx->cancel_table_locked.hbs);
......@@ -461,9 +467,9 @@ static void io_prep_async_work(struct io_kiocb *req)
}
req->work.list.next = NULL;
req->work.flags = 0;
atomic_set(&req->work.flags, 0);
if (req->flags & REQ_F_FORCE_ASYNC)
req->work.flags |= IO_WQ_WORK_CONCURRENT;
atomic_or(IO_WQ_WORK_CONCURRENT, &req->work.flags);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(req->file);
......@@ -479,7 +485,7 @@ static void io_prep_async_work(struct io_kiocb *req)
io_wq_hash_work(&req->work, file_inode(req->file));
} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
atomic_or(IO_WQ_WORK_UNBOUND, &req->work.flags);
}
}
......@@ -519,7 +525,7 @@ static void io_queue_iowq(struct io_kiocb *req)
* worker for it).
*/
if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
req->work.flags |= IO_WQ_WORK_CANCEL;
atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
io_wq_enqueue(tctx->io_wq, &req->work);
......@@ -541,84 +547,6 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
}
}
void io_eventfd_ops(struct rcu_head *rcu)
{
struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu);
int ops = atomic_xchg(&ev_fd->ops, 0);
if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
/* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
* ordering in a race but if references are 0 we know we have to free
* it regardless.
*/
if (atomic_dec_and_test(&ev_fd->refs)) {
eventfd_ctx_put(ev_fd->cq_ev_fd);
kfree(ev_fd);
}
}
static void io_eventfd_signal(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd = NULL;
rcu_read_lock();
/*
* rcu_dereference ctx->io_ev_fd once and use it for both for checking
* and eventfd_signal
*/
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists incase an io_eventfd_unregister call
* completed between the NULL check of ctx->io_ev_fd at the start of
* the function and rcu_read_lock.
*/
if (unlikely(!ev_fd))
goto out;
if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
goto out;
if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out;
if (likely(eventfd_signal_allowed())) {
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
} else {
atomic_inc(&ev_fd->refs);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
call_rcu_hurry(&ev_fd->rcu, io_eventfd_ops);
else
atomic_dec(&ev_fd->refs);
}
out:
rcu_read_unlock();
}
static void io_eventfd_flush_signal(struct io_ring_ctx *ctx)
{
bool skip;
spin_lock(&ctx->completion_lock);
/*
* Eventfd should only get triggered when at least one event has been
* posted. Some applications rely on the eventfd notification count
* only changing IFF a new CQE has been added to the CQ ring. There's
* no depedency on 1:1 relationship between how many times this
* function is called (and hence the eventfd count) and number of CQEs
* posted to the CQ ring.
*/
skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail;
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
if (skip)
return;
io_eventfd_signal(ctx);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{
if (ctx->poll_activated)
......@@ -878,19 +806,42 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
return false;
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = io_fill_cqe_aux(ctx, user_data, res, cflags);
if (!filled)
filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
return filled;
}
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
bool filled;
io_cq_lock(ctx);
filled = __io_post_aux_cqe(ctx, user_data, res, cflags);
io_cq_unlock_post(ctx);
return filled;
}
/*
* Must be called from inline task_work so we know a flush will happen later,
* and obviously with ctx->uring_lock held (tw always has that).
*/
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{
if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) {
spin_lock(&ctx->completion_lock);
io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0);
spin_unlock(&ctx->completion_lock);
}
ctx->submit_state.cq_flush = true;
}
/*
* A helper for multishot requests posting additional CQEs.
* Should only be used from a task_work including IO_URING_F_MULTISHOT.
......@@ -1175,9 +1126,10 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
static inline void io_req_local_work_add(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
unsigned nr_wait, nr_tw, nr_tw_prev;
struct llist_node *head;
......@@ -1191,6 +1143,8 @@ static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK))
flags &= ~IOU_F_TWQ_LAZY_WAKE;
guard(rcu)();
head = READ_ONCE(ctx->work_llist.first);
do {
nr_tw_prev = 0;
......@@ -1272,13 +1226,18 @@ static void io_req_normal_work_add(struct io_kiocb *req)
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{
if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
rcu_read_lock();
io_req_local_work_add(req, flags);
rcu_read_unlock();
} else {
if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_req_local_work_add(req, req->ctx, flags);
else
io_req_normal_work_add(req);
}
}
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags)
{
if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)))
return;
io_req_local_work_add(req, ctx, flags);
}
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
......@@ -1467,7 +1426,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
}
__io_cq_unlock_post(ctx);
if (!wq_list_empty(&ctx->submit_state.compl_reqs)) {
if (!wq_list_empty(&state->compl_reqs)) {
io_free_batch_list(ctx, state->compl_reqs.first);
INIT_WQ_LIST(&state->compl_reqs);
}
......@@ -1813,14 +1772,14 @@ void io_wq_submit_work(struct io_wq_work *work)
io_arm_ltimeout(req);
/* either cancelled or io-wq is dying, so don't touch tctx->iowq */
if (work->flags & IO_WQ_WORK_CANCEL) {
if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) {
fail:
io_req_task_queue_fail(req, err);
return;
}
if (!io_assign_file(req, def, issue_flags)) {
err = -EBADF;
work->flags |= IO_WQ_WORK_CANCEL;
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
goto fail;
}
......@@ -2649,6 +2608,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
mutex_unlock(&ctx->uring_lock);
......
......@@ -65,6 +65,7 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
void io_req_defer_failed(struct io_kiocb *req, s32 res);
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags);
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
......@@ -73,6 +74,8 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned issue_flags);
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags);
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags);
bool io_alloc_async_data(struct io_kiocb *req);
void io_req_task_queue(struct io_kiocb *req);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
......@@ -104,12 +107,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all);
enum {
IO_EVENTFD_OP_SIGNAL_BIT,
IO_EVENTFD_OP_FREE_BIT,
};
void io_eventfd_ops(struct rcu_head *rcu);
void io_activate_pollwq(struct io_ring_ctx *ctx);
static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
......
......@@ -11,9 +11,9 @@
#include "io_uring.h"
#include "rsrc.h"
#include "filetable.h"
#include "alloc_cache.h"
#include "msg_ring.h"
/* All valid masks for MSG_RING */
#define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \
IORING_MSG_RING_FLAGS_PASS)
......@@ -68,59 +68,70 @@ void io_msg_ring_cleanup(struct io_kiocb *req)
static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx)
{
if (!target_ctx->task_complete)
return false;
return current != target_ctx->submitter_task;
return target_ctx->task_complete;
}
static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func)
static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
{
struct io_ring_ctx *ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct task_struct *task = READ_ONCE(ctx->submitter_task);
struct io_ring_ctx *ctx = req->ctx;
if (unlikely(!task))
return -EOWNERDEAD;
io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags);
if (spin_trylock(&ctx->msg_lock)) {
if (io_alloc_cache_put(&ctx->msg_cache, req))
req = NULL;
spin_unlock(&ctx->msg_lock);
}
if (req)
kmem_cache_free(req_cachep, req);
percpu_ref_put(&ctx->refs);
}
init_task_work(&msg->tw, func);
if (task_work_add(task, &msg->tw, TWA_SIGNAL))
static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
int res, u32 cflags, u64 user_data)
{
req->task = READ_ONCE(ctx->submitter_task);
if (!req->task) {
kmem_cache_free(req_cachep, req);
return -EOWNERDEAD;
}
req->cqe.user_data = user_data;
io_req_set_res(req, res, cflags);
percpu_ref_get(&ctx->refs);
req->ctx = ctx;
req->io_task_work.func = io_msg_tw_complete;
io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE);
return 0;
}
return IOU_ISSUE_SKIP_COMPLETE;
static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx)
{
struct io_kiocb *req = NULL;
if (spin_trylock(&ctx->msg_lock)) {
req = io_alloc_cache_get(&ctx->msg_cache);
spin_unlock(&ctx->msg_lock);
}
if (req)
return req;
return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN);
}
static void io_msg_tw_complete(struct callback_head *head)
static int io_msg_data_remote(struct io_kiocb *req)
{
struct io_msg *msg = container_of(head, struct io_msg, tw);
struct io_kiocb *req = cmd_to_io_kiocb(msg);
struct io_ring_ctx *target_ctx = req->file->private_data;
int ret = 0;
if (current->flags & PF_EXITING) {
ret = -EOWNERDEAD;
} else {
u32 flags = 0;
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
/*
* If the target ring is using IOPOLL mode, then we need to be
* holding the uring_lock for posting completions. Other ring
* types rely on the regular completion locking, which is
* handled while posting.
*/
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_lock(&target_ctx->uring_lock);
if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags))
ret = -EOVERFLOW;
if (target_ctx->flags & IORING_SETUP_IOPOLL)
mutex_unlock(&target_ctx->uring_lock);
}
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct io_kiocb *target;
u32 flags = 0;
if (ret < 0)
req_set_fail(req);
io_req_queue_tw_complete(req, ret);
target = io_msg_get_kiocb(req->ctx);
if (unlikely(!target))
return -ENOMEM;
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
return io_msg_remote_post(target_ctx, target, msg->len, flags,
msg->user_data);
}
static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
......@@ -138,7 +149,7 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags)
return -EBADFD;
if (io_msg_need_remote(target_ctx))
return io_msg_exec_remote(req, io_msg_tw_complete);
return io_msg_data_remote(req);
if (msg->flags & IORING_MSG_RING_FLAGS_PASS)
flags = msg->cqe_flags;
......@@ -218,6 +229,22 @@ static void io_msg_tw_fd_complete(struct callback_head *head)
io_req_queue_tw_complete(req, ret);
}
static int io_msg_fd_remote(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->file->private_data;
struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg);
struct task_struct *task = READ_ONCE(ctx->submitter_task);
if (unlikely(!task))
return -EOWNERDEAD;
init_task_work(&msg->tw, io_msg_tw_fd_complete);
if (task_work_add(task, &msg->tw, TWA_SIGNAL))
return -EOWNERDEAD;
return IOU_ISSUE_SKIP_COMPLETE;
}
static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ring_ctx *target_ctx = req->file->private_data;
......@@ -240,7 +267,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags)
}
if (io_msg_need_remote(target_ctx))
return io_msg_exec_remote(req, io_msg_tw_fd_complete);
return io_msg_fd_remote(req);
return io_msg_install_complete(req, issue_flags);
}
......@@ -294,3 +321,10 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags)
io_req_set_res(req, ret, 0);
return IOU_OK;
}
void io_msg_cache_free(const void *entry)
{
struct io_kiocb *req = (struct io_kiocb *) entry;
kmem_cache_free(req_cachep, req);
}
......@@ -3,3 +3,4 @@
int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags);
void io_msg_ring_cleanup(struct io_kiocb *req);
void io_msg_cache_free(const void *entry);
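
The rework above is internal (target-ring CQEs are now posted via cached
io_kiocb requests and remote task_work), so the opcode is driven from userspace
exactly as before. A hedged sketch of sending a data message to another ring
with the standard liburing helper, assuming both rings are already set up:

#include <liburing.h>
#include <errno.h>

/* Post a CQE with res 'value' and user_data 'cookie' on the ring behind target_ring_fd */
static int send_ring_msg(struct io_uring *ring, int target_ring_fd,
			 unsigned int value, __u64 cookie)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EAGAIN;
	io_uring_prep_msg_ring(sqe, target_ring_fd, value, cookie, 0);
	io_uring_submit(ring);
	if (io_uring_wait_cqe(ring, &cqe))
		return -1;
	ret = cqe->res;			/* 0 on success for the sending side */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}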
......@@ -283,7 +283,7 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
s64 poll_to_ns = timespec64_to_ns(ts);
if (poll_to_ns > 0) {
u64 val = poll_to_ns + 999;
do_div(val, (s64) 1000);
do_div(val, 1000);
poll_to = val;
}
}
......
......@@ -51,6 +51,16 @@ struct io_connect {
bool seen_econnaborted;
};
struct io_bind {
struct file *file;
int addr_len;
};
struct io_listen {
struct file *file;
int backlog;
};
struct io_sr_msg {
struct file *file;
union {
......@@ -817,20 +827,20 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
bool mshot_finished, unsigned issue_flags)
{
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
unsigned int cflags;
if (sr->flags & IORING_RECVSEND_BUNDLE)
cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
issue_flags);
else
cflags = io_put_kbuf(req, issue_flags);
unsigned int cflags = 0;
if (kmsg->msg.msg_inq > 0)
cflags |= IORING_CQE_F_SOCK_NONEMPTY;
/* bundle with no more immediate buffers, we're done */
if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY)
goto finish;
if (sr->flags & IORING_RECVSEND_BUNDLE) {
cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
issue_flags);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
goto finish;
} else {
cflags |= io_put_kbuf(req, issue_flags);
}
/*
* Fill CQE for this receive and see if we should keep trying to
......@@ -1717,6 +1727,70 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
return IOU_OK;
}
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct sockaddr __user *uaddr;
struct io_async_msghdr *io;
if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in)
return -EINVAL;
uaddr = u64_to_user_ptr(READ_ONCE(sqe->addr));
bind->addr_len = READ_ONCE(sqe->addr2);
io = io_msg_alloc_async(req);
if (unlikely(!io))
return -ENOMEM;
return move_addr_to_kernel(uaddr, bind->addr_len, &io->addr);
}
int io_bind(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_bind *bind = io_kiocb_to_cmd(req, struct io_bind);
struct io_async_msghdr *io = req->async_data;
struct socket *sock;
int ret;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
ret = __sys_bind_socket(sock, &io->addr, bind->addr_len);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return 0;
}
int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
if (sqe->addr || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in || sqe->addr2)
return -EINVAL;
listen->backlog = READ_ONCE(sqe->len);
return 0;
}
int io_listen(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_listen *listen = io_kiocb_to_cmd(req, struct io_listen);
struct socket *sock;
int ret;
sock = sock_from_file(req->file);
if (unlikely(!sock))
return -ENOTSOCK;
ret = __sys_listen_socket(sock, listen->backlog);
if (ret < 0)
req_set_fail(req);
io_req_set_res(req, ret, 0);
return 0;
}
void io_netmsg_cache_free(const void *entry)
{
struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry;
......
......@@ -49,6 +49,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags);
int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
void io_send_zc_cleanup(struct io_kiocb *req);
int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_bind(struct io_kiocb *req, unsigned int issue_flags);
int io_listen_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_listen(struct io_kiocb *req, unsigned int issue_flags);
void io_netmsg_cache_free(const void *entry);
#else
static inline void io_netmsg_cache_free(const void *entry)
......
......@@ -495,6 +495,26 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_ftruncate_prep,
.issue = io_ftruncate,
},
[IORING_OP_BIND] = {
#if defined(CONFIG_NET)
.needs_file = 1,
.prep = io_bind_prep,
.issue = io_bind,
.async_size = sizeof(struct io_async_msghdr),
#else
.prep = io_eopnotsupp_prep,
#endif
},
[IORING_OP_LISTEN] = {
#if defined(CONFIG_NET)
.needs_file = 1,
.prep = io_listen_prep,
.issue = io_listen,
.async_size = sizeof(struct io_async_msghdr),
#else
.prep = io_eopnotsupp_prep,
#endif
},
};
const struct io_cold_def io_cold_defs[] = {
......@@ -716,6 +736,12 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FTRUNCATE] = {
.name = "FTRUNCATE",
},
[IORING_OP_BIND] = {
.name = "BIND",
},
[IORING_OP_LISTEN] = {
.name = "LISTEN",
},
};
const char *io_uring_get_opcode(u8 opcode)
......@@ -725,6 +751,14 @@ const char *io_uring_get_opcode(u8 opcode)
return "INVALID";
}
bool io_uring_op_supported(u8 opcode)
{
if (opcode < IORING_OP_LAST &&
io_issue_defs[opcode].prep != io_eopnotsupp_prep)
return true;
return false;
}
void __init io_uring_optable_init(void)
{
int i;
......
......@@ -17,8 +17,6 @@ struct io_issue_def {
unsigned poll_exclusive : 1;
/* op supports buffer selection */
unsigned buffer_select : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* skip auditing */
unsigned audit_skip : 1;
/* supports ioprio */
......@@ -47,5 +45,7 @@ struct io_cold_def {
extern const struct io_issue_def io_issue_defs[];
extern const struct io_cold_def io_cold_defs[];
bool io_uring_op_supported(u8 opcode);
void io_uring_optable_init(void);
#endif
......@@ -27,65 +27,11 @@
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
static int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned int eventfd_async)
{
struct io_ev_fd *ev_fd;
__s32 __user *fds = arg;
int fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd)
return -EBUSY;
if (copy_from_user(&fd, fds, sizeof(*fds)))
return -EFAULT;
ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL);
if (!ev_fd)
return -ENOMEM;
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd);
return ret;
}
spin_lock(&ctx->completion_lock);
ctx->evfd_last_cq_tail = ctx->cached_cq_tail;
spin_unlock(&ctx->completion_lock);
ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true;
rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
atomic_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0);
return 0;
}
int io_eventfd_unregister(struct io_ring_ctx *ctx)
{
struct io_ev_fd *ev_fd;
ev_fd = rcu_dereference_protected(ctx->io_ev_fd,
lockdep_is_held(&ctx->uring_lock));
if (ev_fd) {
ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_FREE_BIT), &ev_fd->ops))
call_rcu(&ev_fd->rcu, io_eventfd_ops);
return 0;
}
return -ENXIO;
}
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
......@@ -93,9 +39,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
size_t size;
int i, ret;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
......@@ -108,12 +55,10 @@ static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
goto out;
p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (!io_issue_defs[i].not_supported)
if (io_uring_op_supported(i))
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
......
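
With the probe fixes above, opcode availability is now reported straight from
io_issue_defs[] via io_uring_op_supported(). A hedged userspace check for the
new opcodes through the same probe interface (standard liburing helpers;
IORING_OP_BIND/IORING_OP_LISTEN need 6.11-era uapi headers):

#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring_probe *probe = io_uring_get_probe();

	if (!probe)
		return 1;
	printf("IORING_OP_BIND:   %s\n",
	       io_uring_opcode_supported(probe, IORING_OP_BIND) ? "yes" : "no");
	printf("IORING_OP_LISTEN: %s\n",
	       io_uring_opcode_supported(probe, IORING_OP_LISTEN) ? "yes" : "no");
	io_uring_free_probe(probe);
	return 0;
}
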
......@@ -85,31 +85,6 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst,
void __user *arg, unsigned index)
{
struct iovec __user *src;
#ifdef CONFIG_COMPAT
if (ctx->compat) {
struct compat_iovec __user *ciovs;
struct compat_iovec ciov;
ciovs = (struct compat_iovec __user *) arg;
if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov)))
return -EFAULT;
dst->iov_base = u64_to_user_ptr((u64)ciov.iov_base);
dst->iov_len = ciov.iov_len;
return 0;
}
#endif
src = (struct iovec __user *) arg;
if (copy_from_user(dst, &src[index], sizeof(*dst)))
return -EFAULT;
return 0;
}
static int io_buffer_validate(struct iovec *iov)
{
unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
......@@ -249,7 +224,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
ret = io_run_task_work_sig(ctx);
if (ret < 0) {
__set_current_state(TASK_RUNNING);
finish_wait(&ctx->rsrc_quiesce_wq, &we);
mutex_lock(&ctx->uring_lock);
if (list_empty(&ctx->rsrc_ref_list))
ret = 0;
......@@ -257,7 +232,6 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
}
schedule();
__set_current_state(TASK_RUNNING);
mutex_lock(&ctx->uring_lock);
ret = 0;
} while (!list_empty(&ctx->rsrc_ref_list));
......@@ -420,8 +394,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up,
unsigned int nr_args)
{
struct iovec __user *uvec = u64_to_user_ptr(up->data);
u64 __user *tags = u64_to_user_ptr(up->tags);
struct iovec iov, __user *iovs = u64_to_user_ptr(up->data);
struct iovec fast_iov, *iov;
struct page *last_hpage = NULL;
__u32 done;
int i, err;
......@@ -435,21 +410,23 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
struct io_mapped_ubuf *imu;
u64 tag = 0;
err = io_copy_iov(ctx, &iov, iovs, done);
if (err)
iov = iovec_from_user(&uvec[done], 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
err = PTR_ERR(iov);
break;
}
if (tags && copy_from_user(&tag, &tags[done], sizeof(tag))) {
err = -EFAULT;
break;
}
err = io_buffer_validate(&iov);
err = io_buffer_validate(iov);
if (err)
break;
if (!iov.iov_base && tag) {
if (!iov->iov_base && tag) {
err = -EINVAL;
break;
}
err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
if (err)
break;
......@@ -971,8 +948,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
{
struct page *last_hpage = NULL;
struct io_rsrc_data *data;
struct iovec fast_iov, *iov = &fast_iov;
const struct iovec __user *uvec = (struct iovec * __user) arg;
int i, ret;
struct iovec iov;
BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
......@@ -989,24 +967,27 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
if (!arg)
memset(iov, 0, sizeof(*iov));
for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
if (arg) {
ret = io_copy_iov(ctx, &iov, arg, i);
if (ret)
iov = iovec_from_user(&uvec[i], 1, 1, &fast_iov, ctx->compat);
if (IS_ERR(iov)) {
ret = PTR_ERR(iov);
break;
ret = io_buffer_validate(&iov);
}
ret = io_buffer_validate(iov);
if (ret)
break;
} else {
memset(&iov, 0, sizeof(iov));
}
if (!iov.iov_base && *io_get_tag_slot(data, i)) {
if (!iov->iov_base && *io_get_tag_slot(data, i)) {
ret = -EINVAL;
break;
}
ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
&last_hpage);
if (ret)
break;
......
......@@ -2600,6 +2600,14 @@ static void do_freezer_trap(void)
spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen();
schedule();
/*
* We could've been woken by task_work, run it to clear
* TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
*/
clear_notify_signal();
if (unlikely(task_work_pending(current)))
task_work_run();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
......
......@@ -1822,6 +1822,20 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol,
return __sys_socketpair(family, type, protocol, usockvec);
}
int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address,
int addrlen)
{
int err;
err = security_socket_bind(sock, (struct sockaddr *)address,
addrlen);
if (!err)
err = READ_ONCE(sock->ops)->bind(sock,
(struct sockaddr *)address,
addrlen);
return err;
}
/*
* Bind a name to a socket. Nothing much to do here since it's
* the protocol's responsibility to handle the local address.
......@@ -1839,15 +1853,8 @@ int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
err = move_addr_to_kernel(umyaddr, addrlen, &address);
if (!err) {
err = security_socket_bind(sock,
(struct sockaddr *)&address,
addrlen);
if (!err)
err = READ_ONCE(sock->ops)->bind(sock,
(struct sockaddr *)
&address, addrlen);
}
if (!err)
err = __sys_bind_socket(sock, &address, addrlen);
fput_light(sock->file, fput_needed);
}
return err;
......@@ -1863,23 +1870,28 @@ SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
* necessary for a listen, and if that works, we mark the socket as
* ready for listening.
*/
int __sys_listen_socket(struct socket *sock, int backlog)
{
int somaxconn, err;
somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
if ((unsigned int)backlog > somaxconn)
backlog = somaxconn;
err = security_socket_listen(sock, backlog);
if (!err)
err = READ_ONCE(sock->ops)->listen(sock, backlog);
return err;
}
int __sys_listen(int fd, int backlog)
{
struct socket *sock;
int err, fput_needed;
int somaxconn;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (sock) {
somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
if ((unsigned int)backlog > somaxconn)
backlog = somaxconn;
err = security_socket_listen(sock, backlog);
if (!err)
err = READ_ONCE(sock->ops)->listen(sock, backlog);
err = __sys_listen_socket(sock, backlog);
fput_light(sock->file, fput_needed);
}
return err;
......