Commit 896f8d23 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Support for various new opcodes (fallocate, openat, close, statx,
   fadvise, madvise, openat2, non-vectored read/write, send/recv, and
   epoll_ctl)

 - Faster ring quiesce for fileset updates

 - Optimizations for overflow condition checking

 - Support for max-sized clamping

 - Support for probing what opcodes are supported

 - Support for io-wq backend sharing between "sibling" rings

 - Support for registering personalities

 - Lots of little fixes and improvements

* tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: add support for epoll_ctl(2)
  eventpoll: support non-blocking do_epoll_ctl() calls
  eventpoll: abstract out epoll_ctl() handler
  io_uring: fix linked command file table usage
  io_uring: support using a registered personality for commands
  io_uring: allow registering credentials
  io_uring: add io-wq workqueue sharing
  io-wq: allow grabbing existing io-wq
  io_uring/io-wq: don't use static creds/mm assignments
  io-wq: make the io_wq ref counted
  io_uring: fix refcounting with batched allocations at OOM
  io_uring: add comment for drain_next
  io_uring: don't attempt to copy iovec for READ/WRITE
  io_uring: honor IOSQE_ASYNC for linked reqs
  io_uring: prep req when do IOSQE_ASYNC
  io_uring: use labeled array init in io_op_defs
  io_uring: optimise sqe-to-req flags translation
  io_uring: remove REQ_F_IO_DRAINED
  io_uring: file switch work needs to get flushed on exit
  io_uring: hide uring_fd in ctx
  ...
parents 33c84e89 3e4827b0
......@@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
return;
init_task_work(&twcb->twork, binder_do_fd_close);
__close_fd_get_file(fd, &twcb->file);
if (twcb->file)
if (twcb->file) {
filp_close(twcb->file, current->files);
task_work_add(current, &twcb->twork, true);
else
} else {
kfree(twcb);
}
}
static void binder_transaction_buffer_release(struct binder_proc *proc,
......
......@@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
return container_of(p, struct ep_pqueue, pt)->epi;
}
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
/* Initialize the poll safe wake up structure */
static void ep_nested_calls_init(struct nested_calls *ncalls)
{
......@@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
return do_epoll_create(0);
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
bool nonblock)
{
if (!nonblock) {
mutex_lock_nested(mutex, depth);
return 0;
}
if (mutex_trylock(mutex))
return 0;
return -EAGAIN;
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock)
{
int error;
int full_check = 0;
struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct epoll_event epds;
struct eventpoll *tep = NULL;
error = -EFAULT;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
error = -EBADF;
f = fdget(epfd);
if (!f.file)
......@@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
ep_take_care_of_epollwakeup(&epds);
ep_take_care_of_epollwakeup(epds);
/*
* We have to check that the file structure underneath the file descriptor
......@@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
......@@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* deep wakeup paths from forming in parallel through multiple
* EPOLL_CTL_ADD operations.
*/
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD) {
if (!list_empty(&f.file->f_ep_links) ||
is_file_epoll(tf.file)) {
full_check = 1;
mutex_unlock(&ep->mtx);
mutex_lock(&epmutex);
error = epoll_mutex_lock(&epmutex, 0, nonblock);
if (error)
goto error_tgt_fput;
full_check = 1;
if (is_file_epoll(tf.file)) {
error = -ELOOP;
if (ep_loop_check(ep, tf.file) != 0) {
......@@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
list_add(&tf.file->f_tfile_llink,
&tfile_check_list);
mutex_lock_nested(&ep->mtx, 0);
error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
if (error) {
out_del:
list_del(&tf.file->f_tfile_llink);
goto error_tgt_fput;
}
if (is_file_epoll(tf.file)) {
tep = tf.file->private_data;
mutex_lock_nested(&tep->mtx, 1);
error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
if (error) {
mutex_unlock(&ep->mtx);
goto out_del;
}
}
}
}
......@@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, &epds, tf.file, fd, full_check);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_insert(ep, epds, tf.file, fd, full_check);
} else
error = -EEXIST;
if (full_check)
......@@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
case EPOLL_CTL_MOD:
if (epi) {
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds.events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, &epds);
epds->events |= EPOLLERR | EPOLLHUP;
error = ep_modify(ep, epi, epds);
}
} else
error = -ENOENT;
......@@ -2231,6 +2239,23 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
return error;
}
/*
* The following function implements the controller interface for
* the eventpoll file that enables the insertion/removal/change of
* file descriptors inside the interest set.
*/
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct epoll_event epds;
if (ep_op_has_event(op) &&
copy_from_user(&epds, event, sizeof(struct epoll_event)))
return -EFAULT;
return do_epoll_ctl(epfd, op, fd, &epds, false);
}
/*
* Implement the event wait interface for the eventpoll file. It is the kernel
* part of the user space epoll_wait(2).
......
......@@ -642,7 +642,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
/*
* variant of __close_fd that gets a ref on the file for later fput
* variant of __close_fd that gets a ref on the file for later fput.
* The caller must ensure that filp_close() called on the file, and then
* an fput().
*/
int __close_fd_get_file(unsigned int fd, struct file **res)
{
......@@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
spin_unlock(&files->file_lock);
get_file(file);
*res = file;
return filp_close(file, files);
return 0;
out_unlock:
spin_unlock(&files->file_lock);
......
......@@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
long do_faccessat(int dfd, const char __user *filename, int mode);
......@@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;
/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
/*
* fs/stat.c:
*/
unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
int cp_statx(const struct kstat *stat, struct statx __user *buffer);
......@@ -56,7 +56,8 @@ struct io_worker {
struct rcu_head rcu;
struct mm_struct *mm;
const struct cred *creds;
const struct cred *cur_creds;
const struct cred *saved_creds;
struct files_struct *restore_files;
};
......@@ -109,10 +110,10 @@ struct io_wq {
struct task_struct *manager;
struct user_struct *user;
const struct cred *creds;
struct mm_struct *mm;
refcount_t refs;
struct completion done;
refcount_t use_refs;
};
static bool io_worker_get(struct io_worker *worker)
......@@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
{
bool dropped_lock = false;
if (worker->creds) {
revert_creds(worker->creds);
worker->creds = NULL;
if (worker->saved_creds) {
revert_creds(worker->saved_creds);
worker->cur_creds = worker->saved_creds = NULL;
}
if (current->files != worker->restore_files) {
......@@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
return NULL;
}
static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
{
if (worker->mm) {
unuse_mm(worker->mm);
mmput(worker->mm);
worker->mm = NULL;
}
if (!work->mm) {
set_fs(KERNEL_DS);
return;
}
if (mmget_not_zero(work->mm)) {
use_mm(work->mm);
if (!worker->mm)
set_fs(USER_DS);
worker->mm = work->mm;
/* hang on to this mm */
work->mm = NULL;
return;
}
/* failed grabbing mm, ensure work gets cancelled */
work->flags |= IO_WQ_WORK_CANCEL;
}
static void io_wq_switch_creds(struct io_worker *worker,
struct io_wq_work *work)
{
const struct cred *old_creds = override_creds(work->creds);
worker->cur_creds = work->creds;
if (worker->saved_creds)
put_cred(old_creds); /* creds set by previous switch */
else
worker->saved_creds = old_creds;
}
static void io_worker_handle_work(struct io_worker *worker)
__releases(wqe->lock)
{
......@@ -438,24 +476,19 @@ static void io_worker_handle_work(struct io_worker *worker)
if (work->flags & IO_WQ_WORK_CB)
work->func(&work);
if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
current->files != work->files) {
if (work->files && current->files != work->files) {
task_lock(current);
current->files = work->files;
task_unlock(current);
}
if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
wq->mm) {
if (mmget_not_zero(wq->mm)) {
use_mm(wq->mm);
set_fs(USER_DS);
worker->mm = wq->mm;
} else {
work->flags |= IO_WQ_WORK_CANCEL;
}
}
if (!worker->creds)
worker->creds = override_creds(wq->creds);
if (work->mm != worker->mm)
io_wq_switch_mm(worker, work);
if (worker->cur_creds != work->creds)
io_wq_switch_creds(worker, work);
/*
* OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
* the worker function will do the right thing.
*/
if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
work->flags |= IO_WQ_WORK_CANCEL;
if (worker->mm)
......@@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
{
struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
int work_flags;
unsigned long flags;
/*
......@@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
return;
}
work_flags = work->flags;
spin_lock_irqsave(&wqe->lock, flags);
wq_list_add_tail(&work->list, &wqe->work_list);
wqe->flags &= ~IO_WQE_FLAG_STALLED;
spin_unlock_irqrestore(&wqe->lock, flags);
if (!atomic_read(&acct->nr_running))
if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
!atomic_read(&acct->nr_running))
io_wqe_wake_worker(wqe, acct);
}
......@@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
*/
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
data->cancel(worker->cur_work, data->caller_data)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
......@@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
return false;
spin_lock_irqsave(&worker->lock, flags);
if (worker->cur_work == work) {
if (worker->cur_work == work &&
!(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
send_sig(SIGINT, worker->task, 1);
ret = true;
}
......@@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
/* caller must already hold a reference to this */
wq->user = data->user;
wq->creds = data->creds;
for_each_node(node) {
struct io_wqe *wqe;
......@@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
init_completion(&wq->done);
/* caller must have already done mmgrab() on this mm */
wq->mm = data->mm;
wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
if (!IS_ERR(wq->manager)) {
wake_up_process(wq->manager);
......@@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
ret = -ENOMEM;
goto err;
}
refcount_set(&wq->use_refs, 1);
reinit_completion(&wq->done);
return wq;
}
......@@ -1078,13 +1113,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
return ERR_PTR(ret);
}
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
{
if (data->get_work != wq->get_work || data->put_work != wq->put_work)
return false;
return refcount_inc_not_zero(&wq->use_refs);
}
static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{
wake_up_process(worker->task);
return false;
}
void io_wq_destroy(struct io_wq *wq)
static void __io_wq_destroy(struct io_wq *wq)
{
int node;
......@@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
kfree(wq->wqes);
kfree(wq);
}
void io_wq_destroy(struct io_wq *wq)
{
if (refcount_dec_and_test(&wq->use_refs))
__io_wq_destroy(wq);
}
......@@ -7,11 +7,11 @@ enum {
IO_WQ_WORK_CANCEL = 1,
IO_WQ_WORK_HAS_MM = 2,
IO_WQ_WORK_HASHED = 4,
IO_WQ_WORK_NEEDS_USER = 8,
IO_WQ_WORK_NEEDS_FILES = 16,
IO_WQ_WORK_UNBOUND = 32,
IO_WQ_WORK_INTERNAL = 64,
IO_WQ_WORK_CB = 128,
IO_WQ_WORK_NO_CANCEL = 256,
IO_WQ_WORK_CONCURRENT = 512,
IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */
};
......@@ -72,6 +72,8 @@ struct io_wq_work {
};
void (*func)(struct io_wq_work **);
struct files_struct *files;
struct mm_struct *mm;
const struct cred *creds;
unsigned flags;
};
......@@ -81,21 +83,22 @@ struct io_wq_work {
(work)->func = _func; \
(work)->flags = 0; \
(work)->files = NULL; \
(work)->mm = NULL; \
(work)->creds = NULL; \
} while (0) \
typedef void (get_work_fn)(struct io_wq_work *);
typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data {
struct mm_struct *mm;
struct user_struct *user;
const struct cred *creds;
get_work_fn *get_work;
put_work_fn *put_work;
};
struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
void io_wq_destroy(struct io_wq *wq);
void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
......
......@@ -46,6 +46,7 @@
#include <linux/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
......@@ -70,6 +71,10 @@
#include <linux/sizes.h>
#include <linux/hugetlb.h>
#include <linux/highmem.h>
#include <linux/namei.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/eventpoll.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
......@@ -177,6 +182,21 @@ struct fixed_file_table {
struct file **files;
};
enum {
FFD_F_ATOMIC,
};
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct percpu_ref refs;
struct llist_head put_llist;
unsigned long state;
struct work_struct ref_work;
struct completion done;
};
struct io_ring_ctx {
struct {
struct percpu_ref refs;
......@@ -184,10 +204,11 @@ struct io_ring_ctx {
struct {
unsigned int flags;
bool compat;
bool account_mem;
bool cq_overflow_flushed;
bool drain_next;
int compat: 1;
int account_mem: 1;
int cq_overflow_flushed: 1;
int drain_next: 1;
int eventfd_async: 1;
/*
* Ring buffer of indices into array of io_uring_sqe, which is
......@@ -207,13 +228,14 @@ struct io_ring_ctx {
unsigned sq_thread_idle;
unsigned cached_sq_dropped;
atomic_t cached_cq_overflow;
struct io_uring_sqe *sq_sqes;
unsigned long sq_check_overflow;
struct list_head defer_list;
struct list_head timeout_list;
struct list_head cq_overflow_list;
wait_queue_head_t inflight_wait;
struct io_uring_sqe *sq_sqes;
} ____cacheline_aligned_in_smp;
struct io_rings *rings;
......@@ -229,8 +251,10 @@ struct io_ring_ctx {
* readers must ensure that ->refs is alive as long as the file* is
* used. Only updated through io_uring_register(2).
*/
struct fixed_file_table *file_table;
struct fixed_file_data *file_data;
unsigned nr_user_files;
int ring_fd;
struct file *ring_file;
/* if used, fixed mapped user buffers */
unsigned nr_user_bufs;
......@@ -250,11 +274,14 @@ struct io_ring_ctx {
struct socket *ring_sock;
#endif
struct idr personality_idr;
struct {
unsigned cached_cq_tail;
unsigned cq_entries;
unsigned cq_mask;
atomic_t cq_timeouts;
unsigned long cq_check_overflow;
struct wait_queue_head cq_wait;
struct fasync_struct *cq_fasync;
struct eventfd_ctx *cq_ev_fd;
......@@ -267,7 +294,8 @@ struct io_ring_ctx {
struct {
spinlock_t completion_lock;
bool poll_multi_file;
struct llist_head poll_llist;
/*
* ->poll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
......@@ -277,6 +305,7 @@ struct io_ring_ctx {
struct list_head poll_list;
struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
bool poll_multi_file;
spinlock_t inflight_lock;
struct list_head inflight_list;
......@@ -299,6 +328,12 @@ struct io_poll_iocb {
struct wait_queue_entry wait;
};
struct io_close {
struct file *file;
struct file *put_file;
int fd;
};
struct io_timeout_data {
struct io_kiocb *req;
struct hrtimer timer;
......@@ -319,6 +354,7 @@ struct io_sync {
loff_t len;
loff_t off;
int flags;
int mode;
};
struct io_cancel {
......@@ -348,8 +384,52 @@ struct io_connect {
struct io_sr_msg {
struct file *file;
struct user_msghdr __user *msg;
union {
struct user_msghdr __user *msg;
void __user *buf;
};
int msg_flags;
size_t len;
};
struct io_open {
struct file *file;
int dfd;
union {
unsigned mask;
};
struct filename *filename;
struct statx __user *buffer;
struct open_how how;
};
struct io_files_update {
struct file *file;
u64 arg;
u32 nr_args;
u32 offset;
};
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u32 advice;
};
struct io_epoll {
struct file *file;
int epfd;
int op;
int fd;
struct epoll_event event;
};
struct io_async_connect {
......@@ -370,15 +450,79 @@ struct io_async_rw {
ssize_t size;
};
struct io_async_open {
struct filename *filename;
};
struct io_async_ctx {
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
struct io_async_open open;
};
};
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_IOPOLL_COMPLETED_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
REQ_F_MUST_PUNT_BIT,
REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
};
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* polled IO has completed */
REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* timeout request */
REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* must be punted even for NONBLOCK */
REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
/* no timeout sequence */
REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
};
/*
* NOTE! Each of the iocb union members has the file pointer
* as the first entry in their struct definition. So you can
......@@ -396,11 +540,19 @@ struct io_kiocb {
struct io_timeout timeout;
struct io_connect connect;
struct io_sr_msg sr_msg;
struct io_open open;
struct io_close close;
struct io_files_update files_update;
struct io_fadvise fadvise;
struct io_madvise madvise;
struct io_epoll epoll;
};
struct io_async_ctx *io;
struct file *ring_file;
int ring_fd;
/*
* llist_node is only used for poll deferred completions
*/
struct llist_node llist_node;
bool has_user;
bool in_async;
bool needs_fixed_file;
......@@ -414,23 +566,6 @@ struct io_kiocb {
struct list_head link_list;
unsigned int flags;
refcount_t refs;
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
u64 user_data;
u32 result;
u32 sequence;
......@@ -463,14 +598,162 @@ struct io_submit_state {
unsigned int ios_left;
};
struct io_op_def {
/* needs req->io allocated for deferral/async */
unsigned async_ctx : 1;
/* needs current->mm setup, does mm access */
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
/* needs req->file assigned IFF fd is >= 0 */
unsigned fd_non_neg : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* needs file table */
unsigned file_table : 1;
};
static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
},
[IORING_OP_SENDMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
.needs_mm = 1,
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.async_ctx = 1,
.needs_mm = 1,
},
[IORING_OP_CONNECT] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
},
[IORING_OP_OPENAT] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
.needs_mm = 1,
.file_table = 1,
},
[IORING_OP_STATX] = {
.needs_mm = 1,
.needs_file = 1,
.fd_non_neg = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
},
[IORING_OP_MADVISE] = {
.needs_mm = 1,
},
[IORING_OP_SEND] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
};
static void io_wq_submit_work(struct io_wq_work **workptr);
static void io_cqring_fill_event(struct io_kiocb *req, long res);
static void __io_free_req(struct io_kiocb *req);
static void io_put_req(struct io_kiocb *req);
static void io_double_put_req(struct io_kiocb *req);
static void __io_double_put_req(struct io_kiocb *req);
static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
static void io_queue_linked_timeout(struct io_kiocb *req);
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
static struct kmem_cache *req_cachep;
......@@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
init_completion(&ctx->completions[0]);
init_completion(&ctx->completions[1]);
idr_init(&ctx->personality_idr);
mutex_init(&ctx->uring_lock);
init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock);
init_llist_head(&ctx->poll_llist);
INIT_LIST_HEAD(&ctx->poll_list);
INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list);
......@@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req)
static inline bool req_need_defer(struct io_kiocb *req)
{
if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
if (unlikely(req->flags & REQ_F_IO_DRAIN))
return __req_need_defer(req);
return false;
......@@ -606,53 +891,53 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
/* order cqe stores with ring update */
smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
/* order cqe stores with ring update */
smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
if (wq_has_sleeper(&ctx->cq_wait)) {
wake_up_interruptible(&ctx->cq_wait);
kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
}
}
static inline void io_req_work_grab_env(struct io_kiocb *req,
const struct io_op_def *def)
{
if (!req->work.mm && def->needs_mm) {
mmgrab(current->mm);
req->work.mm = current->mm;
}
if (!req->work.creds)
req->work.creds = get_current_cred();
}
static inline bool io_req_needs_user(struct io_kiocb *req)
static inline void io_req_work_drop_env(struct io_kiocb *req)
{
return !(req->opcode == IORING_OP_READ_FIXED ||
req->opcode == IORING_OP_WRITE_FIXED);
if (req->work.mm) {
mmdrop(req->work.mm);
req->work.mm = NULL;
}
if (req->work.creds) {
put_cred(req->work.creds);
req->work.creds = NULL;
}
}
static inline bool io_prep_async_work(struct io_kiocb *req,
struct io_kiocb **link)
{
const struct io_op_def *def = &io_op_defs[req->opcode];
bool do_hashed = false;
switch (req->opcode) {
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
/* only regular files should be hashed for writes */
if (req->flags & REQ_F_ISREG)
if (req->flags & REQ_F_ISREG) {
if (def->hash_reg_file)
do_hashed = true;
/* fall-through */
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_SENDMSG:
case IORING_OP_RECVMSG:
case IORING_OP_ACCEPT:
case IORING_OP_POLL_ADD:
case IORING_OP_CONNECT:
/*
* We know REQ_F_ISREG is not set on some of these
* opcodes, but this enables us to keep the check in
* just one place.
*/
if (!(req->flags & REQ_F_ISREG))
} else {
if (def->unbound_nonreg_file)
req->work.flags |= IO_WQ_WORK_UNBOUND;
break;
}
if (io_req_needs_user(req))
req->work.flags |= IO_WQ_WORK_NEEDS_USER;
io_req_work_grab_env(req, def);
*link = io_prep_linked_timeout(req);
return do_hashed;
......@@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
__io_commit_cqring(ctx);
while ((req = io_get_deferred_req(ctx)) != NULL) {
req->flags |= REQ_F_IO_DRAINED;
while ((req = io_get_deferred_req(ctx)) != NULL)
io_queue_async_work(req);
}
}
static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
......@@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
return &rings->cqes[tail & ctx->cq_mask];
}
static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
{
if (!ctx->eventfd_async)
return true;
return io_wq_current_is_worker() || in_interrupt();
}
static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
{
if (waitqueue_active(&ctx->wait))
wake_up(&ctx->wait);
if (waitqueue_active(&ctx->sqo_wait))
wake_up(&ctx->sqo_wait);
if (ctx->cq_ev_fd)
if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
eventfd_signal(ctx->cq_ev_fd, 1);
}
......@@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
/* if force is set, the ring is going away. always drop after that */
if (force)
ctx->cq_overflow_flushed = true;
ctx->cq_overflow_flushed = 1;
cqe = NULL;
while (!list_empty(&ctx->cq_overflow_list)) {
......@@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
}
io_commit_cqring(ctx);
if (cqe) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
......@@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
} else {
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
}
refcount_inc(&req->refs);
req->result = res;
list_add_tail(&req->list, &ctx->cq_overflow_list);
......@@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;
if (!percpu_ref_tryget(&ctx->refs))
return NULL;
if (!state) {
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
......@@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
got_it:
req->io = NULL;
req->ring_file = NULL;
req->file = NULL;
req->ctx = ctx;
req->flags = 0;
......@@ -915,24 +1209,35 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
return NULL;
}
static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
static void __io_req_do_free(struct io_kiocb *req)
{
if (*nr) {
kmem_cache_free_bulk(req_cachep, *nr, reqs);
percpu_ref_put_many(&ctx->refs, *nr);
*nr = 0;
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
}
static void __io_req_aux_free(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
kfree(req->io);
if (req->file) {
if (req->flags & REQ_F_FIXED_FILE)
percpu_ref_put(&ctx->file_data->refs);
else
fput(req->file);
}
io_req_work_drop_env(req);
}
static void __io_free_req(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
__io_req_aux_free(req);
if (req->io)
kfree(req->io);
if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file);
if (req->flags & REQ_F_INFLIGHT) {
struct io_ring_ctx *ctx = req->ctx;
unsigned long flags;
spin_lock_irqsave(&ctx->inflight_lock, flags);
......@@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
}
percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req);
else
clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
percpu_ref_put(&req->ctx->refs);
__io_req_do_free(req);
}
struct req_batch {
void *reqs[IO_IOPOLL_BATCH];
int to_free;
int need_iter;
};
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
int fixed_refs = rb->to_free;
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
fixed_refs++;
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
__io_req_aux_free(req);
}
if (!inflight)
goto do_free;
spin_lock_irqsave(&ctx->inflight_lock, flags);
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_INFLIGHT) {
list_del(&req->inflight_entry);
if (!--inflight)
break;
}
}
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
if (waitqueue_active(&ctx->inflight_wait))
wake_up(&ctx->inflight_wait);
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
if (fixed_refs)
percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
}
static bool io_link_cancel_timeout(struct io_kiocb *req)
......@@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
{
struct io_rings *rings = ctx->rings;
/*
* noflush == true is from the waitqueue handler, just ensure we wake
* up the task, and the next invocation will flush the entries. We
* cannot safely to it from here.
*/
if (noflush && !list_empty(&ctx->cq_overflow_list))
return -1U;
if (test_bit(0, &ctx->cq_check_overflow)) {
/*
* noflush == true is from the waitqueue handler, just ensure
* we wake up the task, and the next invocation will flush the
* entries. We cannot safely to it from here.
*/
if (noflush && !list_empty(&ctx->cq_overflow_list))
return -1U;
io_cqring_overflow_flush(ctx, false);
io_cqring_overflow_flush(ctx, false);
}
/* See comment at the top of this file */
smp_rmb();
return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
}
static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
......@@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
}
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
return false;
if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
rb->need_iter++;
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
io_free_req_many(req->ctx, rb);
return true;
}
/*
* Find and free completed poll iocbs
*/
static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
struct list_head *done)
{
void *reqs[IO_IOPOLL_BATCH];
struct req_batch rb;
struct io_kiocb *req;
int to_free;
to_free = 0;
rb.to_free = rb.need_iter = 0;
while (!list_empty(done)) {
req = list_first_entry(done, struct io_kiocb, list);
list_del(&req->list);
......@@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
io_cqring_fill_event(req, req->result);
(*nr_events)++;
if (refcount_dec_and_test(&req->refs)) {
/* If we're not using fixed files, we have to pair the
* completion part with the file put. Use regular
* completions for those, only batch free for fixed
* file and non-linked commands.
*/
if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
!req->io) {
reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free);
} else {
io_free_req(req);
}
}
if (refcount_dec_and_test(&req->refs) &&
!io_req_multi_free(&rb, req))
io_free_req(req);
}
io_commit_cqring(ctx);
io_free_req_many(ctx, reqs, &to_free);
io_free_req_many(ctx, &rb);
}
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
......@@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
req->flags |= REQ_F_ISREG;
kiocb->ki_pos = READ_ONCE(sqe->off);
if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
req->flags |= REQ_F_CUR_POS;
kiocb->ki_pos = req->file->f_pos;
}
kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
......@@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
bool in_async)
{
struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
if (req->flags & REQ_F_CUR_POS)
req->file->f_pos = kiocb->ki_pos;
if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
*nxt = __io_complete_rw(kiocb, ret);
else
......@@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
if (req->rw.kiocb.private)
return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
ssize_t ret;
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
return ret;
}
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
......@@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
static int io_alloc_async_ctx(struct io_kiocb *req)
{
if (!io_op_defs[req->opcode].async_ctx)
return 0;
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
return req->io == NULL;
}
......@@ -1786,8 +2162,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
if (req->opcode == IORING_OP_READ_FIXED ||
req->opcode == IORING_OP_WRITE_FIXED)
if (!io_op_defs[req->opcode].async_ctx)
return 0;
if (!req->io && io_alloc_async_ctx(req))
return -ENOMEM;
......@@ -2101,35 +2476,14 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
return 0;
}
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->file)
return -EBADF;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->len);
req->sync.flags = READ_ONCE(sqe->sync_range_flags);
return 0;
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
static void io_fallocate_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
req->sync.len);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
......@@ -2138,100 +2492,537 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
return 0;
}
static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */
/* fallocate always requiring blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_sync_file_range_finish;
req->work.func = io_fallocate_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_sync_file_range_finish(&work);
io_fallocate_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
return 0;
}
#if defined(CONFIG_NET)
static void io_sendrecv_async(struct io_wq_work **workptr)
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct iovec *iov = NULL;
if (req->io->rw.iov != req->io->rw.fast_iov)
iov = req->io->msg.iov;
io_wq_submit_work(workptr);
kfree(iov);
}
#endif
const char __user *fname;
int ret;
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.dfd = READ_ONCE(sqe->fd);
req->open.how.mode = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.how.flags = READ_ONCE(sqe->open_flags);
if (!io)
return 0;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
#else
return -EOPNOTSUPP;
#endif
return 0;
}
static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
struct open_how __user *how;
const char __user *fname;
size_t len;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
struct sockaddr_storage addr;
unsigned flags;
req->open.dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (req->io) {
kmsg = &req->io->msg;
kmsg->msg.msg_name = &addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
struct io_sr_msg *sr = &req->sr_msg;
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
kmsg = &io.msg;
kmsg->msg.msg_name = &addr;
ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
len);
if (ret)
return ret;
io.msg.iov = io.msg.fast_iov;
ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
sr->msg_flags, &io.msg.iov);
if (ret)
return ret;
}
if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
return 0;
}
static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct open_flags op;
struct file *file;
int ret;
if (force_nonblock)
return -EAGAIN;
ret = build_open_flags(&req->open.how, &op);
if (ret)
goto err;
ret = get_unused_fd_flags(req->open.how.flags);
if (ret < 0)
goto err;
file = do_filp_open(req->open.dfd, req->open.filename, &op);
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
} else {
fsnotify_open(file);
fd_install(ret, file);
}
err:
putname(req->open.filename);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, nxt, force_nonblock);
}
static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
req->epoll.op = READ_ONCE(sqe->len);
req->epoll.fd = READ_ONCE(sqe->off);
if (ep_op_has_event(req->epoll.op)) {
struct epoll_event __user *ev;
ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
return -EFAULT;
}
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
int ret;
ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
if (sqe->ioprio || sqe->buf_index || sqe->off)
return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
req->madvise.len = READ_ONCE(sqe->len);
req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
int ret;
if (force_nonblock)
return -EAGAIN;
ret = do_madvise(ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->addr)
return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
req->fadvise.len = READ_ONCE(sqe->len);
req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
return 0;
}
static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
/* DONTNEED may block, others _should_ not */
if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
return -EAGAIN;
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
unsigned lookup_flags;
int ret;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.mask = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->open.how.flags = READ_ONCE(sqe->statx_flags);
if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
return -EINVAL;
req->open.filename = getname_flags(fname, lookup_flags, NULL);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
return 0;
}
static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_open *ctx = &req->open;
unsigned lookup_flags;
struct path path;
struct kstat stat;
int ret;
if (force_nonblock)
return -EAGAIN;
if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
return -EINVAL;
retry:
/* filename_lookup() drops it, keep a reference */
ctx->filename->refcnt++;
ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
NULL);
if (ret)
goto err;
ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
path_put(&path);
if (retry_estale(ret, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
if (!ret)
ret = cp_statx(&stat, ctx->buffer);
err:
putname(ctx->filename);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
/*
* If we queue this for async, it must not be cancellable. That would
* leave the 'file' in an undeterminate state.
*/
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (sqe->flags & IOSQE_FIXED_FILE)
return -EINVAL;
req->close.fd = READ_ONCE(sqe->fd);
if (req->file->f_op == &io_uring_fops ||
req->close.fd == req->ctx->ring_fd)
return -EBADF;
return 0;
}
static void io_close_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
/* Invoked with files, we need to do the close */
if (req->work.files) {
int ret;
ret = filp_close(req->close.put_file, req->work.files);
if (ret < 0) {
req_set_fail_links(req);
}
io_cqring_add_event(req, ret);
}
fput(req->close.put_file);
/* we bypassed the re-issue, drop the submission reference */
io_put_req(req);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
int ret;
req->close.put_file = NULL;
ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
if (ret < 0)
return ret;
/* if the file has a flush method, be safe and punt to async */
if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
goto eagain;
/*
* No ->flush(), safely close from here and just punt the
* fput() to async context.
*/
ret = filp_close(req->close.put_file, current->files);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
if (io_wq_current_is_worker()) {
struct io_wq_work *old_work, *work;
old_work = work = &req->work;
io_close_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
return 0;
}
eagain:
req->work.func = io_close_finish;
return -EAGAIN;
}
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->file)
return -EBADF;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->len);
req->sync.flags = READ_ONCE(sqe->sync_range_flags);
return 0;
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_sync_file_range_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
return 0;
}
#if defined(CONFIG_NET)
static void io_sendrecv_async(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct iovec *iov = NULL;
if (req->io->rw.iov != req->io->rw.fast_iov)
iov = req->io->msg.iov;
io_wq_submit_work(workptr);
kfree(iov);
}
#endif
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
if (!io || req->opcode == IORING_OP_SEND)
return 0;
io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
#else
return -EOPNOTSUPP;
#endif
}
static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_NET)
struct io_async_msghdr *kmsg = NULL;
struct socket *sock;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_async_ctx io;
struct sockaddr_storage addr;
unsigned flags;
if (req->io) {
kmsg = &req->io->msg;
kmsg->msg.msg_name = &addr;
/* if iov is set, it's allocated already */
if (!kmsg->iov)
kmsg->iov = kmsg->fast_iov;
kmsg->msg.msg_iter.iov = kmsg->iov;
} else {
struct io_sr_msg *sr = &req->sr_msg;
kmsg = &io.msg;
kmsg->msg.msg_name = &addr;
io.msg.iov = io.msg.fast_iov;
ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
sr->msg_flags, &io.msg.iov);
if (ret)
return ret;
}
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
if (force_nonblock && ret == -EAGAIN) {
......@@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif
}
static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
struct iovec iov;
unsigned flags;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
&msg.msg_iter);
if (ret)
return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &msg, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
......@@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (!io)
if (!io || req->opcode == IORING_OP_RECV)
return 0;
io->msg.iov = io->msg.fast_iov;
......@@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif
}
static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
struct iovec iov;
unsigned flags;
ret = import_single_range(READ, sr->buf, sr->len, &iov,
&msg.msg_iter);
if (ret)
return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
msg.msg_iocb = NULL;
msg.msg_flags = 0;
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
......@@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __io_accept(req, nxt, force_nonblock);
if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish;
req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
io_put_req(req);
return -EAGAIN;
}
......@@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt);
}
static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
{
struct io_kiocb *req, *tmp;
struct req_batch rb;
rb.to_free = rb.need_iter = 0;
spin_lock_irq(&ctx->completion_lock);
llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
if (refcount_dec_and_test(&req->refs) &&
!io_req_multi_free(&rb, req)) {
req->flags |= REQ_F_COMP_LOCKED;
io_free_req(req);
}
}
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
static void io_poll_flush(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct llist_node *nodes;
nodes = llist_del_all(&req->ctx->poll_llist);
if (nodes)
__io_poll_flush(req->ctx, nodes);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key)
{
......@@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key);
unsigned long flags;
/* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events))
......@@ -2656,17 +3581,31 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* If we have a link timeout we're going to need the completion_lock
* for finalizing the request, mark us as having grabbed that already.
*/
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
hash_del(&req->hash_node);
io_poll_complete(req, mask, 0);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (mask) {
unsigned long flags;
io_cqring_ev_posted(ctx);
} else {
io_queue_async_work(req);
if (llist_empty(&ctx->poll_llist) &&
spin_trylock_irqsave(&ctx->completion_lock, flags)) {
hash_del(&req->hash_node);
io_poll_complete(req, mask, 0);
req->flags |= REQ_F_COMP_LOCKED;
io_put_req(req);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
req = NULL;
} else {
req->result = mask;
req->llist_node.next = NULL;
/* if the list wasn't empty, we're done */
if (!llist_add(&req->llist_node, &ctx->poll_llist))
req = NULL;
else
req->work.func = io_poll_flush;
}
}
if (req)
io_queue_async_work(req);
return 1;
}
......@@ -3063,11 +4002,48 @@ static int io_async_cancel_prep(struct io_kiocb *req,
return 0;
}
static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
return 0;
}
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->flags || sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
req->files_update.nr_args = READ_ONCE(sqe->len);
if (!req->files_update.nr_args)
return -EINVAL;
req->files_update.arg = READ_ONCE(sqe->addr);
return 0;
}
static int io_files_update(struct io_kiocb *req, bool force_nonblock)
{
struct io_ring_ctx *ctx = req->ctx;
struct io_uring_files_update up;
int ret;
io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
if (force_nonblock)
return -EAGAIN;
up.offset = req->files_update.offset;
up.fds = req->files_update.arg;
mutex_lock(&ctx->uring_lock);
ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
return 0;
}
......@@ -3076,15 +4052,25 @@ static int io_req_defer_prep(struct io_kiocb *req,
{
ssize_t ret = 0;
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
return ret;
}
io_req_work_grab_env(req, &io_op_defs[req->opcode]);
switch (req->opcode) {
case IORING_OP_NOP:
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
ret = io_read_prep(req, sqe, true);
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
ret = io_write_prep(req, sqe, true);
break;
case IORING_OP_POLL_ADD:
......@@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req,
ret = io_prep_sfr(req, sqe);
break;
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
ret = io_sendmsg_prep(req, sqe);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
ret = io_recvmsg_prep(req, sqe);
break;
case IORING_OP_CONNECT:
......@@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_ACCEPT:
ret = io_accept_prep(req, sqe);
break;
case IORING_OP_FALLOCATE:
ret = io_fallocate_prep(req, sqe);
break;
case IORING_OP_OPENAT:
ret = io_openat_prep(req, sqe);
break;
case IORING_OP_CLOSE:
ret = io_close_prep(req, sqe);
break;
case IORING_OP_FILES_UPDATE:
ret = io_files_update_prep(req, sqe);
break;
case IORING_OP_STATX:
ret = io_statx_prep(req, sqe);
break;
case IORING_OP_FADVISE:
ret = io_fadvise_prep(req, sqe);
break;
case IORING_OP_MADVISE:
ret = io_madvise_prep(req, sqe);
break;
case IORING_OP_OPENAT2:
ret = io_openat2_prep(req, sqe);
break;
case IORING_OP_EPOLL_CTL:
ret = io_epoll_ctl_prep(req, sqe);
break;
default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode);
......@@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
case IORING_OP_READV:
case IORING_OP_READ_FIXED:
case IORING_OP_READ:
if (sqe) {
ret = io_read_prep(req, sqe, force_nonblock);
if (ret < 0)
......@@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break;
case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
if (sqe) {
ret = io_write_prep(req, sqe, force_nonblock);
if (ret < 0)
......@@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_sync_file_range(req, nxt, force_nonblock);
break;
case IORING_OP_SENDMSG:
case IORING_OP_SEND:
if (sqe) {
ret = io_sendmsg_prep(req, sqe);
if (ret < 0)
break;
}
ret = io_sendmsg(req, nxt, force_nonblock);
if (req->opcode == IORING_OP_SENDMSG)
ret = io_sendmsg(req, nxt, force_nonblock);
else
ret = io_send(req, nxt, force_nonblock);
break;
case IORING_OP_RECVMSG:
case IORING_OP_RECV:
if (sqe) {
ret = io_recvmsg_prep(req, sqe);
if (ret)
break;
}
ret = io_recvmsg(req, nxt, force_nonblock);
if (req->opcode == IORING_OP_RECVMSG)
ret = io_recvmsg(req, nxt, force_nonblock);
else
ret = io_recv(req, nxt, force_nonblock);
break;
case IORING_OP_TIMEOUT:
if (sqe) {
......@@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
}
ret = io_async_cancel(req, nxt);
break;
case IORING_OP_FALLOCATE:
if (sqe) {
ret = io_fallocate_prep(req, sqe);
if (ret)
break;
}
ret = io_fallocate(req, nxt, force_nonblock);
break;
case IORING_OP_OPENAT:
if (sqe) {
ret = io_openat_prep(req, sqe);
if (ret)
break;
}
ret = io_openat(req, nxt, force_nonblock);
break;
case IORING_OP_CLOSE:
if (sqe) {
ret = io_close_prep(req, sqe);
if (ret)
break;
}
ret = io_close(req, nxt, force_nonblock);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
ret = io_files_update_prep(req, sqe);
if (ret)
break;
}
ret = io_files_update(req, force_nonblock);
break;
case IORING_OP_STATX:
if (sqe) {
ret = io_statx_prep(req, sqe);
if (ret)
break;
}
ret = io_statx(req, nxt, force_nonblock);
break;
case IORING_OP_FADVISE:
if (sqe) {
ret = io_fadvise_prep(req, sqe);
if (ret)
break;
}
ret = io_fadvise(req, nxt, force_nonblock);
break;
case IORING_OP_MADVISE:
if (sqe) {
ret = io_madvise_prep(req, sqe);
if (ret)
break;
}
ret = io_madvise(req, nxt, force_nonblock);
break;
case IORING_OP_OPENAT2:
if (sqe) {
ret = io_openat2_prep(req, sqe);
if (ret)
break;
}
ret = io_openat2(req, nxt, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
if (sqe) {
ret = io_epoll_ctl_prep(req, sqe);
if (ret)
break;
}
ret = io_epoll_ctl(req, nxt, force_nonblock);
break;
default:
ret = -EINVAL;
break;
......@@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
struct io_kiocb *nxt = NULL;
int ret = 0;
if (work->flags & IO_WQ_WORK_CANCEL)
/* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
ret = -ECANCELED;
}
if (!ret) {
req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
......@@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt);
}
static bool io_req_op_valid(int op)
static int io_req_needs_file(struct io_kiocb *req, int fd)
{
return op >= IORING_OP_NOP && op < IORING_OP_LAST;
}
static int io_req_needs_file(struct io_kiocb *req)
{
switch (req->opcode) {
case IORING_OP_NOP:
case IORING_OP_POLL_REMOVE:
case IORING_OP_TIMEOUT:
case IORING_OP_TIMEOUT_REMOVE:
case IORING_OP_ASYNC_CANCEL:
case IORING_OP_LINK_TIMEOUT:
if (!io_op_defs[req->opcode].needs_file)
return 0;
default:
if (io_req_op_valid(req->opcode))
return 1;
return -EINVAL;
}
if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
return 0;
return 1;
}
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
......@@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
{
struct fixed_file_table *table;
table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK];
table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK];;
}
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
......@@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
{
struct io_ring_ctx *ctx = req->ctx;
unsigned flags;
int fd, ret;
int fd;
flags = READ_ONCE(sqe->flags);
fd = READ_ONCE(sqe->fd);
if (flags & IOSQE_IO_DRAIN)
req->flags |= REQ_F_IO_DRAIN;
ret = io_req_needs_file(req);
if (ret <= 0)
return ret;
if (!io_req_needs_file(req, fd))
return 0;
if (flags & IOSQE_FIXED_FILE) {
if (unlikely(!ctx->file_table ||
if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files))
return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files);
......@@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
if (!req->file)
return -EBADF;
req->flags |= REQ_F_FIXED_FILE;
percpu_ref_get(&ctx->file_data->refs);
} else {
if (req->needs_fixed_file)
return -EBADF;
......@@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx;
if (req->work.files)
return 0;
if (!ctx->ring_file)
return -EBADF;
rcu_read_lock();
spin_lock_irq(&ctx->inflight_lock);
/*
......@@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req)
* the fd has changed since we started down this path, and disallow
* this operation if it has.
*/
if (fcheck(req->ring_fd) == req->ring_file) {
if (fcheck(ctx->ring_fd) == ctx->ring_file) {
list_add(&req->inflight_entry, &ctx->inflight_list);
req->flags |= REQ_F_INFLIGHT;
req->work.files = current->files;
......@@ -3532,7 +4621,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
*/
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) {
if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
punt:
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (ret)
goto err;
......@@ -3567,6 +4657,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (nxt) {
req = nxt;
nxt = NULL;
if (req->flags & REQ_F_FORCE_ASYNC)
goto punt;
goto again;
}
}
......@@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
int ret;
if (unlikely(req->ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
req->ctx->drain_next = false;
}
req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
ret = io_req_defer(req, sqe);
if (ret) {
if (ret != -EIOCBQUEUED) {
fail_req:
io_cqring_add_event(req, ret);
req_set_fail_links(req);
io_double_put_req(req);
}
} else
} else if (req->flags & REQ_F_FORCE_ASYNC) {
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret < 0))
goto fail_req;
/*
* Never try inline submit of IOSQE_ASYNC is set, go straight
* to async execution.
*/
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
__io_queue_sqe(req, sqe);
}
}
static inline void io_queue_link_head(struct io_kiocb *req)
......@@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req)
}
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK)
IOSQE_IO_HARDLINK | IOSQE_ASYNC)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link)
{
const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx;
int ret;
unsigned int sqe_flags;
int ret, id;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */
if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) {
if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
id = READ_ONCE(sqe->personality);
if (id) {
const struct cred *personality_creds;
personality_creds = idr_find(&ctx->personality_idr, id);
if (unlikely(!personality_creds)) {
ret = -EINVAL;
goto err_req;
}
old_creds = override_creds(personality_creds);
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
IOSQE_ASYNC);
ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) {
err_req:
io_cqring_add_event(req, ret);
io_double_put_req(req);
if (old_creds)
revert_creds(old_creds);
return false;
}
......@@ -3632,14 +4753,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* conditions are true (normal request), then just queue it.
*/
if (*link) {
struct io_kiocb *prev = *link;
if (sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
struct io_kiocb *head = *link;
/*
* Taking sequential execution of a link, draining both sides
* of the link also fullfils IOSQE_IO_DRAIN semantics for all
* requests in the link. So, it drains the head and the
* next after the link request. The last one is done via
* drain_next flag to persist the effect across calls.
*/
if (sqe_flags & IOSQE_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
if (io_alloc_async_ctx(req)) {
ret = -EAGAIN;
goto err_req;
......@@ -3648,25 +4774,36 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_req_defer_prep(req, sqe);
if (ret) {
/* fail even hard links since we don't submit */
prev->flags |= REQ_F_FAIL_LINK;
head->flags |= REQ_F_FAIL_LINK;
goto err_req;
}
trace_io_uring_link(ctx, req, prev);
list_add_tail(&req->link_list, &prev->link_list);
} else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
INIT_LIST_HEAD(&req->link_list);
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
*link = req;
trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &head->link_list);
/* last request of a link, enqueue the link */
if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
io_queue_sqe(req, sqe);
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
req->ctx->drain_next = 0;
}
if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list);
ret = io_req_defer_prep(req, sqe);
if (ret)
req->flags |= REQ_F_FAIL_LINK;
*link = req;
} else {
io_queue_sqe(req, sqe);
}
}
if (old_creds)
revert_creds(old_creds);
return true;
}
......@@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
{
struct io_rings *rings = ctx->rings;
if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
/*
* Ensure any loads from the SQEs are done at this point,
* since once we write the new head, the application could
* write new data to them.
*/
smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
/*
* Ensure any loads from the SQEs are done at this point,
* since once we write the new head, the application could
* write new data to them.
*/
smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
/*
......@@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe **sqe_ptr)
{
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array;
unsigned head;
......@@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it.
*/
head = ctx->cached_sq_head;
/* make sure SQ entry isn't read before tail */
if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
return false;
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
if (likely(head < ctx->sq_entries)) {
/*
* All io need record the previous position, if LINK vs DARIN,
......@@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* drop invalid entries */
ctx->cached_sq_head++;
ctx->cached_sq_dropped++;
WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped);
WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
return false;
}
......@@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */
if (!list_empty(&ctx->cq_overflow_list) &&
!io_cqring_overflow_flush(ctx, false))
return -EBUSY;
if (test_bit(0, &ctx->sq_check_overflow)) {
if (!list_empty(&ctx->cq_overflow_list) &&
!io_cqring_overflow_flush(ctx, false))
return -EBUSY;
}
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, nr);
statep = &state;
}
ctx->ring_fd = ring_fd;
ctx->ring_file = ring_file;
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
unsigned int sqe_flags;
req = io_get_req(ctx, statep);
if (unlikely(!req)) {
......@@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break;
}
if (!io_get_sqring(ctx, req, &sqe)) {
__io_free_req(req);
__io_req_do_free(req);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (unlikely(req->opcode >= IORING_OP_LAST)) {
io_cqring_add_event(req, -EINVAL);
io_double_put_req(req);
break;
}
if (io_req_needs_user(req) && !*mm) {
if (io_op_defs[req->opcode].needs_mm && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) {
use_mm(ctx->sqo_mm);
......@@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
}
}
submitted++;
sqe_flags = sqe->flags;
req->ring_file = ring_file;
req->ring_fd = ring_fd;
req->has_user = *mm != NULL;
req->in_async = async;
req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
if (!io_submit_sqe(req, sqe, statep, &link))
break;
/*
* If previous wasn't linked and we have a linked command,
* that's the end of the chain. Submit the previous link.
*/
if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
io_queue_link_head(link);
link = NULL;
}
}
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
percpu_ref_put_many(&ctx->refs, nr - ref_used);
}
if (link)
io_queue_link_head(link);
if (statep)
......@@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
}
to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock);
......@@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif
}
static void io_file_ref_kill(struct percpu_ref *ref)
{
struct fixed_file_data *data;
data = container_of(ref, struct fixed_file_data, refs);
complete(&data->done);
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
struct fixed_file_data *data = ctx->file_data;
unsigned nr_tables, i;
if (!ctx->file_table)
if (!data)
return -ENXIO;
/* protect against inflight atomic switch, which drops the ref */
percpu_ref_get(&data->refs);
/* wait for existing switches */
flush_work(&data->ref_work);
percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
wait_for_completion(&data->done);
percpu_ref_put(&data->refs);
/* flush potential new switch */
flush_work(&data->ref_work);
percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_table[i].files);
kfree(ctx->file_table);
ctx->file_table = NULL;
kfree(data->table[i].files);
kfree(data->table);
kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return 0;
}
......@@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx)
}
#if defined(CONFIG_UNIX)
static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
if (ctx->io_wq)
io_wq_flush(ctx->io_wq);
unix_destruct_scm(skb);
}
/*
* Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have
......@@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
fpl->max = SCM_MAX_FD;
fpl->count = nr_files;
UNIXCB(skb).fp = fpl;
skb->destructor = io_destruct_skb;
skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
......@@ -4231,31 +5382,131 @@ static int io_sqe_files_scm(struct io_ring_ctx *ctx)
}
#endif
static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
unsigned nr_files)
static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
unsigned nr_files)
{
int i;
for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_data->table[i];
unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
table->files = kcalloc(this_files, sizeof(struct file *),
GFP_KERNEL);
if (!table->files)
break;
nr_files -= this_files;
}
if (i == nr_tables)
return 0;
for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_data->table[i];
kfree(table->files);
}
return 1;
}
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
__skb_queue_head_init(&list);
/*
* Find the skb that holds this file in its SCM_RIGHTS. When found,
* remove this entry and rearrange the file array.
*/
skb = skb_dequeue(head);
while (skb) {
struct scm_fp_list *fp;
fp = UNIXCB(skb).fp;
for (i = 0; i < fp->count; i++) {
int left;
if (fp->fp[i] != file)
continue;
unix_notinflight(fp->user, fp->fp[i]);
left = fp->count - 1 - i;
if (left) {
memmove(&fp->fp[i], &fp->fp[i + 1],
left * sizeof(struct file *));
}
fp->count--;
if (!fp->count) {
kfree_skb(skb);
skb = NULL;
} else {
__skb_queue_tail(&list, skb);
}
fput(file);
file = NULL;
break;
}
if (!file)
break;
__skb_queue_tail(&list, skb);
skb = skb_dequeue(head);
}
if (skb_peek(&list)) {
spin_lock_irq(&head->lock);
while ((skb = __skb_dequeue(&list)) != NULL)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#else
fput(file);
#endif
}
struct io_file_put {
struct llist_node llist;
struct file *file;
struct completion *done;
};
static void io_ring_file_ref_switch(struct work_struct *work)
{
int i;
struct io_file_put *pfile, *tmp;
struct fixed_file_data *data;
struct llist_node *node;
for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_table[i];
unsigned this_files;
data = container_of(work, struct fixed_file_data, ref_work);
this_files = min(nr_files, IORING_MAX_FILES_TABLE);
table->files = kcalloc(this_files, sizeof(struct file *),
GFP_KERNEL);
if (!table->files)
break;
nr_files -= this_files;
while ((node = llist_del_all(&data->put_llist)) != NULL) {
llist_for_each_entry_safe(pfile, tmp, node, llist) {
io_ring_file_put(data->ctx, pfile->file);
if (pfile->done)
complete(pfile->done);
else
kfree(pfile);
}
}
if (i == nr_tables)
return 0;
percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_table[i];
kfree(table->files);
}
return 1;
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_data *data;
data = container_of(ref, struct fixed_file_data, refs);
/* we can't safely switch from inside this context, punt to wq */
queue_work(system_wq, &data->ref_work);
}
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
......@@ -4263,25 +5514,48 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
{
__s32 __user *fds = (__s32 __user *) arg;
unsigned nr_tables;
struct file *file;
int fd, ret = 0;
unsigned i;
if (ctx->file_table)
if (ctx->file_data)
return -EBUSY;
if (!nr_args)
return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE;
ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
if (!ctx->file_data)
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table),
ctx->file_data->table = kcalloc(nr_tables,
sizeof(struct fixed_file_table),
GFP_KERNEL);
if (!ctx->file_table)
if (!ctx->file_data->table) {
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
ctx->file_data->put_llist.first = NULL;
INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
kfree(ctx->file_table);
ctx->file_table = NULL;
percpu_ref_exit(&ctx->file_data->refs);
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
......@@ -4298,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
continue;
}
table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
table->files[index] = fget(fd);
file = fget(fd);
ret = -EBADF;
if (!table->files[index])
if (!file)
break;
/*
* Don't allow io_uring instances to be registered. If UNIX
* isn't enabled, then this causes a reference cycle and this
......@@ -4312,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
* handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway.
*/
if (table->files[index]->f_op == &io_uring_fops) {
fput(table->files[index]);
if (file->f_op == &io_uring_fops) {
fput(file);
break;
}
ret = 0;
table->files[index] = file;
}
if (ret) {
for (i = 0; i < ctx->nr_user_files; i++) {
struct file *file;
file = io_file_from_index(ctx, i);
if (file)
fput(file);
}
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_table[i].files);
kfree(ctx->file_data->table[i].files);
kfree(ctx->file_table);
ctx->file_table = NULL;
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
ctx->nr_user_files = 0;
return ret;
}
......@@ -4343,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
{
#if defined(CONFIG_UNIX)
struct file *file = io_file_from_index(ctx, index);
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
__skb_queue_head_init(&list);
/*
* Find the skb that holds this file in its SCM_RIGHTS. When found,
* remove this entry and rearrange the file array.
*/
skb = skb_dequeue(head);
while (skb) {
struct scm_fp_list *fp;
fp = UNIXCB(skb).fp;
for (i = 0; i < fp->count; i++) {
int left;
if (fp->fp[i] != file)
continue;
unix_notinflight(fp->user, fp->fp[i]);
left = fp->count - 1 - i;
if (left) {
memmove(&fp->fp[i], &fp->fp[i + 1],
left * sizeof(struct file *));
}
fp->count--;
if (!fp->count) {
kfree_skb(skb);
skb = NULL;
} else {
__skb_queue_tail(&list, skb);
}
fput(file);
file = NULL;
break;
}
if (!file)
break;
__skb_queue_tail(&list, skb);
skb = skb_dequeue(head);
}
if (skb_peek(&list)) {
spin_lock_irq(&head->lock);
while ((skb = __skb_dequeue(&list)) != NULL)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#else
fput(io_file_from_index(ctx, index));
#endif
}
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
int index)
{
......@@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif
}
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
static void io_atomic_switch(struct percpu_ref *ref)
{
struct io_uring_files_update up;
struct fixed_file_data *data;
data = container_of(ref, struct fixed_file_data, refs);
clear_bit(FFD_F_ATOMIC, &data->state);
}
static bool io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
struct io_file_put *pfile, pfile_stack;
DECLARE_COMPLETION_ONSTACK(done);
/*
* If we fail allocating the struct we need for doing async reomval
* of this file, just punt to sync and wait for it.
*/
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile) {
pfile = &pfile_stack;
pfile->done = &done;
}
pfile->file = file;
llist_add(&pfile->llist, &data->put_llist);
if (pfile == &pfile_stack) {
if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
percpu_ref_put(&data->refs);
percpu_ref_switch_to_atomic(&data->refs,
io_atomic_switch);
}
wait_for_completion(&done);
flush_work(&data->ref_work);
return false;
}
return true;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *up,
unsigned nr_args)
{
struct fixed_file_data *data = ctx->file_data;
bool ref_switch = false;
struct file *file;
__s32 __user *fds;
int fd, i, err;
__u32 done;
if (!ctx->file_table)
return -ENXIO;
if (!nr_args)
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
if (up.resv)
return -EINVAL;
if (check_add_overflow(up.offset, nr_args, &done))
if (check_add_overflow(up->offset, nr_args, &done))
return -EOVERFLOW;
if (done > ctx->nr_user_files)
return -EINVAL;
done = 0;
fds = u64_to_user_ptr(up.fds);
fds = u64_to_user_ptr(up->fds);
while (nr_args) {
struct fixed_file_table *table;
unsigned index;
......@@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
err = -EFAULT;
break;
}
i = array_index_nospec(up.offset, ctx->nr_user_files);
table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT];
i = array_index_nospec(up->offset, ctx->nr_user_files);
table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) {
io_sqe_file_unregister(ctx, i);
file = io_file_from_index(ctx, index);
table->files[index] = NULL;
if (io_queue_file_removal(data, file))
ref_switch = true;
}
if (fd != -1) {
struct file *file;
file = fget(fd);
if (!file) {
err = -EBADF;
......@@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
}
nr_args--;
done++;
up.offset++;
up->offset++;
}
if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
percpu_ref_put(&data->refs);
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
}
return done ? done : err;
}
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_files_update up;
if (!ctx->file_data)
return -ENXIO;
if (!nr_args)
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
if (up.resv)
return -EINVAL;
return __io_sqe_files_update(ctx, &up, nr_args);
}
static void io_put_work(struct io_wq_work *work)
{
......@@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work)
refcount_inc(&req->refs);
}
static int io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
struct io_wq_data data;
struct fd f;
struct io_ring_ctx *ctx_attach;
unsigned int concurrency;
int ret = 0;
data.user = ctx->user;
data.get_work = io_get_work;
data.put_work = io_put_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
}
return ret;
}
f = fdget(p->wq_fd);
if (!f.file)
return -EBADF;
if (f.file->f_op != &io_uring_fops) {
ret = -EINVAL;
goto out_fput;
}
ctx_attach = f.file->private_data;
/* @io_wq is protected by holding the fd */
if (!io_wq_get(ctx_attach->io_wq, &data)) {
ret = -EINVAL;
goto out_fput;
}
ctx->io_wq = ctx_attach->io_wq;
out_fput:
fdput(f);
return ret;
}
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
struct io_wq_data data;
unsigned concurrency;
int ret;
init_waitqueue_head(&ctx->sqo_wait);
......@@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err;
}
data.mm = ctx->sqo_mm;
data.user = ctx->user;
data.creds = ctx->creds;
data.get_work = io_get_work;
data.put_work = io_put_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
ret = io_init_wq_offload(ctx, p);
if (ret)
goto err;
}
return 0;
err:
......@@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on)
return fasync_helper(fd, file, on, &ctx->cq_fasync);
}
static int io_remove_personalities(int id, void *p, void *data)
{
struct io_ring_ctx *ctx = data;
const struct cred *cred;
cred = idr_remove(&ctx->personality_idr, id);
if (cred)
put_cred(cred);
return 0;
}
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{
mutex_lock(&ctx->uring_lock);
......@@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx);
}
......@@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
} else if (to_submit) {
struct mm_struct *cur_mm;
to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock);
/* already have mm, so io_submit_sqes() won't try to grab it */
cur_mm = ctx->sqo_mm;
......@@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX)
ctx->ring_sock->file = file;
ctx->ring_sock->sk->sk_user_data = ctx;
#endif
fd_install(ret, file);
return ret;
......@@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
bool account_mem;
int ret;
if (!entries || entries > IORING_MAX_ENTRIES)
if (!entries)
return -EINVAL;
if (entries > IORING_MAX_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
entries = IORING_MAX_ENTRIES;
}
/*
* Use twice as many entries for the CQ ring. It's possible for the
......@@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
* to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing.
*/
if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES)
if (p->cq_entries < p->sq_entries)
return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
p->cq_entries = roundup_pow_of_two(p->cq_entries);
} else {
p->cq_entries = 2 * p->sq_entries;
......@@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
goto err;
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE;
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret;
err:
......@@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
}
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE))
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL;
ret = io_uring_create(entries, &p);
......@@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params);
}
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (!io_op_defs[i].not_supported)
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds = get_current_cred();
int id;
id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
USHRT_MAX, GFP_KERNEL);
if (id < 0)
put_cred(creds);
return id;
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *old_creds;
old_creds = idr_remove(&ctx->personality_idr, id);
if (old_creds) {
put_cred(old_creds);
return 0;
}
return -EINVAL;
}
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
return false;
default:
return true;
}
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
......@@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO;
percpu_ref_kill(&ctx->refs);
if (io_register_op_must_quiesce(opcode)) {
percpu_ref_kill(&ctx->refs);
/*
* Drop uring mutex before waiting for references to exit. If another
* thread is currently inside io_uring_enter() it might need to grab
* the uring_lock to make progress. If we hold it here across the drain
* wait, then we can deadlock. It's safe to drop the mutex here, since
* no new references will come in after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
wait_for_completion(&ctx->completions[0]);
mutex_lock(&ctx->uring_lock);
/*
* Drop uring mutex before waiting for references to exit. If
* another thread is currently inside io_uring_enter() it might
* need to grab the uring_lock to make progress. If we hold it
* here across the drain wait, then we can deadlock. It's safe
* to drop the mutex here, since no new references will come in
* after we've killed the percpu ref.
*/
mutex_unlock(&ctx->uring_lock);
ret = wait_for_completion_interruptible(&ctx->completions[0]);
mutex_lock(&ctx->uring_lock);
if (ret) {
percpu_ref_resurrect(&ctx->refs);
ret = -EINTR;
goto out;
}
}
switch (opcode) {
case IORING_REGISTER_BUFFERS:
......@@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_files_update(ctx, arg, nr_args);
break;
case IORING_REGISTER_EVENTFD:
case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_eventfd_register(ctx, arg);
if (ret)
break;
if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
ctx->eventfd_async = 1;
else
ctx->eventfd_async = 0;
break;
case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL;
......@@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_eventfd_unregister(ctx);
break;
case IORING_REGISTER_PROBE:
ret = -EINVAL;
if (!arg || nr_args > 256)
break;
ret = io_probe(ctx, arg, nr_args);
break;
case IORING_REGISTER_PERSONALITY:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_personality(ctx);
break;
case IORING_UNREGISTER_PERSONALITY:
ret = -EINVAL;
if (arg)
break;
ret = io_unregister_personality(ctx, nr_args);
break;
default:
ret = -EINVAL;
break;
}
/* bring the ctx back to life */
reinit_completion(&ctx->completions[0]);
percpu_ref_reinit(&ctx->refs);
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */
percpu_ref_reinit(&ctx->refs);
out:
reinit_completion(&ctx->completions[0]);
}
return ret;
}
......@@ -5524,6 +6963,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
static int __init io_uring_init(void)
{
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0;
};
......
......@@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
static inline struct open_how build_open_how(int flags, umode_t mode)
inline struct open_how build_open_how(int flags, umode_t mode)
{
struct open_how how = {
.flags = flags & VALID_OPEN_FLAGS,
......@@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
return how;
}
static inline int build_open_flags(const struct open_how *how,
struct open_flags *op)
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
int flags = how->flags;
int lookup_flags = 0;
......
......@@ -21,6 +21,8 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include "internal.h"
/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @inode: Inode to use as the source
......@@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
}
EXPORT_SYMBOL(vfs_statx_fd);
inline unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
{
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
return -EINVAL;
*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
*lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
*lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
*lookup_flags |= LOOKUP_EMPTY;
return 0;
}
/**
* vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename
......@@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
{
struct path path;
int error = -EINVAL;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
unsigned lookup_flags;
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
retry:
error = user_path_at(dfd, filename, lookup_flags, &path);
if (error)
......@@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
static noinline_for_stack int
noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
struct statx tmp;
......
......@@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file);
}
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock);
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
#else
static inline void eventpoll_init_file(struct file *file) {}
......
......@@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr,
......
......@@ -210,15 +210,17 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
}
/**
* percpu_ref_tryget - try to increment a percpu refcount
* percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get
* @nr: number of references to get
*
* Increment a percpu refcount unless its count already reached zero.
* Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
unsigned long nr)
{
unsigned long __percpu *percpu_count;
bool ret;
......@@ -226,10 +228,10 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count);
this_cpu_add(*percpu_count, nr);
ret = true;
} else {
ret = atomic_long_inc_not_zero(&ref->count);
ret = atomic_long_add_unless(&ref->count, nr, 0);
}
rcu_read_unlock();
......@@ -237,6 +239,20 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
return ret;
}
/**
* percpu_ref_tryget - try to increment a percpu refcount
* @ref: percpu_ref to try-get
*
* Increment a percpu refcount unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
return percpu_ref_tryget_many(ref, 1);
}
/**
* percpu_ref_tryget_live - try to increment a live percpu refcount
* @ref: percpu_ref to try-get
......
......@@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
* io_uring_submit_sqe - called before submitting one SQE
*
* @ctx: pointer to a ring context structure
* @opcode: opcode of request
* @user_data: user data associated with the request
* @force_nonblock: whether a context blocking or not
* @sq_thread: true if sq_thread has submitted this SQE
......@@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
*/
TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread),
TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_ARGS(ctx, user_data, force_nonblock, sq_thread),
TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry (
__field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data )
__field( bool, force_nonblock )
__field( bool, sq_thread )
......@@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
TP_fast_assign(
__entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data;
__entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread;
),
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, (unsigned long long) __entry->user_data,
TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread)
);
......
......@@ -34,21 +34,43 @@ struct io_uring_sqe {
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
};
__u64 user_data; /* data to be passed back at completion time */
union {
__u16 buf_index; /* index into fixed buffers, if used */
struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* personality to use, if used */
__u16 personality;
};
__u64 __pad2[3];
};
};
enum {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
};
/*
* sqe->flags
*/
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */
/* use fixed fileset */
#define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
/* issue after inflight IO */
#define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/*
* io_uring_setup() flags
......@@ -57,6 +79,8 @@ struct io_uring_sqe {
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
enum {
IORING_OP_NOP,
......@@ -76,6 +100,19 @@ enum {
IORING_OP_ASYNC_CANCEL,
IORING_OP_LINK_TIMEOUT,
IORING_OP_CONNECT,
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
IORING_OP_FILES_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
IORING_OP_FADVISE,
IORING_OP_MADVISE,
IORING_OP_SEND,
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -153,7 +190,8 @@ struct io_uring_params {
__u32 sq_thread_cpu;
__u32 sq_thread_idle;
__u32 features;
__u32 resv[4];
__u32 wq_fd;
__u32 resv[3];
struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off;
};
......@@ -164,6 +202,8 @@ struct io_uring_params {
#define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1)
#define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
/*
* io_uring_register(2) opcodes and arguments
......@@ -175,6 +215,10 @@ struct io_uring_params {
#define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6
#define IORING_REGISTER_EVENTFD_ASYNC 7
#define IORING_REGISTER_PROBE 8
#define IORING_REGISTER_PERSONALITY 9
#define IORING_UNREGISTER_PERSONALITY 10
struct io_uring_files_update {
__u32 offset;
......@@ -182,4 +226,21 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds;
};
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
__u8 op;
__u8 resv;
__u16 flags; /* IO_URING_OP_* flags */
__u32 resv2;
};
struct io_uring_probe {
__u8 last_op; /* last opcode supported */
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
};
#endif
......@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable.
*/
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
int do_madvise(unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
......@@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return error;
}
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(start, len_in, behavior);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment