Commit 896f8d23 authored by Linus Torvalds

Merge tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:

 - Support for various new opcodes (fallocate, openat, close, statx,
   fadvise, madvise, openat2, non-vectored read/write, send/recv, and
   epoll_ctl)

 - Faster ring quiesce for fileset updates

 - Optimizations for overflow condition checking

 - Support for max-sized clamping

 - Support for probing what opcodes are supported (see the probe sketch after the commit list)

 - Support for io-wq backend sharing between "sibling" rings

 - Support for registering personalities

 - Lots of little fixes and improvements

* tag 'for-5.6/io_uring-vfs-2020-01-29' of git://git.kernel.dk/linux-block: (64 commits)
  io_uring: add support for epoll_ctl(2)
  eventpoll: support non-blocking do_epoll_ctl() calls
  eventpoll: abstract out epoll_ctl() handler
  io_uring: fix linked command file table usage
  io_uring: support using a registered personality for commands
  io_uring: allow registering credentials
  io_uring: add io-wq workqueue sharing
  io-wq: allow grabbing existing io-wq
  io_uring/io-wq: don't use static creds/mm assignments
  io-wq: make the io_wq ref counted
  io_uring: fix refcounting with batched allocations at OOM
  io_uring: add comment for drain_next
  io_uring: don't attempt to copy iovec for READ/WRITE
  io_uring: honor IOSQE_ASYNC for linked reqs
  io_uring: prep req when do IOSQE_ASYNC
  io_uring: use labeled array init in io_op_defs
  io_uring: optimise sqe-to-req flags translation
  io_uring: remove REQ_F_IO_DRAINED
  io_uring: file switch work needs to get flushed on exit
  io_uring: hide uring_fd in ctx
  ...
parents 33c84e89 3e4827b0
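
As a rough illustration of the opcode-probe support called out in the list above, the sketch below asks the kernel which opcodes it implements. It assumes liburing's io_uring_get_probe()/io_uring_opcode_supported()/io_uring_free_probe() helpers, which wrap the IORING_REGISTER_PROBE registration added in this series; the helper names are liburing's, not part of this kernel diff, and are shown only as an illustration.

/* Hedged sketch: ask the kernel which io_uring opcodes it supports. */
#include <stdio.h>
#include <liburing.h>

int main(void)
{
	/* Creates a throwaway ring and issues IORING_REGISTER_PROBE. */
	struct io_uring_probe *probe = io_uring_get_probe();

	if (!probe) {
		fprintf(stderr, "probe not available (pre-5.6 kernel?)\n");
		return 1;
	}

	/* Check a couple of the opcodes added in this merge. */
	printf("OPENAT2:   %s\n",
	       io_uring_opcode_supported(probe, IORING_OP_OPENAT2) ? "yes" : "no");
	printf("EPOLL_CTL: %s\n",
	       io_uring_opcode_supported(probe, IORING_OP_EPOLL_CTL) ? "yes" : "no");

	io_uring_free_probe(probe);
	return 0;
}

On kernels without probe support the helper simply returns NULL, which is what makes this usable for feature detection.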
@@ -2249,10 +2249,12 @@ static void binder_deferred_fd_close(int fd)
 		return;
 	init_task_work(&twcb->twork, binder_do_fd_close);
 	__close_fd_get_file(fd, &twcb->file);
-	if (twcb->file)
+	if (twcb->file) {
+		filp_close(twcb->file, current->files);
 		task_work_add(current, &twcb->twork, true);
-	else
+	} else {
 		kfree(twcb);
+	}
 }
 
 static void binder_transaction_buffer_release(struct binder_proc *proc,
...
@@ -354,12 +354,6 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
 	return container_of(p, struct ep_pqueue, pt)->epi;
 }
 
-/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_has_event(int op)
-{
-	return op != EPOLL_CTL_DEL;
-}
-
 /* Initialize the poll safe wake up structure */
 static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
@@ -2074,27 +2068,28 @@ SYSCALL_DEFINE1(epoll_create, int, size)
 	return do_epoll_create(0);
 }
 
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set.
- */
-SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
-		struct epoll_event __user *, event)
+static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
+				   bool nonblock)
+{
+	if (!nonblock) {
+		mutex_lock_nested(mutex, depth);
+		return 0;
+	}
+	if (mutex_trylock(mutex))
+		return 0;
+	return -EAGAIN;
+}
+
+int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
+		 bool nonblock)
 {
 	int error;
 	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
-	struct epoll_event epds;
 	struct eventpoll *tep = NULL;
 
-	error = -EFAULT;
-	if (ep_op_has_event(op) &&
-	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
-		goto error_return;
-
 	error = -EBADF;
 	f = fdget(epfd);
 	if (!f.file)
@@ -2112,7 +2107,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
 	/* Check if EPOLLWAKEUP is allowed */
 	if (ep_op_has_event(op))
-		ep_take_care_of_epollwakeup(&epds);
+		ep_take_care_of_epollwakeup(epds);
 
 	/*
 	 * We have to check that the file structure underneath the file descriptor
@@ -2128,11 +2123,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 	 * Also, we do not currently supported nested exclusive wakeups.
 	 */
-	if (ep_op_has_event(op) && (epds.events & EPOLLEXCLUSIVE)) {
+	if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
 		if (op == EPOLL_CTL_MOD)
 			goto error_tgt_fput;
 		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
-				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+				(epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
 			goto error_tgt_fput;
 	}
 
@@ -2157,13 +2152,17 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * deep wakeup paths from forming in parallel through multiple
 	 * EPOLL_CTL_ADD operations.
 	 */
-	mutex_lock_nested(&ep->mtx, 0);
+	error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+	if (error)
+		goto error_tgt_fput;
 	if (op == EPOLL_CTL_ADD) {
 		if (!list_empty(&f.file->f_ep_links) ||
 						is_file_epoll(tf.file)) {
-			full_check = 1;
 			mutex_unlock(&ep->mtx);
-			mutex_lock(&epmutex);
+			error = epoll_mutex_lock(&epmutex, 0, nonblock);
+			if (error)
+				goto error_tgt_fput;
+			full_check = 1;
 			if (is_file_epoll(tf.file)) {
 				error = -ELOOP;
 				if (ep_loop_check(ep, tf.file) != 0) {
@@ -2173,10 +2172,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			} else
 				list_add(&tf.file->f_tfile_llink,
 							&tfile_check_list);
-			mutex_lock_nested(&ep->mtx, 0);
+			error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
+			if (error) {
+out_del:
+				list_del(&tf.file->f_tfile_llink);
+				goto error_tgt_fput;
+			}
 			if (is_file_epoll(tf.file)) {
 				tep = tf.file->private_data;
-				mutex_lock_nested(&tep->mtx, 1);
+				error = epoll_mutex_lock(&tep->mtx, 1, nonblock);
+				if (error) {
+					mutex_unlock(&ep->mtx);
+					goto out_del;
+				}
 			}
 		}
 	}
@@ -2192,8 +2200,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= EPOLLERR | EPOLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd, full_check);
+			epds->events |= EPOLLERR | EPOLLHUP;
+			error = ep_insert(ep, epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
 		if (full_check)
@@ -2208,8 +2216,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_MOD:
 		if (epi) {
 			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
-				epds.events |= EPOLLERR | EPOLLHUP;
-				error = ep_modify(ep, epi, &epds);
+				epds->events |= EPOLLERR | EPOLLHUP;
+				error = ep_modify(ep, epi, epds);
 			}
 		} else
 			error = -ENOENT;
@@ -2231,6 +2239,23 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	return error;
 }
 
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set.
+ */
+SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
+		struct epoll_event __user *, event)
+{
+	struct epoll_event epds;
+
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		return -EFAULT;
+
+	return do_epoll_ctl(epfd, op, fd, &epds, false);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
...
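
The non-blocking do_epoll_ctl() path above is what the new IORING_OP_EPOLL_CTL opcode calls into. As a rough userspace sketch (liburing assumed; the SQE field placement mirrors io_epoll_ctl_prep() in the io_uring part of this diff, and the wrapper function below is hypothetical, not a liburing API):

/* Hedged sketch: drive epoll_ctl(2) through io_uring. */
#include <liburing.h>
#include <sys/epoll.h>
#include <string.h>

static int epoll_ctl_via_uring(struct io_uring *ring, int epfd, int op,
			       int fd, struct epoll_event *ev)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	int ret;

	if (!sqe)
		return -EBUSY;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_EPOLL_CTL;
	sqe->fd = epfd;			/* io_epoll_ctl_prep: epoll.epfd = sqe->fd  */
	sqe->len = op;			/* io_epoll_ctl_prep: epoll.op   = sqe->len */
	sqe->off = fd;			/* io_epoll_ctl_prep: epoll.fd   = sqe->off */
	sqe->addr = (unsigned long) ev;	/* epoll_event copied at prep time */

	io_uring_submit(ring);
	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;			/* 0 on success, -errno otherwise */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

The completion carries the same result codes epoll_ctl(2) would return, since the kernel side ends up in do_epoll_ctl().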
@@ -642,7 +642,9 @@ int __close_fd(struct files_struct *files, unsigned fd)
 EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
 
 /*
- * variant of __close_fd that gets a ref on the file for later fput
+ * variant of __close_fd that gets a ref on the file for later fput.
+ * The caller must ensure that filp_close() called on the file, and then
+ * an fput().
  */
 int __close_fd_get_file(unsigned int fd, struct file **res)
 {
@@ -662,7 +664,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
 	spin_unlock(&files->file_lock);
 	get_file(file);
 	*res = file;
-	return filp_close(file, files);
+	return 0;
 
 out_unlock:
 	spin_unlock(&files->file_lock);
...
@@ -124,6 +124,8 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname,
 		const struct open_flags *op);
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *);
+extern struct open_how build_open_how(int flags, umode_t mode);
+extern int build_open_flags(const struct open_how *how, struct open_flags *op);
 
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 long do_faccessat(int dfd, const char __user *filename, int mode);
@@ -182,3 +184,9 @@ extern const struct dentry_operations ns_dentry_operations;
 
 /* direct-io.c: */
 int sb_init_dio_done_wq(struct super_block *sb);
+
+/*
+ * fs/stat.c:
+ */
+unsigned vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags);
+int cp_statx(const struct kstat *stat, struct statx __user *buffer);
@@ -56,7 +56,8 @@ struct io_worker {
 
 	struct rcu_head rcu;
 	struct mm_struct *mm;
-	const struct cred *creds;
+	const struct cred *cur_creds;
+	const struct cred *saved_creds;
 	struct files_struct *restore_files;
 };
 
@@ -109,10 +110,10 @@ struct io_wq {
 
 	struct task_struct *manager;
 	struct user_struct *user;
-	const struct cred *creds;
-	struct mm_struct *mm;
 	refcount_t refs;
 	struct completion done;
+
+	refcount_t use_refs;
 };
 
 static bool io_worker_get(struct io_worker *worker)
@@ -135,9 +136,9 @@ static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker)
 {
 	bool dropped_lock = false;
 
-	if (worker->creds) {
-		revert_creds(worker->creds);
-		worker->creds = NULL;
+	if (worker->saved_creds) {
+		revert_creds(worker->saved_creds);
+		worker->cur_creds = worker->saved_creds = NULL;
 	}
 
 	if (current->files != worker->restore_files) {
@@ -396,6 +397,43 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, unsigned *hash)
 	return NULL;
 }
 
+static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work)
+{
+	if (worker->mm) {
+		unuse_mm(worker->mm);
+		mmput(worker->mm);
+		worker->mm = NULL;
+	}
+	if (!work->mm) {
+		set_fs(KERNEL_DS);
+		return;
+	}
+	if (mmget_not_zero(work->mm)) {
+		use_mm(work->mm);
+		if (!worker->mm)
+			set_fs(USER_DS);
+		worker->mm = work->mm;
+		/* hang on to this mm */
+		work->mm = NULL;
+		return;
+	}
+
+	/* failed grabbing mm, ensure work gets cancelled */
+	work->flags |= IO_WQ_WORK_CANCEL;
+}
+
+static void io_wq_switch_creds(struct io_worker *worker,
+			       struct io_wq_work *work)
+{
+	const struct cred *old_creds = override_creds(work->creds);
+
+	worker->cur_creds = work->creds;
+	if (worker->saved_creds)
+		put_cred(old_creds); /* creds set by previous switch */
+	else
+		worker->saved_creds = old_creds;
+}
+
 static void io_worker_handle_work(struct io_worker *worker)
 	__releases(wqe->lock)
 {
@@ -438,24 +476,19 @@ static void io_worker_handle_work(struct io_worker *worker)
 		if (work->flags & IO_WQ_WORK_CB)
 			work->func(&work);
 
-		if ((work->flags & IO_WQ_WORK_NEEDS_FILES) &&
-		    current->files != work->files) {
+		if (work->files && current->files != work->files) {
 			task_lock(current);
 			current->files = work->files;
 			task_unlock(current);
 		}
-		if ((work->flags & IO_WQ_WORK_NEEDS_USER) && !worker->mm &&
-		    wq->mm) {
-			if (mmget_not_zero(wq->mm)) {
-				use_mm(wq->mm);
-				set_fs(USER_DS);
-				worker->mm = wq->mm;
-			} else {
-				work->flags |= IO_WQ_WORK_CANCEL;
-			}
-		}
-		if (!worker->creds)
-			worker->creds = override_creds(wq->creds);
+		if (work->mm != worker->mm)
+			io_wq_switch_mm(worker, work);
+		if (worker->cur_creds != work->creds)
+			io_wq_switch_creds(worker, work);
+		/*
+		 * OK to set IO_WQ_WORK_CANCEL even for uncancellable work,
+		 * the worker function will do the right thing.
+		 */
 		if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 			work->flags |= IO_WQ_WORK_CANCEL;
 		if (worker->mm)
@@ -720,6 +753,7 @@ static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct,
 static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 {
 	struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+	int work_flags;
 	unsigned long flags;
 
 	/*
@@ -734,12 +768,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
 		return;
 	}
 
+	work_flags = work->flags;
 	spin_lock_irqsave(&wqe->lock, flags);
 	wq_list_add_tail(&work->list, &wqe->work_list);
 	wqe->flags &= ~IO_WQE_FLAG_STALLED;
 	spin_unlock_irqrestore(&wqe->lock, flags);
 
-	if (!atomic_read(&acct->nr_running))
+	if ((work_flags & IO_WQ_WORK_CONCURRENT) ||
+	    !atomic_read(&acct->nr_running))
 		io_wqe_wake_worker(wqe, acct);
 }
 
@@ -828,6 +864,7 @@ static bool io_work_cancel(struct io_worker *worker, void *cancel_data)
 	 */
 	spin_lock_irqsave(&worker->lock, flags);
 	if (worker->cur_work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL) &&
 	    data->cancel(worker->cur_work, data->caller_data)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
@@ -902,7 +939,8 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data)
 		return false;
 
 	spin_lock_irqsave(&worker->lock, flags);
-	if (worker->cur_work == work) {
+	if (worker->cur_work == work &&
+	    !(worker->cur_work->flags & IO_WQ_WORK_NO_CANCEL)) {
 		send_sig(SIGINT, worker->task, 1);
 		ret = true;
 	}
@@ -1026,7 +1064,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	/* caller must already hold a reference to this */
 	wq->user = data->user;
-	wq->creds = data->creds;
 
 	for_each_node(node) {
 		struct io_wqe *wqe;
@@ -1053,9 +1090,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 
 	init_completion(&wq->done);
 
-	/* caller must have already done mmgrab() on this mm */
-	wq->mm = data->mm;
-
 	wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager");
 	if (!IS_ERR(wq->manager)) {
 		wake_up_process(wq->manager);
@@ -1064,6 +1098,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 		ret = -ENOMEM;
 		goto err;
 	}
+	refcount_set(&wq->use_refs, 1);
 	reinit_completion(&wq->done);
 	return wq;
 }
@@ -1078,13 +1113,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	return ERR_PTR(ret);
 }
 
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data)
+{
+	if (data->get_work != wq->get_work || data->put_work != wq->put_work)
+		return false;
+
+	return refcount_inc_not_zero(&wq->use_refs);
+}
+
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
 	wake_up_process(worker->task);
 	return false;
}
 
-void io_wq_destroy(struct io_wq *wq)
+static void __io_wq_destroy(struct io_wq *wq)
 {
 	int node;
 
@@ -1104,3 +1147,9 @@ void io_wq_destroy(struct io_wq *wq)
 	kfree(wq->wqes);
 	kfree(wq);
 }
+
+void io_wq_destroy(struct io_wq *wq)
+{
+	if (refcount_dec_and_test(&wq->use_refs))
+		__io_wq_destroy(wq);
+}
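
The cur_creds/saved_creds switching above is what lets an io-wq worker run a request under credentials the application registered earlier. A rough userspace sketch of the personality feature (assuming liburing's io_uring_register_personality() and io_uring_prep_read() helpers; the sqe->personality field comes from this series, the helper names are liburing's, and the wrapper below is hypothetical):

/* Hedged sketch: snapshot current creds and tag a request with them. */
#include <liburing.h>

static int read_with_personality(struct io_uring *ring, int fd,
				 void *buf, unsigned nbytes)
{
	struct io_uring_sqe *sqe;
	int id;

	/* Registers the caller's current credentials, returns an id. */
	id = io_uring_register_personality(ring);
	if (id < 0)
		return id;

	sqe = io_uring_get_sqe(ring);
	if (!sqe)
		return -EBUSY;

	io_uring_prep_read(sqe, fd, buf, nbytes, 0);
	sqe->personality = id;	/* async execution uses the registered creds */

	return io_uring_submit(ring);
}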
@@ -7,11 +7,11 @@ enum {
 	IO_WQ_WORK_CANCEL = 1,
 	IO_WQ_WORK_HAS_MM = 2,
 	IO_WQ_WORK_HASHED = 4,
-	IO_WQ_WORK_NEEDS_USER = 8,
-	IO_WQ_WORK_NEEDS_FILES = 16,
 	IO_WQ_WORK_UNBOUND = 32,
 	IO_WQ_WORK_INTERNAL = 64,
 	IO_WQ_WORK_CB = 128,
+	IO_WQ_WORK_NO_CANCEL = 256,
+	IO_WQ_WORK_CONCURRENT = 512,
 
 	IO_WQ_HASH_SHIFT = 24,	/* upper 8 bits are used for hash key */
 };
@@ -72,6 +72,8 @@ struct io_wq_work {
 	};
 	void (*func)(struct io_wq_work **);
 	struct files_struct *files;
+	struct mm_struct *mm;
+	const struct cred *creds;
 	unsigned flags;
 };
@@ -81,21 +83,22 @@ struct io_wq_work {
 		(work)->func = _func;	\
 		(work)->flags = 0;	\
 		(work)->files = NULL;	\
+		(work)->mm = NULL;	\
+		(work)->creds = NULL;	\
 	} while (0)			\
 
 typedef void (get_work_fn)(struct io_wq_work *);
 typedef void (put_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
-	struct mm_struct *mm;
 	struct user_struct *user;
-	const struct cred *creds;
 
 	get_work_fn *get_work;
 	put_work_fn *put_work;
 };
 
 struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data);
+bool io_wq_get(struct io_wq *wq, struct io_wq_data *data);
 void io_wq_destroy(struct io_wq *wq);
 
 void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work);
...
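
io_wq_get()/__io_wq_destroy() above give the io-wq backend a use count so that several rings can share one worker pool. From userspace this surfaces as a ring setup flag; a rough sketch (assuming liburing's io_uring_queue_init_params() and the IORING_SETUP_ATTACH_WQ/wq_fd fields added by the sharing commit in this series):

/* Hedged sketch: attach a second ring to the first ring's io-wq. */
#include <liburing.h>
#include <string.h>

static int setup_sibling_rings(struct io_uring *a, struct io_uring *b)
{
	struct io_uring_params p;
	int ret;

	ret = io_uring_queue_init(64, a, 0);	/* owns the io-wq backend */
	if (ret < 0)
		return ret;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_ATTACH_WQ;	/* share, don't create */
	p.wq_fd = a->ring_fd;			/* the "sibling" ring */

	ret = io_uring_queue_init_params(64, b, &p);
	if (ret < 0)
		io_uring_queue_exit(a);
	return ret;
}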
@@ -46,6 +46,7 @@
 #include <linux/compat.h>
 #include <linux/refcount.h>
 #include <linux/uio.h>
+#include <linux/bits.h>
 
 #include <linux/sched/signal.h>
 #include <linux/fs.h>
@@ -70,6 +71,10 @@
 #include <linux/sizes.h>
 #include <linux/hugetlb.h>
 #include <linux/highmem.h>
+#include <linux/namei.h>
+#include <linux/fsnotify.h>
+#include <linux/fadvise.h>
+#include <linux/eventpoll.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -177,6 +182,21 @@ struct fixed_file_table {
 	struct file **files;
 };
enum {
FFD_F_ATOMIC,
};
struct fixed_file_data {
struct fixed_file_table *table;
struct io_ring_ctx *ctx;
struct percpu_ref refs;
struct llist_head put_llist;
unsigned long state;
struct work_struct ref_work;
struct completion done;
};
 struct io_ring_ctx {
 	struct {
 		struct percpu_ref refs;
@@ -184,10 +204,11 @@ struct io_ring_ctx {
 	struct {
 		unsigned int flags;
-		bool compat;
-		bool account_mem;
-		bool cq_overflow_flushed;
-		bool drain_next;
+		int compat: 1;
+		int account_mem: 1;
+		int cq_overflow_flushed: 1;
+		int drain_next: 1;
+		int eventfd_async: 1;
 
 		/*
 		 * Ring buffer of indices into array of io_uring_sqe, which is
@@ -207,13 +228,14 @@ struct io_ring_ctx {
 		unsigned sq_thread_idle;
 		unsigned cached_sq_dropped;
 		atomic_t cached_cq_overflow;
-		struct io_uring_sqe *sq_sqes;
+		unsigned long sq_check_overflow;
 
 		struct list_head defer_list;
 		struct list_head timeout_list;
 		struct list_head cq_overflow_list;
 
 		wait_queue_head_t inflight_wait;
+		struct io_uring_sqe *sq_sqes;
 	} ____cacheline_aligned_in_smp;
 
 	struct io_rings *rings;
@@ -229,8 +251,10 @@ struct io_ring_ctx {
 	 * readers must ensure that ->refs is alive as long as the file* is
 	 * used. Only updated through io_uring_register(2).
 	 */
-	struct fixed_file_table *file_table;
+	struct fixed_file_data *file_data;
 	unsigned nr_user_files;
+	int ring_fd;
+	struct file *ring_file;
 
 	/* if used, fixed mapped user buffers */
 	unsigned nr_user_bufs;
@@ -250,11 +274,14 @@ struct io_ring_ctx {
 	struct socket *ring_sock;
 #endif
 
+	struct idr personality_idr;
+
 	struct {
 		unsigned cached_cq_tail;
 		unsigned cq_entries;
 		unsigned cq_mask;
 		atomic_t cq_timeouts;
+		unsigned long cq_check_overflow;
 		struct wait_queue_head cq_wait;
 		struct fasync_struct *cq_fasync;
 		struct eventfd_ctx *cq_ev_fd;
@@ -267,7 +294,8 @@ struct io_ring_ctx {
 	struct {
 		spinlock_t completion_lock;
-		bool poll_multi_file;
+		struct llist_head poll_llist;
+
 		/*
 		 * ->poll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
@@ -277,6 +305,7 @@ struct io_ring_ctx {
 		struct list_head poll_list;
 		struct hlist_head *cancel_hash;
 		unsigned cancel_hash_bits;
+		bool poll_multi_file;
 
 		spinlock_t inflight_lock;
 		struct list_head inflight_list;
@@ -299,6 +328,12 @@ struct io_poll_iocb {
 	struct wait_queue_entry wait;
 };
struct io_close {
struct file *file;
struct file *put_file;
int fd;
};
 struct io_timeout_data {
 	struct io_kiocb *req;
 	struct hrtimer timer;
@@ -319,6 +354,7 @@ struct io_sync {
 	loff_t len;
 	loff_t off;
 	int flags;
+	int mode;
 };
 
 struct io_cancel {
@@ -348,8 +384,52 @@ struct io_connect {
 
 struct io_sr_msg {
 	struct file *file;
+	union {
 		struct user_msghdr __user *msg;
+		void __user *buf;
+	};
 	int msg_flags;
+	size_t len;
 };
struct io_open {
struct file *file;
int dfd;
union {
unsigned mask;
};
struct filename *filename;
struct statx __user *buffer;
struct open_how how;
};
struct io_files_update {
struct file *file;
u64 arg;
u32 nr_args;
u32 offset;
};
struct io_fadvise {
struct file *file;
u64 offset;
u32 len;
u32 advice;
};
struct io_madvise {
struct file *file;
u64 addr;
u32 len;
u32 advice;
};
struct io_epoll {
struct file *file;
int epfd;
int op;
int fd;
struct epoll_event event;
};
 struct io_async_connect {
@@ -370,15 +450,79 @@ struct io_async_rw {
 	ssize_t size;
 };
 
+struct io_async_open {
+	struct filename *filename;
+};
+
 struct io_async_ctx {
 	union {
 		struct io_async_rw rw;
 		struct io_async_msghdr msg;
 		struct io_async_connect connect;
 		struct io_timeout_data timeout;
+		struct io_async_open open;
 	};
 };
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
REQ_F_LINK_BIT = IOSQE_IO_LINK_BIT,
REQ_F_HARDLINK_BIT = IOSQE_IO_HARDLINK_BIT,
REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT,
REQ_F_LINK_NEXT_BIT,
REQ_F_FAIL_LINK_BIT,
REQ_F_INFLIGHT_BIT,
REQ_F_CUR_POS_BIT,
REQ_F_NOWAIT_BIT,
REQ_F_IOPOLL_COMPLETED_BIT,
REQ_F_LINK_TIMEOUT_BIT,
REQ_F_TIMEOUT_BIT,
REQ_F_ISREG_BIT,
REQ_F_MUST_PUNT_BIT,
REQ_F_TIMEOUT_NOSEQ_BIT,
REQ_F_COMP_LOCKED_BIT,
};
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
/* already grabbed next link */
REQ_F_LINK_NEXT = BIT(REQ_F_LINK_NEXT_BIT),
/* fail rest of links */
REQ_F_FAIL_LINK = BIT(REQ_F_FAIL_LINK_BIT),
/* on inflight list */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
/* polled IO has completed */
REQ_F_IOPOLL_COMPLETED = BIT(REQ_F_IOPOLL_COMPLETED_BIT),
/* has linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
/* timeout request */
REQ_F_TIMEOUT = BIT(REQ_F_TIMEOUT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
/* must be punted even for NONBLOCK */
REQ_F_MUST_PUNT = BIT(REQ_F_MUST_PUNT_BIT),
/* no timeout sequence */
REQ_F_TIMEOUT_NOSEQ = BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
/* completion under lock */
REQ_F_COMP_LOCKED = BIT(REQ_F_COMP_LOCKED_BIT),
};
 /*
  * NOTE! Each of the iocb union members has the file pointer
  * as the first entry in their struct definition. So you can
@@ -396,11 +540,19 @@ struct io_kiocb {
 		struct io_timeout timeout;
 		struct io_connect connect;
 		struct io_sr_msg sr_msg;
+		struct io_open open;
+		struct io_close close;
+		struct io_files_update files_update;
+		struct io_fadvise fadvise;
+		struct io_madvise madvise;
+		struct io_epoll epoll;
 	};
 
 	struct io_async_ctx *io;
-	struct file *ring_file;
-	int ring_fd;
+	/*
+	 * llist_node is only used for poll deferred completions
+	 */
+	struct llist_node llist_node;
 	bool has_user;
 	bool in_async;
 	bool needs_fixed_file;
@@ -414,23 +566,6 @@ struct io_kiocb {
 	struct list_head link_list;
 	unsigned int flags;
 	refcount_t refs;
#define REQ_F_NOWAIT 1 /* must not punt to workers */
#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */
#define REQ_F_FIXED_FILE 4 /* ctx owns file */
#define REQ_F_LINK_NEXT 8 /* already grabbed next link */
#define REQ_F_IO_DRAIN 16 /* drain existing IO first */
#define REQ_F_IO_DRAINED 32 /* drain done */
#define REQ_F_LINK 64 /* linked sqes */
#define REQ_F_LINK_TIMEOUT 128 /* has linked timeout */
#define REQ_F_FAIL_LINK 256 /* fail rest of links */
#define REQ_F_DRAIN_LINK 512 /* link should be fully drained */
#define REQ_F_TIMEOUT 1024 /* timeout request */
#define REQ_F_ISREG 2048 /* regular file */
#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */
#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */
 	u64 user_data;
 	u32 result;
 	u32 sequence;
@@ -463,14 +598,162 @@ struct io_submit_state {
 	unsigned int ios_left;
 };
struct io_op_def {
/* needs req->io allocated for deferral/async */
unsigned async_ctx : 1;
/* needs current->mm setup, does mm access */
unsigned needs_mm : 1;
/* needs req->file assigned */
unsigned needs_file : 1;
/* needs req->file assigned IFF fd is >= 0 */
unsigned fd_non_neg : 1;
/* hash wq insertion if file is a regular file */
unsigned hash_reg_file : 1;
/* unbound wq insertion if file is a non-regular file */
unsigned unbound_nonreg_file : 1;
/* opcode is not supported by this kernel */
unsigned not_supported : 1;
/* needs file table */
unsigned file_table : 1;
};
static const struct io_op_def io_op_defs[] = {
[IORING_OP_NOP] = {},
[IORING_OP_READV] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITEV] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
},
[IORING_OP_READ_FIXED] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
.hash_reg_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_POLL_REMOVE] = {},
[IORING_OP_SYNC_FILE_RANGE] = {
.needs_file = 1,
},
[IORING_OP_SENDMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_RECVMSG] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_TIMEOUT] = {
.async_ctx = 1,
.needs_mm = 1,
},
[IORING_OP_TIMEOUT_REMOVE] = {},
[IORING_OP_ACCEPT] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
.file_table = 1,
},
[IORING_OP_ASYNC_CANCEL] = {},
[IORING_OP_LINK_TIMEOUT] = {
.async_ctx = 1,
.needs_mm = 1,
},
[IORING_OP_CONNECT] = {
.async_ctx = 1,
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
},
[IORING_OP_OPENAT] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
},
[IORING_OP_CLOSE] = {
.needs_file = 1,
.file_table = 1,
},
[IORING_OP_FILES_UPDATE] = {
.needs_mm = 1,
.file_table = 1,
},
[IORING_OP_STATX] = {
.needs_mm = 1,
.needs_file = 1,
.fd_non_neg = 1,
},
[IORING_OP_READ] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_WRITE] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
},
[IORING_OP_MADVISE] = {
.needs_mm = 1,
},
[IORING_OP_SEND] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_RECV] = {
.needs_mm = 1,
.needs_file = 1,
.unbound_nonreg_file = 1,
},
[IORING_OP_OPENAT2] = {
.needs_file = 1,
.fd_non_neg = 1,
.file_table = 1,
},
[IORING_OP_EPOLL_CTL] = {
.unbound_nonreg_file = 1,
.file_table = 1,
},
};
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
-static void __io_free_req(struct io_kiocb *req);
 static void io_put_req(struct io_kiocb *req);
-static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
+static int __io_sqe_files_update(struct io_ring_ctx *ctx,
+				 struct io_uring_files_update *ip,
+				 unsigned nr_args);
+static int io_grab_files(struct io_kiocb *req);
 
 static struct kmem_cache *req_cachep;
@@ -537,9 +820,11 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	init_completion(&ctx->completions[0]);
 	init_completion(&ctx->completions[1]);
+	idr_init(&ctx->personality_idr);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
+	init_llist_head(&ctx->poll_llist);
 	INIT_LIST_HEAD(&ctx->poll_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
@@ -566,7 +851,7 @@ static inline bool __req_need_defer(struct io_kiocb *req)
 
 static inline bool req_need_defer(struct io_kiocb *req)
 {
-	if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) == REQ_F_IO_DRAIN)
+	if (unlikely(req->flags & REQ_F_IO_DRAIN))
 		return __req_need_defer(req);
 
 	return false;
@@ -606,7 +891,6 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 {
 	struct io_rings *rings = ctx->rings;
 
-	if (ctx->cached_cq_tail != READ_ONCE(rings->cq.tail)) {
 	/* order cqe stores with ring update */
 	smp_store_release(&rings->cq.tail, ctx->cached_cq_tail);
 
@@ -614,45 +898,46 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 		wake_up_interruptible(&ctx->cq_wait);
 		kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN);
 	}
-	}
 }
static inline void io_req_work_grab_env(struct io_kiocb *req,
const struct io_op_def *def)
{
if (!req->work.mm && def->needs_mm) {
mmgrab(current->mm);
req->work.mm = current->mm;
} }
if (!req->work.creds)
req->work.creds = get_current_cred();
} }
-static inline bool io_req_needs_user(struct io_kiocb *req)
+static inline void io_req_work_drop_env(struct io_kiocb *req)
 {
-	return !(req->opcode == IORING_OP_READ_FIXED ||
-		 req->opcode == IORING_OP_WRITE_FIXED);
+	if (req->work.mm) {
+		mmdrop(req->work.mm);
+		req->work.mm = NULL;
+	}
+	if (req->work.creds) {
+		put_cred(req->work.creds);
+		req->work.creds = NULL;
+	}
 }
 
 static inline bool io_prep_async_work(struct io_kiocb *req,
 				      struct io_kiocb **link)
 {
+	const struct io_op_def *def = &io_op_defs[req->opcode];
 	bool do_hashed = false;
 
-	switch (req->opcode) {
-	case IORING_OP_WRITEV:
-	case IORING_OP_WRITE_FIXED:
-		/* only regular files should be hashed for writes */
-		if (req->flags & REQ_F_ISREG)
+	if (req->flags & REQ_F_ISREG) {
+		if (def->hash_reg_file)
 			do_hashed = true;
-		/* fall-through */
-	case IORING_OP_READV:
-	case IORING_OP_READ_FIXED:
-	case IORING_OP_SENDMSG:
-	case IORING_OP_RECVMSG:
-	case IORING_OP_ACCEPT:
-	case IORING_OP_POLL_ADD:
-	case IORING_OP_CONNECT:
-		/*
-		 * We know REQ_F_ISREG is not set on some of these
-		 * opcodes, but this enables us to keep the check in
-		 * just one place.
-		 */
-		if (!(req->flags & REQ_F_ISREG))
+	} else {
+		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
-		break;
 	}
-	if (io_req_needs_user(req))
-		req->work.flags |= IO_WQ_WORK_NEEDS_USER;
+
+	io_req_work_grab_env(req, def);
 
 	*link = io_prep_linked_timeout(req);
 	return do_hashed;
@@ -711,10 +996,8 @@ static void io_commit_cqring(struct io_ring_ctx *ctx)
 
 	__io_commit_cqring(ctx);
 
-	while ((req = io_get_deferred_req(ctx)) != NULL) {
-		req->flags |= REQ_F_IO_DRAINED;
+	while ((req = io_get_deferred_req(ctx)) != NULL)
 		io_queue_async_work(req);
-	}
 }
 static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
@@ -735,13 +1018,20 @@ static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx)
 	return &rings->cqes[tail & ctx->cq_mask];
 }
 
+static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx)
+{
+	if (!ctx->eventfd_async)
+		return true;
+	return io_wq_current_is_worker() || in_interrupt();
+}
+
 static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 {
 	if (waitqueue_active(&ctx->wait))
 		wake_up(&ctx->wait);
 	if (waitqueue_active(&ctx->sqo_wait))
 		wake_up(&ctx->sqo_wait);
-	if (ctx->cq_ev_fd)
+	if (ctx->cq_ev_fd && io_should_trigger_evfd(ctx))
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
@@ -766,7 +1056,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 
 	/* if force is set, the ring is going away. always drop after that */
 	if (force)
-		ctx->cq_overflow_flushed = true;
+		ctx->cq_overflow_flushed = 1;
 
 	cqe = NULL;
 	while (!list_empty(&ctx->cq_overflow_list)) {
@@ -788,6 +1078,10 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	}
 
 	io_commit_cqring(ctx);
+	if (cqe) {
+		clear_bit(0, &ctx->sq_check_overflow);
+		clear_bit(0, &ctx->cq_check_overflow);
+	}
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
@@ -821,6 +1115,10 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 		WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
 	} else {
+		if (list_empty(&ctx->cq_overflow_list)) {
+			set_bit(0, &ctx->sq_check_overflow);
+			set_bit(0, &ctx->cq_check_overflow);
+		}
 		refcount_inc(&req->refs);
 		req->result = res;
 		list_add_tail(&req->list, &ctx->cq_overflow_list);
@@ -863,9 +1161,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
-	if (!percpu_ref_tryget(&ctx->refs))
-		return NULL;
-
 	if (!state) {
 		req = kmem_cache_alloc(req_cachep, gfp);
 		if (unlikely(!req))
@@ -898,7 +1193,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 got_it:
 	req->io = NULL;
-	req->ring_file = NULL;
 	req->file = NULL;
 	req->ctx = ctx;
 	req->flags = 0;
@@ -915,24 +1209,35 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	return NULL;
 }
-static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
+static void __io_req_do_free(struct io_kiocb *req)
 {
-	if (*nr) {
-		kmem_cache_free_bulk(req_cachep, *nr, reqs);
-		percpu_ref_put_many(&ctx->refs, *nr);
-		*nr = 0;
-	}
+	if (likely(!io_is_fallback_req(req)))
+		kmem_cache_free(req_cachep, req);
+	else
+		clear_bit_unlock(0, (unsigned long *) req->ctx->fallback_req);
 }
 
-static void __io_free_req(struct io_kiocb *req)
+static void __io_req_aux_free(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	if (req->io)
 	kfree(req->io);
-	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
+	if (req->file) {
+		if (req->flags & REQ_F_FIXED_FILE)
+			percpu_ref_put(&ctx->file_data->refs);
+		else
 			fput(req->file);
+	}
+
+	io_req_work_drop_env(req);
+}
+
+static void __io_free_req(struct io_kiocb *req)
+{
+	__io_req_aux_free(req);
+
 	if (req->flags & REQ_F_INFLIGHT) {
+		struct io_ring_ctx *ctx = req->ctx;
 		unsigned long flags;
 
 		spin_lock_irqsave(&ctx->inflight_lock, flags);
@@ -941,11 +1246,63 @@ static void __io_free_req(struct io_kiocb *req)
 			wake_up(&ctx->inflight_wait);
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
-	percpu_ref_put(&ctx->refs);
-	if (likely(!io_is_fallback_req(req)))
-		kmem_cache_free(req_cachep, req);
-	else
-		clear_bit_unlock(0, (unsigned long *) ctx->fallback_req);
+
+	percpu_ref_put(&req->ctx->refs);
+	__io_req_do_free(req);
 }
struct req_batch {
void *reqs[IO_IOPOLL_BATCH];
int to_free;
int need_iter;
};
static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
{
int fixed_refs = rb->to_free;
if (!rb->to_free)
return;
if (rb->need_iter) {
int i, inflight = 0;
unsigned long flags;
fixed_refs = 0;
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_FIXED_FILE) {
req->file = NULL;
fixed_refs++;
}
if (req->flags & REQ_F_INFLIGHT)
inflight++;
__io_req_aux_free(req);
}
if (!inflight)
goto do_free;
spin_lock_irqsave(&ctx->inflight_lock, flags);
for (i = 0; i < rb->to_free; i++) {
struct io_kiocb *req = rb->reqs[i];
if (req->flags & REQ_F_INFLIGHT) {
list_del(&req->inflight_entry);
if (!--inflight)
break;
}
}
spin_unlock_irqrestore(&ctx->inflight_lock, flags);
if (waitqueue_active(&ctx->inflight_wait))
wake_up(&ctx->inflight_wait);
}
do_free:
kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
if (fixed_refs)
percpu_ref_put_many(&ctx->file_data->refs, fixed_refs);
percpu_ref_put_many(&ctx->refs, rb->to_free);
rb->to_free = rb->need_iter = 0;
} }
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1118,19 +1475,21 @@ static unsigned io_cqring_events(struct io_ring_ctx *ctx, bool noflush)
 {
 	struct io_rings *rings = ctx->rings;
 
-	/*
-	 * noflush == true is from the waitqueue handler, just ensure we wake
-	 * up the task, and the next invocation will flush the entries. We
-	 * cannot safely to it from here.
-	 */
-	if (noflush && !list_empty(&ctx->cq_overflow_list))
-		return -1U;
+	if (test_bit(0, &ctx->cq_check_overflow)) {
+		/*
+		 * noflush == true is from the waitqueue handler, just ensure
+		 * we wake up the task, and the next invocation will flush the
+		 * entries. We cannot safely to it from here.
+		 */
+		if (noflush && !list_empty(&ctx->cq_overflow_list))
+			return -1U;
 
-	io_cqring_overflow_flush(ctx, false);
+		io_cqring_overflow_flush(ctx, false);
+	}
 
 	/* See comment at the top of this file */
 	smp_rmb();
-	return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head);
+	return ctx->cached_cq_tail - READ_ONCE(rings->cq.head);
 }
 
 static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
@@ -1141,17 +1500,30 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
{
if ((req->flags & REQ_F_LINK) || io_is_fallback_req(req))
return false;
if (!(req->flags & REQ_F_FIXED_FILE) || req->io)
rb->need_iter++;
rb->reqs[rb->to_free++] = req;
if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
io_free_req_many(req->ctx, rb);
return true;
}
 /*
  * Find and free completed poll iocbs
  */
 static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 			       struct list_head *done)
 {
-	void *reqs[IO_IOPOLL_BATCH];
+	struct req_batch rb;
 	struct io_kiocb *req;
-	int to_free;
 
-	to_free = 0;
+	rb.to_free = rb.need_iter = 0;
 	while (!list_empty(done)) {
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
@@ -1159,26 +1531,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		io_cqring_fill_event(req, req->result);
 		(*nr_events)++;
 
-		if (refcount_dec_and_test(&req->refs)) {
-			/* If we're not using fixed files, we have to pair the
-			 * completion part with the file put. Use regular
-			 * completions for those, only batch free for fixed
-			 * file and non-linked commands.
-			 */
-			if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
-			    REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
-			    !req->io) {
-				reqs[to_free++] = req;
-				if (to_free == ARRAY_SIZE(reqs))
-					io_free_req_many(ctx, reqs, &to_free);
-			} else {
-				io_free_req(req);
-			}
-		}
+		if (refcount_dec_and_test(&req->refs) &&
+		    !io_req_multi_free(&rb, req))
+			io_free_req(req);
 	}
 
 	io_commit_cqring(ctx);
-	io_free_req_many(ctx, reqs, &to_free);
+	io_free_req_many(ctx, &rb);
 }
 static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
@@ -1503,6 +1862,10 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		req->flags |= REQ_F_ISREG;
 
 	kiocb->ki_pos = READ_ONCE(sqe->off);
+	if (kiocb->ki_pos == -1 && !(req->file->f_mode & FMODE_STREAM)) {
+		req->flags |= REQ_F_CUR_POS;
+		kiocb->ki_pos = req->file->f_pos;
+	}
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -1574,6 +1937,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 static void kiocb_done(struct kiocb *kiocb, ssize_t ret, struct io_kiocb **nxt,
 		       bool in_async)
 {
+	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+
+	if (req->flags & REQ_F_CUR_POS)
+		req->file->f_pos = kiocb->ki_pos;
 	if (in_async && ret >= 0 && kiocb->ki_complete == io_complete_rw)
 		*nxt = __io_complete_rw(kiocb, ret);
 	else
@@ -1671,6 +2038,13 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->rw.kiocb.private)
 		return -EINVAL;
if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) {
ssize_t ret;
ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
*iovec = NULL;
return ret;
}
 	if (req->io) {
 		struct io_async_rw *iorw = &req->io->rw;
@@ -1767,6 +2141,8 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
 
 static int io_alloc_async_ctx(struct io_kiocb *req)
 {
+	if (!io_op_defs[req->opcode].async_ctx)
+		return 0;
 	req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
 	return req->io == NULL;
 }
@@ -1786,8 +2162,7 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 			     struct iovec *iovec, struct iovec *fast_iov,
 			     struct iov_iter *iter)
 {
-	if (req->opcode == IORING_OP_READ_FIXED ||
-	    req->opcode == IORING_OP_WRITE_FIXED)
+	if (!io_op_defs[req->opcode].async_ctx)
 		return 0;
 	if (!req->io && io_alloc_async_ctx(req))
 		return -ENOMEM;
@@ -2101,35 +2476,14 @@ static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt,
 	return 0;
 }
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) static void io_fallocate_finish(struct io_wq_work **workptr)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->file)
return -EBADF;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->len);
req->sync.flags = READ_ONCE(sqe->sync_range_flags);
return 0;
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{ {
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL; struct io_kiocb *nxt = NULL;
int ret; int ret;
if (io_req_cancelled(req)) ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
return; req->sync.len);
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0) if (ret < 0)
req_set_fail_links(req); req_set_fail_links(req);
io_cqring_add_event(req, ret); io_cqring_add_event(req, ret);
...@@ -2138,59 +2492,496 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr) ...@@ -2138,59 +2492,496 @@ static void io_sync_file_range_finish(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt); io_wq_assign_next(workptr, nxt);
} }
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, static int io_fallocate_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->rw_flags)
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->addr);
req->sync.mode = READ_ONCE(sqe->len);
return 0;
}
static int io_fallocate(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock) bool force_nonblock)
{ {
struct io_wq_work *work, *old_work; struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */ /* fallocate always requiring blocking context */
if (force_nonblock) { if (force_nonblock) {
io_put_req(req); io_put_req(req);
req->work.func = io_sync_file_range_finish; req->work.func = io_fallocate_finish;
return -EAGAIN; return -EAGAIN;
} }
work = old_work = &req->work; work = old_work = &req->work;
io_sync_file_range_finish(&work); io_fallocate_finish(&work);
if (work && work != old_work) if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work); *nxt = container_of(work, struct io_kiocb, work);
return 0; return 0;
} }
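Read off io_fallocate_prep() above, the new fallocate opcode carries the offset in sqe->off, the length in sqe->addr and the mode in sqe->len. A minimal userspace sketch of filling such an SQE, assuming the 5.6 uapi header; the helper name is invented for illustration:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/types.h>

/* Fill an already-allocated SQE for IORING_OP_FALLOCATE. */
static void sqe_prep_fallocate(struct io_uring_sqe *sqe, int fd,
                               int mode, off_t offset, off_t len)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FALLOCATE;
        sqe->fd = fd;           /* file to operate on */
        sqe->off = offset;      /* req->sync.off */
        sqe->addr = len;        /* req->sync.len: a length, not a pointer */
        sqe->len = mode;        /* req->sync.mode, e.g. FALLOC_FL_KEEP_SIZE */
}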
#if defined(CONFIG_NET) static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
static void io_sendrecv_async(struct io_wq_work **workptr)
{ {
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); const char __user *fname;
struct iovec *iov = NULL; int ret;
if (req->io->rw.iov != req->io->rw.fast_iov) if (sqe->ioprio || sqe->buf_index)
iov = req->io->msg.iov; return -EINVAL;
io_wq_submit_work(workptr);
kfree(iov);
}
#endif
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->open.dfd = READ_ONCE(sqe->fd);
{ req->open.how.mode = READ_ONCE(sqe->len);
#if defined(CONFIG_NET) fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
struct io_sr_msg *sr = &req->sr_msg; req->open.how.flags = READ_ONCE(sqe->open_flags);
struct io_async_ctx *io = req->io;
sr->msg_flags = READ_ONCE(sqe->msg_flags); req->open.filename = getname(fname);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
if (!io)
return 0; return 0;
io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
#else
return -EOPNOTSUPP;
#endif
} }
static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct open_how __user *how;
const char __user *fname;
size_t len;
int ret;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
req->open.dfd = READ_ONCE(sqe->fd);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
how = u64_to_user_ptr(READ_ONCE(sqe->addr2));
len = READ_ONCE(sqe->len);
if (len < OPEN_HOW_SIZE_VER0)
return -EINVAL;
ret = copy_struct_from_user(&req->open.how, sizeof(req->open.how), how,
len);
if (ret)
return ret;
if (!(req->open.how.flags & O_PATH) && force_o_largefile())
req->open.how.flags |= O_LARGEFILE;
req->open.filename = getname(fname);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
return 0;
}
static int io_openat2(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct open_flags op;
struct file *file;
int ret;
if (force_nonblock)
return -EAGAIN;
ret = build_open_flags(&req->open.how, &op);
if (ret)
goto err;
ret = get_unused_fd_flags(req->open.how.flags);
if (ret < 0)
goto err;
file = do_filp_open(req->open.dfd, req->open.filename, &op);
if (IS_ERR(file)) {
put_unused_fd(ret);
ret = PTR_ERR(file);
} else {
fsnotify_open(file);
fd_install(ret, file);
}
err:
putname(req->open.filename);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
static int io_openat(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
req->open.how = build_open_how(req->open.how.flags, req->open.how.mode);
return io_openat2(req, nxt, force_nonblock);
}
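io_openat_prep() and io_openat2_prep() above define the SQE layout for the two open variants: openat takes the mode in sqe->len and the flags in sqe->open_flags, while openat2 passes a struct open_how via sqe->addr2 with its size in sqe->len. A sketch under those assumptions, with invented helper names:

#include <linux/io_uring.h>
#include <linux/openat2.h>
#include <string.h>
#include <sys/types.h>

static void sqe_prep_openat(struct io_uring_sqe *sqe, int dfd,
                            const char *path, int flags, mode_t mode)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT;
        sqe->fd = dfd;                          /* req->open.dfd */
        sqe->addr = (unsigned long) path;       /* filename */
        sqe->len = mode;                        /* req->open.how.mode */
        sqe->open_flags = flags;                /* req->open.how.flags */
}

static void sqe_prep_openat2(struct io_uring_sqe *sqe, int dfd,
                             const char *path, struct open_how *how)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_OPENAT2;
        sqe->fd = dfd;
        sqe->addr = (unsigned long) path;       /* filename */
        sqe->addr2 = (unsigned long) how;       /* copied via copy_struct_from_user() */
        sqe->len = sizeof(*how);                /* must be >= OPEN_HOW_SIZE_VER0 */
}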
static int io_epoll_ctl_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_EPOLL)
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
req->epoll.epfd = READ_ONCE(sqe->fd);
req->epoll.op = READ_ONCE(sqe->len);
req->epoll.fd = READ_ONCE(sqe->off);
if (ep_op_has_event(req->epoll.op)) {
struct epoll_event __user *ev;
ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&req->epoll.event, ev, sizeof(*ev)))
return -EFAULT;
}
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_epoll_ctl(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_EPOLL)
struct io_epoll *ie = &req->epoll;
int ret;
ret = do_epoll_ctl(ie->epfd, ie->op, ie->fd, &ie->event, force_nonblock);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
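Per io_epoll_ctl_prep(), the epoll opcode packs the epoll fd, the operation and the target fd into sqe->fd, sqe->len and sqe->off, with the event structure behind sqe->addr. A hedged userspace sketch; the helper name is illustrative only:

#include <linux/io_uring.h>
#include <sys/epoll.h>
#include <string.h>

static void sqe_prep_epoll_ctl(struct io_uring_sqe *sqe, int epfd, int op,
                               int fd, struct epoll_event *ev)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_EPOLL_CTL;
        sqe->fd = epfd;                         /* req->epoll.epfd */
        sqe->len = op;                          /* EPOLL_CTL_ADD/MOD/DEL */
        sqe->off = fd;                          /* target file descriptor */
        sqe->addr = (unsigned long) ev;         /* ignored for EPOLL_CTL_DEL */
}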
static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
if (sqe->ioprio || sqe->buf_index || sqe->off)
return -EINVAL;
req->madvise.addr = READ_ONCE(sqe->addr);
req->madvise.len = READ_ONCE(sqe->len);
req->madvise.advice = READ_ONCE(sqe->fadvise_advice);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
static int io_madvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU)
struct io_madvise *ma = &req->madvise;
int ret;
if (force_nonblock)
return -EAGAIN;
ret = do_madvise(ma->addr, ma->len, ma->advice);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
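io_madvise_prep() maps the address, length and advice onto sqe->addr, sqe->len and the shared fadvise_advice field. A small sketch, assuming the 5.6 uapi header and an invented helper name:

#include <linux/io_uring.h>
#include <string.h>

static void sqe_prep_madvise(struct io_uring_sqe *sqe, void *addr,
                             size_t length, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_MADVISE;
        sqe->addr = (unsigned long) addr;       /* req->madvise.addr */
        sqe->len = length;                      /* req->madvise.len */
        sqe->fadvise_advice = advice;           /* MADV_*, shares the fadvise field */
}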
static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
if (sqe->ioprio || sqe->buf_index || sqe->addr)
return -EINVAL;
req->fadvise.offset = READ_ONCE(sqe->off);
req->fadvise.len = READ_ONCE(sqe->len);
req->fadvise.advice = READ_ONCE(sqe->fadvise_advice);
return 0;
}
static int io_fadvise(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_fadvise *fa = &req->fadvise;
int ret;
/* DONTNEED may block, others _should_ not */
if (fa->advice == POSIX_FADV_DONTNEED && force_nonblock)
return -EAGAIN;
ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
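The fadvise layout mirrors the prep function above: offset in sqe->off, length in sqe->len, advice in sqe->fadvise_advice. A sketch with an invented helper name:

#include <linux/io_uring.h>
#include <string.h>
#include <sys/types.h>

static void sqe_prep_fadvise(struct io_uring_sqe *sqe, int fd, off_t offset,
                             off_t length, int advice)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FADVISE;
        sqe->fd = fd;
        sqe->off = offset;              /* req->fadvise.offset */
        sqe->len = length;              /* req->fadvise.len */
        sqe->fadvise_advice = advice;   /* e.g. POSIX_FADV_DONTNEED, which may punt to async */
}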
static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
const char __user *fname;
unsigned lookup_flags;
int ret;
if (sqe->ioprio || sqe->buf_index)
return -EINVAL;
req->open.dfd = READ_ONCE(sqe->fd);
req->open.mask = READ_ONCE(sqe->len);
fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
req->open.buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2));
req->open.how.flags = READ_ONCE(sqe->statx_flags);
if (vfs_stat_set_lookup_flags(&lookup_flags, req->open.how.flags))
return -EINVAL;
req->open.filename = getname_flags(fname, lookup_flags, NULL);
if (IS_ERR(req->open.filename)) {
ret = PTR_ERR(req->open.filename);
req->open.filename = NULL;
return ret;
}
return 0;
}
static int io_statx(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_open *ctx = &req->open;
unsigned lookup_flags;
struct path path;
struct kstat stat;
int ret;
if (force_nonblock)
return -EAGAIN;
if (vfs_stat_set_lookup_flags(&lookup_flags, ctx->how.flags))
return -EINVAL;
retry:
/* filename_lookup() drops it, keep a reference */
ctx->filename->refcnt++;
ret = filename_lookup(ctx->dfd, ctx->filename, lookup_flags, &path,
NULL);
if (ret)
goto err;
ret = vfs_getattr(&path, &stat, ctx->mask, ctx->how.flags);
path_put(&path);
if (retry_estale(ret, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
if (!ret)
ret = cp_statx(&stat, ctx->buffer);
err:
putname(ctx->filename);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, nxt);
return 0;
}
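io_statx_prep() reads the directory fd, mask, pathname, output buffer and flags from sqe->fd, sqe->len, sqe->addr, sqe->addr2 and sqe->statx_flags respectively. A sketch under that mapping; the helper name and header usage are assumptions:

#include <linux/io_uring.h>
#include <linux/stat.h>
#include <string.h>

static void sqe_prep_statx(struct io_uring_sqe *sqe, int dfd, const char *path,
                           int flags, unsigned int mask, struct statx *buf)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_STATX;
        sqe->fd = dfd;                          /* req->open.dfd */
        sqe->addr = (unsigned long) path;       /* filename */
        sqe->addr2 = (unsigned long) buf;       /* req->open.buffer */
        sqe->len = mask;                        /* req->open.mask, e.g. STATX_BASIC_STATS */
        sqe->statx_flags = flags;               /* AT_* lookup flags */
}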
static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
/*
* If we queue this for async, it must not be cancellable. That would
* leave the 'file' in an indeterminate state.
*/
req->work.flags |= IO_WQ_WORK_NO_CANCEL;
if (sqe->ioprio || sqe->off || sqe->addr || sqe->len ||
sqe->rw_flags || sqe->buf_index)
return -EINVAL;
if (sqe->flags & IOSQE_FIXED_FILE)
return -EINVAL;
req->close.fd = READ_ONCE(sqe->fd);
if (req->file->f_op == &io_uring_fops ||
req->close.fd == req->ctx->ring_fd)
return -EBADF;
return 0;
}
static void io_close_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
/* Invoked with files, we need to do the close */
if (req->work.files) {
int ret;
ret = filp_close(req->close.put_file, req->work.files);
if (ret < 0) {
req_set_fail_links(req);
}
io_cqring_add_event(req, ret);
}
fput(req->close.put_file);
/* we bypassed the re-issue, drop the submission reference */
io_put_req(req);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static int io_close(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
int ret;
req->close.put_file = NULL;
ret = __close_fd_get_file(req->close.fd, &req->close.put_file);
if (ret < 0)
return ret;
/* if the file has a flush method, be safe and punt to async */
if (req->close.put_file->f_op->flush && !io_wq_current_is_worker())
goto eagain;
/*
* No ->flush(), safely close from here and just punt the
* fput() to async context.
*/
ret = filp_close(req->close.put_file, current->files);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
if (io_wq_current_is_worker()) {
struct io_wq_work *old_work, *work;
old_work = work = &req->work;
io_close_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
return 0;
}
eagain:
req->work.func = io_close_finish;
return -EAGAIN;
}
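io_close_prep() only consumes sqe->fd, and rejects fixed files as well as the ring fd itself. A minimal sketch (helper name invented):

#include <linux/io_uring.h>
#include <string.h>

static void sqe_prep_close(struct io_uring_sqe *sqe, int fd)
{
        memset(sqe, 0, sizeof(*sqe));   /* all other fields must be zero */
        sqe->opcode = IORING_OP_CLOSE;
        sqe->fd = fd;                   /* must not be the ring fd or a fixed file */
}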
static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ring_ctx *ctx = req->ctx;
if (!req->file)
return -EBADF;
if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index))
return -EINVAL;
req->sync.off = READ_ONCE(sqe->off);
req->sync.len = READ_ONCE(sqe->len);
req->sync.flags = READ_ONCE(sqe->sync_range_flags);
return 0;
}
static void io_sync_file_range_finish(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
int ret;
if (io_req_cancelled(req))
return;
ret = sync_file_range(req->file, req->sync.off, req->sync.len,
req->sync.flags);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req_find_next(req, &nxt);
if (nxt)
io_wq_assign_next(workptr, nxt);
}
static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
struct io_wq_work *work, *old_work;
/* sync_file_range always requires a blocking context */
if (force_nonblock) {
io_put_req(req);
req->work.func = io_sync_file_range_finish;
return -EAGAIN;
}
work = old_work = &req->work;
io_sync_file_range_finish(&work);
if (work && work != old_work)
*nxt = container_of(work, struct io_kiocb, work);
return 0;
}
#if defined(CONFIG_NET)
static void io_sendrecv_async(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct iovec *iov = NULL;
if (req->io->rw.iov != req->io->rw.fast_iov)
iov = req->io->msg.iov;
io_wq_submit_work(workptr);
kfree(iov);
}
#endif
static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
#if defined(CONFIG_NET)
struct io_sr_msg *sr = &req->sr_msg;
struct io_async_ctx *io = req->io;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
sr->len = READ_ONCE(sqe->len);
if (!io || req->opcode == IORING_OP_SEND)
return 0;
io->msg.iov = io->msg.fast_iov;
return sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
&io->msg.iov);
#else
return -EOPNOTSUPP;
#endif
}
static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock) bool force_nonblock)
{ {
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
...@@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -2259,6 +3050,56 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif #endif
} }
static int io_send(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
struct iovec iov;
unsigned flags;
ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
&msg.msg_iter);
if (ret)
return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
ret = __sys_sendmsg_sock(sock, &msg, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
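The non-vectored send path above takes a single buffer rather than an iovec: pointer in sqe->addr, length in sqe->len, flags in sqe->msg_flags. A sketch with an illustrative helper:

#include <linux/io_uring.h>
#include <sys/socket.h>
#include <string.h>

static void sqe_prep_send(struct io_uring_sqe *sqe, int sockfd,
                          const void *buf, size_t len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_SEND;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;        /* single buffer, no iovec */
        sqe->len = len;
        sqe->msg_flags = flags;                 /* MSG_*, e.g. MSG_NOSIGNAL */
}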
static int io_recvmsg_prep(struct io_kiocb *req, static int io_recvmsg_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe) const struct io_uring_sqe *sqe)
{ {
...@@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, ...@@ -2269,7 +3110,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
sr->msg_flags = READ_ONCE(sqe->msg_flags); sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (!io) if (!io || req->opcode == IORING_OP_RECV)
return 0; return 0;
io->msg.iov = io->msg.fast_iov; io->msg.iov = io->msg.fast_iov;
...@@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -2351,6 +3192,59 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
#endif #endif
} }
static int io_recv(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock)
{
#if defined(CONFIG_NET)
struct socket *sock;
int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct io_sr_msg *sr = &req->sr_msg;
struct msghdr msg;
struct iovec iov;
unsigned flags;
ret = import_single_range(READ, sr->buf, sr->len, &iov,
&msg.msg_iter);
if (ret)
return ret;
msg.msg_name = NULL;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_namelen = 0;
msg.msg_iocb = NULL;
msg.msg_flags = 0;
flags = req->sr_msg.msg_flags;
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
ret = __sys_recvmsg_sock(sock, &msg, NULL, NULL, flags);
if (force_nonblock && ret == -EAGAIN)
return -EAGAIN;
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
io_cqring_add_event(req, ret);
if (ret < 0)
req_set_fail_links(req);
io_put_req_find_next(req, nxt);
return 0;
#else
return -EOPNOTSUPP;
#endif
}
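IORING_OP_RECV mirrors the send layout, with the receive buffer in sqe->addr. A matching sketch, again with an invented helper name:

#include <linux/io_uring.h>
#include <sys/socket.h>
#include <string.h>

static void sqe_prep_recv(struct io_uring_sqe *sqe, int sockfd,
                          void *buf, size_t len, int flags)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_RECV;
        sqe->fd = sockfd;
        sqe->addr = (unsigned long) buf;        /* single receive buffer */
        sqe->len = len;
        sqe->msg_flags = flags;                 /* MSG_* flags */
}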
static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{ {
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
...@@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -2414,7 +3308,6 @@ static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt,
ret = __io_accept(req, nxt, force_nonblock); ret = __io_accept(req, nxt, force_nonblock);
if (ret == -EAGAIN && force_nonblock) { if (ret == -EAGAIN && force_nonblock) {
req->work.func = io_accept_finish; req->work.func = io_accept_finish;
req->work.flags |= IO_WQ_WORK_NEEDS_FILES;
io_put_req(req); io_put_req(req);
return -EAGAIN; return -EAGAIN;
} }
...@@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr) ...@@ -2635,6 +3528,39 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt); io_wq_assign_next(workptr, nxt);
} }
static void __io_poll_flush(struct io_ring_ctx *ctx, struct llist_node *nodes)
{
struct io_kiocb *req, *tmp;
struct req_batch rb;
rb.to_free = rb.need_iter = 0;
spin_lock_irq(&ctx->completion_lock);
llist_for_each_entry_safe(req, tmp, nodes, llist_node) {
hash_del(&req->hash_node);
io_poll_complete(req, req->result, 0);
if (refcount_dec_and_test(&req->refs) &&
!io_req_multi_free(&rb, req)) {
req->flags |= REQ_F_COMP_LOCKED;
io_free_req(req);
}
}
spin_unlock_irq(&ctx->completion_lock);
io_cqring_ev_posted(ctx);
io_free_req_many(ctx, &rb);
}
static void io_poll_flush(struct io_wq_work **workptr)
{
struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
struct llist_node *nodes;
nodes = llist_del_all(&req->ctx->poll_llist);
if (nodes)
__io_poll_flush(req->ctx, nodes);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
void *key) void *key)
{ {
...@@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, ...@@ -2642,7 +3568,6 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); struct io_kiocb *req = container_of(poll, struct io_kiocb, poll);
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
__poll_t mask = key_to_poll(key); __poll_t mask = key_to_poll(key);
unsigned long flags;
/* for instances that support it check for an event match first: */ /* for instances that support it check for an event match first: */
if (mask && !(mask & poll->events)) if (mask && !(mask & poll->events))
...@@ -2656,7 +3581,11 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, ...@@ -2656,7 +3581,11 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* If we have a link timeout we're going to need the completion_lock * If we have a link timeout we're going to need the completion_lock
* for finalizing the request, mark us as having grabbed that already. * for finalizing the request, mark us as having grabbed that already.
*/ */
if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { if (mask) {
unsigned long flags;
if (llist_empty(&ctx->poll_llist) &&
spin_trylock_irqsave(&ctx->completion_lock, flags)) {
hash_del(&req->hash_node); hash_del(&req->hash_node);
io_poll_complete(req, mask, 0); io_poll_complete(req, mask, 0);
req->flags |= REQ_F_COMP_LOCKED; req->flags |= REQ_F_COMP_LOCKED;
...@@ -2664,9 +3593,19 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, ...@@ -2664,9 +3593,19 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
spin_unlock_irqrestore(&ctx->completion_lock, flags); spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx); io_cqring_ev_posted(ctx);
req = NULL;
} else { } else {
io_queue_async_work(req); req->result = mask;
req->llist_node.next = NULL;
/* if the list wasn't empty, we're done */
if (!llist_add(&req->llist_node, &ctx->poll_llist))
req = NULL;
else
req->work.func = io_poll_flush;
} }
}
if (req)
io_queue_async_work(req);
return 1; return 1;
} }
...@@ -3059,15 +3998,52 @@ static int io_async_cancel_prep(struct io_kiocb *req, ...@@ -3059,15 +3998,52 @@ static int io_async_cancel_prep(struct io_kiocb *req,
sqe->cancel_flags) sqe->cancel_flags)
return -EINVAL; return -EINVAL;
req->cancel.addr = READ_ONCE(sqe->addr); req->cancel.addr = READ_ONCE(sqe->addr);
return 0;
}
static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt)
{
struct io_ring_ctx *ctx = req->ctx;
io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0);
return 0;
}
static int io_files_update_prep(struct io_kiocb *req,
const struct io_uring_sqe *sqe)
{
if (sqe->flags || sqe->ioprio || sqe->rw_flags)
return -EINVAL;
req->files_update.offset = READ_ONCE(sqe->off);
req->files_update.nr_args = READ_ONCE(sqe->len);
if (!req->files_update.nr_args)
return -EINVAL;
req->files_update.arg = READ_ONCE(sqe->addr);
return 0; return 0;
} }
static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) static int io_files_update(struct io_kiocb *req, bool force_nonblock)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_uring_files_update up;
int ret;
io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); if (force_nonblock)
return -EAGAIN;
up.offset = req->files_update.offset;
up.fds = req->files_update.arg;
mutex_lock(&ctx->uring_lock);
ret = __io_sqe_files_update(ctx, &up, req->files_update.nr_args);
mutex_unlock(&ctx->uring_lock);
if (ret < 0)
req_set_fail_links(req);
io_cqring_add_event(req, ret);
io_put_req(req);
return 0; return 0;
} }
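io_files_update_prep() expects an array of descriptors at sqe->addr, the count in sqe->len (which must be non-zero) and the table offset in sqe->off; per __io_sqe_files_update(), a value of -1 clears a slot. A sketch, helper name invented:

#include <linux/io_uring.h>
#include <string.h>

/* Swap entries in the registered file table; -1 in fds[] clears a slot. */
static void sqe_prep_files_update(struct io_uring_sqe *sqe, __s32 *fds,
                                  unsigned int nr, unsigned int offset)
{
        memset(sqe, 0, sizeof(*sqe));
        sqe->opcode = IORING_OP_FILES_UPDATE;
        sqe->addr = (unsigned long) fds;        /* req->files_update.arg */
        sqe->len = nr;                          /* req->files_update.nr_args */
        sqe->off = offset;                      /* req->files_update.offset */
}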
...@@ -3076,15 +4052,25 @@ static int io_req_defer_prep(struct io_kiocb *req, ...@@ -3076,15 +4052,25 @@ static int io_req_defer_prep(struct io_kiocb *req,
{ {
ssize_t ret = 0; ssize_t ret = 0;
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req);
if (unlikely(ret))
return ret;
}
io_req_work_grab_env(req, &io_op_defs[req->opcode]);
switch (req->opcode) { switch (req->opcode) {
case IORING_OP_NOP: case IORING_OP_NOP:
break; break;
case IORING_OP_READV: case IORING_OP_READV:
case IORING_OP_READ_FIXED: case IORING_OP_READ_FIXED:
case IORING_OP_READ:
ret = io_read_prep(req, sqe, true); ret = io_read_prep(req, sqe, true);
break; break;
case IORING_OP_WRITEV: case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
ret = io_write_prep(req, sqe, true); ret = io_write_prep(req, sqe, true);
break; break;
case IORING_OP_POLL_ADD: case IORING_OP_POLL_ADD:
...@@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req, ...@@ -3100,9 +4086,11 @@ static int io_req_defer_prep(struct io_kiocb *req,
ret = io_prep_sfr(req, sqe); ret = io_prep_sfr(req, sqe);
break; break;
case IORING_OP_SENDMSG: case IORING_OP_SENDMSG:
case IORING_OP_SEND:
ret = io_sendmsg_prep(req, sqe); ret = io_sendmsg_prep(req, sqe);
break; break;
case IORING_OP_RECVMSG: case IORING_OP_RECVMSG:
case IORING_OP_RECV:
ret = io_recvmsg_prep(req, sqe); ret = io_recvmsg_prep(req, sqe);
break; break;
case IORING_OP_CONNECT: case IORING_OP_CONNECT:
...@@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req, ...@@ -3123,6 +4111,33 @@ static int io_req_defer_prep(struct io_kiocb *req,
case IORING_OP_ACCEPT: case IORING_OP_ACCEPT:
ret = io_accept_prep(req, sqe); ret = io_accept_prep(req, sqe);
break; break;
case IORING_OP_FALLOCATE:
ret = io_fallocate_prep(req, sqe);
break;
case IORING_OP_OPENAT:
ret = io_openat_prep(req, sqe);
break;
case IORING_OP_CLOSE:
ret = io_close_prep(req, sqe);
break;
case IORING_OP_FILES_UPDATE:
ret = io_files_update_prep(req, sqe);
break;
case IORING_OP_STATX:
ret = io_statx_prep(req, sqe);
break;
case IORING_OP_FADVISE:
ret = io_fadvise_prep(req, sqe);
break;
case IORING_OP_MADVISE:
ret = io_madvise_prep(req, sqe);
break;
case IORING_OP_OPENAT2:
ret = io_openat2_prep(req, sqe);
break;
case IORING_OP_EPOLL_CTL:
ret = io_epoll_ctl_prep(req, sqe);
break;
default: default:
printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
req->opcode); req->opcode);
...@@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3173,6 +4188,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break; break;
case IORING_OP_READV: case IORING_OP_READV:
case IORING_OP_READ_FIXED: case IORING_OP_READ_FIXED:
case IORING_OP_READ:
if (sqe) { if (sqe) {
ret = io_read_prep(req, sqe, force_nonblock); ret = io_read_prep(req, sqe, force_nonblock);
if (ret < 0) if (ret < 0)
...@@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3182,6 +4198,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
break; break;
case IORING_OP_WRITEV: case IORING_OP_WRITEV:
case IORING_OP_WRITE_FIXED: case IORING_OP_WRITE_FIXED:
case IORING_OP_WRITE:
if (sqe) { if (sqe) {
ret = io_write_prep(req, sqe, force_nonblock); ret = io_write_prep(req, sqe, force_nonblock);
if (ret < 0) if (ret < 0)
...@@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3222,20 +4239,28 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_sync_file_range(req, nxt, force_nonblock); ret = io_sync_file_range(req, nxt, force_nonblock);
break; break;
case IORING_OP_SENDMSG: case IORING_OP_SENDMSG:
case IORING_OP_SEND:
if (sqe) { if (sqe) {
ret = io_sendmsg_prep(req, sqe); ret = io_sendmsg_prep(req, sqe);
if (ret < 0) if (ret < 0)
break; break;
} }
if (req->opcode == IORING_OP_SENDMSG)
ret = io_sendmsg(req, nxt, force_nonblock); ret = io_sendmsg(req, nxt, force_nonblock);
else
ret = io_send(req, nxt, force_nonblock);
break; break;
case IORING_OP_RECVMSG: case IORING_OP_RECVMSG:
case IORING_OP_RECV:
if (sqe) { if (sqe) {
ret = io_recvmsg_prep(req, sqe); ret = io_recvmsg_prep(req, sqe);
if (ret) if (ret)
break; break;
} }
if (req->opcode == IORING_OP_RECVMSG)
ret = io_recvmsg(req, nxt, force_nonblock); ret = io_recvmsg(req, nxt, force_nonblock);
else
ret = io_recv(req, nxt, force_nonblock);
break; break;
case IORING_OP_TIMEOUT: case IORING_OP_TIMEOUT:
if (sqe) { if (sqe) {
...@@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3277,6 +4302,78 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} }
ret = io_async_cancel(req, nxt); ret = io_async_cancel(req, nxt);
break; break;
case IORING_OP_FALLOCATE:
if (sqe) {
ret = io_fallocate_prep(req, sqe);
if (ret)
break;
}
ret = io_fallocate(req, nxt, force_nonblock);
break;
case IORING_OP_OPENAT:
if (sqe) {
ret = io_openat_prep(req, sqe);
if (ret)
break;
}
ret = io_openat(req, nxt, force_nonblock);
break;
case IORING_OP_CLOSE:
if (sqe) {
ret = io_close_prep(req, sqe);
if (ret)
break;
}
ret = io_close(req, nxt, force_nonblock);
break;
case IORING_OP_FILES_UPDATE:
if (sqe) {
ret = io_files_update_prep(req, sqe);
if (ret)
break;
}
ret = io_files_update(req, force_nonblock);
break;
case IORING_OP_STATX:
if (sqe) {
ret = io_statx_prep(req, sqe);
if (ret)
break;
}
ret = io_statx(req, nxt, force_nonblock);
break;
case IORING_OP_FADVISE:
if (sqe) {
ret = io_fadvise_prep(req, sqe);
if (ret)
break;
}
ret = io_fadvise(req, nxt, force_nonblock);
break;
case IORING_OP_MADVISE:
if (sqe) {
ret = io_madvise_prep(req, sqe);
if (ret)
break;
}
ret = io_madvise(req, nxt, force_nonblock);
break;
case IORING_OP_OPENAT2:
if (sqe) {
ret = io_openat2_prep(req, sqe);
if (ret)
break;
}
ret = io_openat2(req, nxt, force_nonblock);
break;
case IORING_OP_EPOLL_CTL:
if (sqe) {
ret = io_epoll_ctl_prep(req, sqe);
if (ret)
break;
}
ret = io_epoll_ctl(req, nxt, force_nonblock);
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
...@@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr) ...@@ -3311,8 +4408,11 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
struct io_kiocb *nxt = NULL; struct io_kiocb *nxt = NULL;
int ret = 0; int ret = 0;
if (work->flags & IO_WQ_WORK_CANCEL) /* if NO_CANCEL is set, we must still run the work */
if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
IO_WQ_WORK_CANCEL) {
ret = -ECANCELED; ret = -ECANCELED;
}
if (!ret) { if (!ret) {
req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0; req->has_user = (work->flags & IO_WQ_WORK_HAS_MM) != 0;
...@@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr) ...@@ -3344,26 +4444,13 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
io_wq_assign_next(workptr, nxt); io_wq_assign_next(workptr, nxt);
} }
static bool io_req_op_valid(int op) static int io_req_needs_file(struct io_kiocb *req, int fd)
{
return op >= IORING_OP_NOP && op < IORING_OP_LAST;
}
static int io_req_needs_file(struct io_kiocb *req)
{ {
switch (req->opcode) { if (!io_op_defs[req->opcode].needs_file)
case IORING_OP_NOP: return 0;
case IORING_OP_POLL_REMOVE: if (fd == -1 && io_op_defs[req->opcode].fd_non_neg)
case IORING_OP_TIMEOUT:
case IORING_OP_TIMEOUT_REMOVE:
case IORING_OP_ASYNC_CANCEL:
case IORING_OP_LINK_TIMEOUT:
return 0; return 0;
default:
if (io_req_op_valid(req->opcode))
return 1; return 1;
return -EINVAL;
}
} }
static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
...@@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, ...@@ -3371,8 +4458,8 @@ static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,
{ {
struct fixed_file_table *table; struct fixed_file_table *table;
table = &ctx->file_table[index >> IORING_FILE_TABLE_SHIFT]; table = &ctx->file_data->table[index >> IORING_FILE_TABLE_SHIFT];
return table->files[index & IORING_FILE_TABLE_MASK]; return table->files[index & IORING_FILE_TABLE_MASK];
} }
static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
...@@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, ...@@ -3380,20 +4467,16 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
unsigned flags; unsigned flags;
int fd, ret; int fd;
flags = READ_ONCE(sqe->flags); flags = READ_ONCE(sqe->flags);
fd = READ_ONCE(sqe->fd); fd = READ_ONCE(sqe->fd);
if (flags & IOSQE_IO_DRAIN) if (!io_req_needs_file(req, fd))
req->flags |= REQ_F_IO_DRAIN; return 0;
ret = io_req_needs_file(req);
if (ret <= 0)
return ret;
if (flags & IOSQE_FIXED_FILE) { if (flags & IOSQE_FIXED_FILE) {
if (unlikely(!ctx->file_table || if (unlikely(!ctx->file_data ||
(unsigned) fd >= ctx->nr_user_files)) (unsigned) fd >= ctx->nr_user_files))
return -EBADF; return -EBADF;
fd = array_index_nospec(fd, ctx->nr_user_files); fd = array_index_nospec(fd, ctx->nr_user_files);
...@@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req, ...@@ -3401,6 +4484,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req,
if (!req->file) if (!req->file)
return -EBADF; return -EBADF;
req->flags |= REQ_F_FIXED_FILE; req->flags |= REQ_F_FIXED_FILE;
percpu_ref_get(&ctx->file_data->refs);
} else { } else {
if (req->needs_fixed_file) if (req->needs_fixed_file)
return -EBADF; return -EBADF;
...@@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req) ...@@ -3418,6 +4502,11 @@ static int io_grab_files(struct io_kiocb *req)
int ret = -EBADF; int ret = -EBADF;
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
if (req->work.files)
return 0;
if (!ctx->ring_file)
return -EBADF;
rcu_read_lock(); rcu_read_lock();
spin_lock_irq(&ctx->inflight_lock); spin_lock_irq(&ctx->inflight_lock);
/* /*
...@@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req) ...@@ -3426,7 +4515,7 @@ static int io_grab_files(struct io_kiocb *req)
* the fd has changed since we started down this path, and disallow * the fd has changed since we started down this path, and disallow
* this operation if it has. * this operation if it has.
*/ */
if (fcheck(req->ring_fd) == req->ring_file) { if (fcheck(ctx->ring_fd) == ctx->ring_file) {
list_add(&req->inflight_entry, &ctx->inflight_list); list_add(&req->inflight_entry, &ctx->inflight_list);
req->flags |= REQ_F_INFLIGHT; req->flags |= REQ_F_INFLIGHT;
req->work.files = current->files; req->work.files = current->files;
...@@ -3532,7 +4621,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -3532,7 +4621,8 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
*/ */
if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
(req->flags & REQ_F_MUST_PUNT))) { (req->flags & REQ_F_MUST_PUNT))) {
if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) { punt:
if (io_op_defs[req->opcode].file_table) {
ret = io_grab_files(req); ret = io_grab_files(req);
if (ret) if (ret)
goto err; goto err;
...@@ -3567,6 +4657,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -3567,6 +4657,9 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
if (nxt) { if (nxt) {
req = nxt; req = nxt;
nxt = NULL; nxt = NULL;
if (req->flags & REQ_F_FORCE_ASYNC)
goto punt;
goto again; goto again;
} }
} }
...@@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -3575,21 +4668,27 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{ {
int ret; int ret;
if (unlikely(req->ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
req->ctx->drain_next = false;
}
req->ctx->drain_next = (req->flags & REQ_F_DRAIN_LINK);
ret = io_req_defer(req, sqe); ret = io_req_defer(req, sqe);
if (ret) { if (ret) {
if (ret != -EIOCBQUEUED) { if (ret != -EIOCBQUEUED) {
fail_req:
io_cqring_add_event(req, ret); io_cqring_add_event(req, ret);
req_set_fail_links(req); req_set_fail_links(req);
io_double_put_req(req); io_double_put_req(req);
} }
} else } else if (req->flags & REQ_F_FORCE_ASYNC) {
ret = io_req_defer_prep(req, sqe);
if (unlikely(ret < 0))
goto fail_req;
/*
* Never try inline submit if IOSQE_ASYNC is set, go straight
* to async execution.
*/
req->work.flags |= IO_WQ_WORK_CONCURRENT;
io_queue_async_work(req);
} else {
__io_queue_sqe(req, sqe); __io_queue_sqe(req, sqe);
}
} }
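The REQ_F_FORCE_ASYNC branch above is taken when userspace sets the new IOSQE_ASYNC flag, which skips the inline submission attempt entirely. A trivial sketch of requesting that from userspace:

#include <linux/io_uring.h>

/* Ask io_uring to punt this request straight to the async workers. */
static void sqe_force_async(struct io_uring_sqe *sqe)
{
        sqe->flags |= IOSQE_ASYNC;
}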
static inline void io_queue_link_head(struct io_kiocb *req) static inline void io_queue_link_head(struct io_kiocb *req)
...@@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req) ...@@ -3602,25 +4701,47 @@ static inline void io_queue_link_head(struct io_kiocb *req)
} }
#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
IOSQE_IO_HARDLINK) IOSQE_IO_HARDLINK | IOSQE_ASYNC)
static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_submit_state *state, struct io_kiocb **link) struct io_submit_state *state, struct io_kiocb **link)
{ {
const struct cred *old_creds = NULL;
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
int ret; unsigned int sqe_flags;
int ret, id;
sqe_flags = READ_ONCE(sqe->flags);
/* enforce forwards compatibility on users */ /* enforce forwards compatibility on users */
if (unlikely(sqe->flags & ~SQE_VALID_FLAGS)) { if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
ret = -EINVAL;
goto err_req;
}
id = READ_ONCE(sqe->personality);
if (id) {
const struct cred *personality_creds;
personality_creds = idr_find(&ctx->personality_idr, id);
if (unlikely(!personality_creds)) {
ret = -EINVAL; ret = -EINVAL;
goto err_req; goto err_req;
} }
old_creds = override_creds(personality_creds);
}
/* same numerical values with corresponding REQ_F_*, safe to copy */
req->flags |= sqe_flags & (IOSQE_IO_DRAIN|IOSQE_IO_HARDLINK|
IOSQE_ASYNC);
ret = io_req_set_file(state, req, sqe); ret = io_req_set_file(state, req, sqe);
if (unlikely(ret)) { if (unlikely(ret)) {
err_req: err_req:
io_cqring_add_event(req, ret); io_cqring_add_event(req, ret);
io_double_put_req(req); io_double_put_req(req);
if (old_creds)
revert_creds(old_creds);
return false; return false;
} }
...@@ -3632,14 +4753,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3632,14 +4753,19 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
* conditions are true (normal request), then just queue it. * conditions are true (normal request), then just queue it.
*/ */
if (*link) { if (*link) {
struct io_kiocb *prev = *link; struct io_kiocb *head = *link;
if (sqe->flags & IOSQE_IO_DRAIN)
(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
/*
* If we take sequential execution of a link, draining both sides
* of the link also fulfils IOSQE_IO_DRAIN semantics for all
* requests in the link. So it drains the head and the request
* after the link. The latter is done via the drain_next flag,
* to persist the effect across calls.
*/
if (sqe_flags & IOSQE_IO_DRAIN) {
head->flags |= REQ_F_IO_DRAIN;
ctx->drain_next = 1;
}
if (io_alloc_async_ctx(req)) { if (io_alloc_async_ctx(req)) {
ret = -EAGAIN; ret = -EAGAIN;
goto err_req; goto err_req;
...@@ -3648,16 +4774,24 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3648,16 +4774,24 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
ret = io_req_defer_prep(req, sqe); ret = io_req_defer_prep(req, sqe);
if (ret) { if (ret) {
/* fail even hard links since we don't submit */ /* fail even hard links since we don't submit */
prev->flags |= REQ_F_FAIL_LINK; head->flags |= REQ_F_FAIL_LINK;
goto err_req; goto err_req;
} }
trace_io_uring_link(ctx, req, prev); trace_io_uring_link(ctx, req, head);
list_add_tail(&req->link_list, &prev->link_list); list_add_tail(&req->link_list, &head->link_list);
} else if (sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
if (sqe->flags & IOSQE_IO_HARDLINK)
req->flags |= REQ_F_HARDLINK;
/* last request of a link, enqueue the link */
if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK))) {
io_queue_link_head(head);
*link = NULL;
}
} else {
if (unlikely(ctx->drain_next)) {
req->flags |= REQ_F_IO_DRAIN;
req->ctx->drain_next = 0;
}
if (sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
req->flags |= REQ_F_LINK;
INIT_LIST_HEAD(&req->link_list); INIT_LIST_HEAD(&req->link_list);
ret = io_req_defer_prep(req, sqe); ret = io_req_defer_prep(req, sqe);
if (ret) if (ret)
...@@ -3666,7 +4800,10 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -3666,7 +4800,10 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
} else { } else {
io_queue_sqe(req, sqe); io_queue_sqe(req, sqe);
} }
}
if (old_creds)
revert_creds(old_creds);
return true; return true;
} }
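The sqe->personality lookup in io_submit_sqe() expects an id obtained from a prior personality registration on the ring; that registration path is outside this hunk, so the io_uring_register(2) invocation below (IORING_REGISTER_PERSONALITY with no payload) is an assumption about the companion interface rather than code taken from this diff:

#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Register the caller's current credentials and target them from an SQE. */
static int use_personality(int ring_fd, struct io_uring_sqe *sqe)
{
        int id = syscall(__NR_io_uring_register, ring_fd,
                         IORING_REGISTER_PERSONALITY, NULL, 0);
        if (id < 0)
                return -1;
        sqe->personality = id;  /* looked up in ctx->personality_idr at submit time */
        return id;
}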
...@@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) ...@@ -3698,14 +4835,12 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
{ {
struct io_rings *rings = ctx->rings; struct io_rings *rings = ctx->rings;
if (ctx->cached_sq_head != READ_ONCE(rings->sq.head)) {
/* /*
* Ensure any loads from the SQEs are done at this point, * Ensure any loads from the SQEs are done at this point,
* since once we write the new head, the application could * since once we write the new head, the application could
* write new data to them. * write new data to them.
*/ */
smp_store_release(&rings->sq.head, ctx->cached_sq_head); smp_store_release(&rings->sq.head, ctx->cached_sq_head);
}
} }
/* /*
...@@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx) ...@@ -3719,7 +4854,6 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe **sqe_ptr) const struct io_uring_sqe **sqe_ptr)
{ {
struct io_rings *rings = ctx->rings;
u32 *sq_array = ctx->sq_array; u32 *sq_array = ctx->sq_array;
unsigned head; unsigned head;
...@@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, ...@@ -3731,12 +4865,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
* 2) allows the kernel side to track the head on its own, even * 2) allows the kernel side to track the head on its own, even
* though the application is the one updating it. * though the application is the one updating it.
*/ */
head = ctx->cached_sq_head; head = READ_ONCE(sq_array[ctx->cached_sq_head & ctx->sq_mask]);
/* make sure SQ entry isn't read before tail */
if (unlikely(head == smp_load_acquire(&rings->sq.tail)))
return false;
head = READ_ONCE(sq_array[head & ctx->sq_mask]);
if (likely(head < ctx->sq_entries)) { if (likely(head < ctx->sq_entries)) {
/* /*
* All io needs to record the previous position, if LINK vs DRAIN, * All io needs to record the previous position, if LINK vs DRAIN,
...@@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req, ...@@ -3754,7 +4883,7 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req,
/* drop invalid entries */ /* drop invalid entries */
ctx->cached_sq_head++; ctx->cached_sq_head++;
ctx->cached_sq_dropped++; ctx->cached_sq_dropped++;
WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); WRITE_ONCE(ctx->rings->sq_dropped, ctx->cached_sq_dropped);
return false; return false;
} }
...@@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, ...@@ -3768,19 +4897,29 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
bool mm_fault = false; bool mm_fault = false;
/* if we have a backlog and couldn't flush it all, return BUSY */ /* if we have a backlog and couldn't flush it all, return BUSY */
if (test_bit(0, &ctx->sq_check_overflow)) {
if (!list_empty(&ctx->cq_overflow_list) && if (!list_empty(&ctx->cq_overflow_list) &&
!io_cqring_overflow_flush(ctx, false)) !io_cqring_overflow_flush(ctx, false))
return -EBUSY; return -EBUSY;
}
/* make sure SQ entry isn't read before tail */
nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx));
if (!percpu_ref_tryget_many(&ctx->refs, nr))
return -EAGAIN;
if (nr > IO_PLUG_THRESHOLD) { if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, nr); io_submit_state_start(&state, nr);
statep = &state; statep = &state;
} }
ctx->ring_fd = ring_fd;
ctx->ring_file = ring_file;
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe; const struct io_uring_sqe *sqe;
struct io_kiocb *req; struct io_kiocb *req;
unsigned int sqe_flags;
req = io_get_req(ctx, statep); req = io_get_req(ctx, statep);
if (unlikely(!req)) { if (unlikely(!req)) {
...@@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, ...@@ -3789,11 +4928,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
break; break;
} }
if (!io_get_sqring(ctx, req, &sqe)) { if (!io_get_sqring(ctx, req, &sqe)) {
__io_free_req(req); __io_req_do_free(req);
break;
}
/* will complete beyond this point, count as submitted */
submitted++;
if (unlikely(req->opcode >= IORING_OP_LAST)) {
io_cqring_add_event(req, -EINVAL);
io_double_put_req(req);
break; break;
} }
if (io_req_needs_user(req) && !*mm) { if (io_op_defs[req->opcode].needs_mm && !*mm) {
mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm); mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
if (!mm_fault) { if (!mm_fault) {
use_mm(ctx->sqo_mm); use_mm(ctx->sqo_mm);
...@@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, ...@@ -3801,27 +4949,20 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
} }
} }
submitted++;
sqe_flags = sqe->flags;
req->ring_file = ring_file;
req->ring_fd = ring_fd;
req->has_user = *mm != NULL; req->has_user = *mm != NULL;
req->in_async = async; req->in_async = async;
req->needs_fixed_file = async; req->needs_fixed_file = async;
trace_io_uring_submit_sqe(ctx, req->user_data, true, async); trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
true, async);
if (!io_submit_sqe(req, sqe, statep, &link)) if (!io_submit_sqe(req, sqe, statep, &link))
break; break;
/*
* If previous wasn't linked and we have a linked command,
* that's the end of the chain. Submit the previous link.
*/
if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
io_queue_link_head(link);
link = NULL;
}
} }
if (unlikely(submitted != nr)) {
int ref_used = (submitted == -EAGAIN) ? 0 : submitted;
percpu_ref_put_many(&ctx->refs, nr - ref_used);
}
if (link) if (link)
io_queue_link_head(link); io_queue_link_head(link);
if (statep) if (statep)
...@@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data) ...@@ -3944,7 +5085,6 @@ static int io_sq_thread(void *data)
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
} }
to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true); ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
...@@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) ...@@ -4075,19 +5215,40 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
#endif #endif
} }
static void io_file_ref_kill(struct percpu_ref *ref)
{
struct fixed_file_data *data;
data = container_of(ref, struct fixed_file_data, refs);
complete(&data->done);
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx) static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
{ {
struct fixed_file_data *data = ctx->file_data;
unsigned nr_tables, i; unsigned nr_tables, i;
if (!ctx->file_table) if (!data)
return -ENXIO; return -ENXIO;
/* protect against inflight atomic switch, which drops the ref */
percpu_ref_get(&data->refs);
/* wait for existing switches */
flush_work(&data->ref_work);
percpu_ref_kill_and_confirm(&data->refs, io_file_ref_kill);
wait_for_completion(&data->done);
percpu_ref_put(&data->refs);
/* flush potential new switch */
flush_work(&data->ref_work);
percpu_ref_exit(&data->refs);
__io_sqe_files_unregister(ctx); __io_sqe_files_unregister(ctx);
nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE); nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
for (i = 0; i < nr_tables; i++) for (i = 0; i < nr_tables; i++)
kfree(ctx->file_table[i].files); kfree(data->table[i].files);
kfree(ctx->file_table); kfree(data->table);
ctx->file_table = NULL; kfree(data);
ctx->file_data = NULL;
ctx->nr_user_files = 0; ctx->nr_user_files = 0;
return 0; return 0;
} }
...@@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx) ...@@ -4118,16 +5279,6 @@ static void io_finish_async(struct io_ring_ctx *ctx)
} }
#if defined(CONFIG_UNIX) #if defined(CONFIG_UNIX)
static void io_destruct_skb(struct sk_buff *skb)
{
struct io_ring_ctx *ctx = skb->sk->sk_user_data;
if (ctx->io_wq)
io_wq_flush(ctx->io_wq);
unix_destruct_scm(skb);
}
/* /*
* Ensure the UNIX gc is aware of our file set, so we are certain that * Ensure the UNIX gc is aware of our file set, so we are certain that
* the io_uring can be safely unregistered on process exit, even if we have * the io_uring can be safely unregistered on process exit, even if we have
...@@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) ...@@ -4175,7 +5326,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
fpl->max = SCM_MAX_FD; fpl->max = SCM_MAX_FD;
fpl->count = nr_files; fpl->count = nr_files;
UNIXCB(skb).fp = fpl; UNIXCB(skb).fp = fpl;
skb->destructor = io_destruct_skb; skb->destructor = unix_destruct_scm;
refcount_add(skb->truesize, &sk->sk_wmem_alloc); refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb); skb_queue_head(&sk->sk_receive_queue, skb);
...@@ -4237,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, ...@@ -4237,7 +5388,7 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
int i; int i;
for (i = 0; i < nr_tables; i++) { for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_table[i]; struct fixed_file_table *table = &ctx->file_data->table[i];
unsigned this_files; unsigned this_files;
this_files = min(nr_files, IORING_MAX_FILES_TABLE); this_files = min(nr_files, IORING_MAX_FILES_TABLE);
...@@ -4248,14 +5399,114 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables, ...@@ -4248,14 +5399,114 @@ static int io_sqe_alloc_file_tables(struct io_ring_ctx *ctx, unsigned nr_tables,
nr_files -= this_files; nr_files -= this_files;
} }
if (i == nr_tables) if (i == nr_tables)
return 0; return 0;
for (i = 0; i < nr_tables; i++) {
struct fixed_file_table *table = &ctx->file_data->table[i];
kfree(table->files);
}
return 1;
}
static void io_ring_file_put(struct io_ring_ctx *ctx, struct file *file)
{
#if defined(CONFIG_UNIX)
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
__skb_queue_head_init(&list);
/*
* Find the skb that holds this file in its SCM_RIGHTS. When found,
* remove this entry and rearrange the file array.
*/
skb = skb_dequeue(head);
while (skb) {
struct scm_fp_list *fp;
fp = UNIXCB(skb).fp;
for (i = 0; i < fp->count; i++) {
int left;
if (fp->fp[i] != file)
continue;
unix_notinflight(fp->user, fp->fp[i]);
left = fp->count - 1 - i;
if (left) {
memmove(&fp->fp[i], &fp->fp[i + 1],
left * sizeof(struct file *));
}
fp->count--;
if (!fp->count) {
kfree_skb(skb);
skb = NULL;
} else {
__skb_queue_tail(&list, skb);
}
fput(file);
file = NULL;
break;
}
if (!file)
break;
__skb_queue_tail(&list, skb);
skb = skb_dequeue(head);
}
if (skb_peek(&list)) {
spin_lock_irq(&head->lock);
while ((skb = __skb_dequeue(&list)) != NULL)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#else
fput(file);
#endif
}
struct io_file_put {
struct llist_node llist;
struct file *file;
struct completion *done;
};
static void io_ring_file_ref_switch(struct work_struct *work)
{
struct io_file_put *pfile, *tmp;
struct fixed_file_data *data;
struct llist_node *node;
data = container_of(work, struct fixed_file_data, ref_work);
while ((node = llist_del_all(&data->put_llist)) != NULL) {
llist_for_each_entry_safe(pfile, tmp, node, llist) {
io_ring_file_put(data->ctx, pfile->file);
if (pfile->done)
complete(pfile->done);
else
kfree(pfile);
}
}
percpu_ref_get(&data->refs);
percpu_ref_switch_to_percpu(&data->refs);
}
static void io_file_data_ref_zero(struct percpu_ref *ref)
{
struct fixed_file_data *data;
for (i = 0; i < nr_tables; i++) { data = container_of(ref, struct fixed_file_data, refs);
struct fixed_file_table *table = &ctx->file_table[i];
kfree(table->files); /* we can't safely switch from inside this context, punt to wq */
} queue_work(system_wq, &data->ref_work);
return 1;
} }
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
...@@ -4263,25 +5514,48 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4263,25 +5514,48 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
{ {
__s32 __user *fds = (__s32 __user *) arg; __s32 __user *fds = (__s32 __user *) arg;
unsigned nr_tables; unsigned nr_tables;
struct file *file;
int fd, ret = 0; int fd, ret = 0;
unsigned i; unsigned i;
if (ctx->file_table) if (ctx->file_data)
return -EBUSY; return -EBUSY;
if (!nr_args) if (!nr_args)
return -EINVAL; return -EINVAL;
if (nr_args > IORING_MAX_FIXED_FILES) if (nr_args > IORING_MAX_FIXED_FILES)
return -EMFILE; return -EMFILE;
ctx->file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
if (!ctx->file_data)
return -ENOMEM;
ctx->file_data->ctx = ctx;
init_completion(&ctx->file_data->done);
nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE); nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
ctx->file_table = kcalloc(nr_tables, sizeof(struct fixed_file_table), ctx->file_data->table = kcalloc(nr_tables,
sizeof(struct fixed_file_table),
GFP_KERNEL); GFP_KERNEL);
if (!ctx->file_table) if (!ctx->file_data->table) {
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM; return -ENOMEM;
}
if (percpu_ref_init(&ctx->file_data->refs, io_file_data_ref_zero,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) {
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM;
}
ctx->file_data->put_llist.first = NULL;
INIT_WORK(&ctx->file_data->ref_work, io_ring_file_ref_switch);
if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) { if (io_sqe_alloc_file_tables(ctx, nr_tables, nr_args)) {
kfree(ctx->file_table); percpu_ref_exit(&ctx->file_data->refs);
ctx->file_table = NULL; kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
return -ENOMEM; return -ENOMEM;
} }
...@@ -4298,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4298,13 +5572,14 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
continue; continue;
} }
table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK; index = i & IORING_FILE_TABLE_MASK;
table->files[index] = fget(fd); file = fget(fd);
ret = -EBADF; ret = -EBADF;
if (!table->files[index]) if (!file)
break; break;
/* /*
* Don't allow io_uring instances to be registered. If UNIX * Don't allow io_uring instances to be registered. If UNIX
* isn't enabled, then this causes a reference cycle and this * isn't enabled, then this causes a reference cycle and this
...@@ -4312,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4312,26 +5587,26 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
* handle it just fine, but there's still no point in allowing * handle it just fine, but there's still no point in allowing
* a ring fd as it doesn't support regular read/write anyway. * a ring fd as it doesn't support regular read/write anyway.
*/ */
if (table->files[index]->f_op == &io_uring_fops) { if (file->f_op == &io_uring_fops) {
fput(table->files[index]); fput(file);
break; break;
} }
ret = 0; ret = 0;
table->files[index] = file;
} }
if (ret) { if (ret) {
for (i = 0; i < ctx->nr_user_files; i++) { for (i = 0; i < ctx->nr_user_files; i++) {
struct file *file;
file = io_file_from_index(ctx, i); file = io_file_from_index(ctx, i);
if (file) if (file)
fput(file); fput(file);
} }
for (i = 0; i < nr_tables; i++) for (i = 0; i < nr_tables; i++)
kfree(ctx->file_table[i].files); kfree(ctx->file_data->table[i].files);
kfree(ctx->file_table); kfree(ctx->file_data->table);
ctx->file_table = NULL; kfree(ctx->file_data);
ctx->file_data = NULL;
ctx->nr_user_files = 0; ctx->nr_user_files = 0;
return ret; return ret;
} }
...@@ -4343,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4343,69 +5618,6 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret; return ret;
} }
static void io_sqe_file_unregister(struct io_ring_ctx *ctx, int index)
{
#if defined(CONFIG_UNIX)
struct file *file = io_file_from_index(ctx, index);
struct sock *sock = ctx->ring_sock->sk;
struct sk_buff_head list, *head = &sock->sk_receive_queue;
struct sk_buff *skb;
int i;
__skb_queue_head_init(&list);
/*
* Find the skb that holds this file in its SCM_RIGHTS. When found,
* remove this entry and rearrange the file array.
*/
skb = skb_dequeue(head);
while (skb) {
struct scm_fp_list *fp;
fp = UNIXCB(skb).fp;
for (i = 0; i < fp->count; i++) {
int left;
if (fp->fp[i] != file)
continue;
unix_notinflight(fp->user, fp->fp[i]);
left = fp->count - 1 - i;
if (left) {
memmove(&fp->fp[i], &fp->fp[i + 1],
left * sizeof(struct file *));
}
fp->count--;
if (!fp->count) {
kfree_skb(skb);
skb = NULL;
} else {
__skb_queue_tail(&list, skb);
}
fput(file);
file = NULL;
break;
}
if (!file)
break;
__skb_queue_tail(&list, skb);
skb = skb_dequeue(head);
}
if (skb_peek(&list)) {
spin_lock_irq(&head->lock);
while ((skb = __skb_dequeue(&list)) != NULL)
__skb_queue_tail(head, skb);
spin_unlock_irq(&head->lock);
}
#else
fput(io_file_from_index(ctx, index));
#endif
}
static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
int index) int index)
{ {
...@@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, ...@@ -4449,29 +5661,65 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file,
#endif #endif
} }
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, static void io_atomic_switch(struct percpu_ref *ref)
{
struct fixed_file_data *data;
data = container_of(ref, struct fixed_file_data, refs);
clear_bit(FFD_F_ATOMIC, &data->state);
}
static bool io_queue_file_removal(struct fixed_file_data *data,
struct file *file)
{
struct io_file_put *pfile, pfile_stack;
DECLARE_COMPLETION_ONSTACK(done);
/*
* If we fail allocating the struct we need for doing async removal
* of this file, just punt to sync and wait for it.
*/
pfile = kzalloc(sizeof(*pfile), GFP_KERNEL);
if (!pfile) {
pfile = &pfile_stack;
pfile->done = &done;
}
pfile->file = file;
llist_add(&pfile->llist, &data->put_llist);
if (pfile == &pfile_stack) {
if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
percpu_ref_put(&data->refs);
percpu_ref_switch_to_atomic(&data->refs,
io_atomic_switch);
}
wait_for_completion(&done);
flush_work(&data->ref_work);
return false;
}
return true;
}
static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *up,
unsigned nr_args) unsigned nr_args)
{ {
struct io_uring_files_update up; struct fixed_file_data *data = ctx->file_data;
bool ref_switch = false;
struct file *file;
__s32 __user *fds; __s32 __user *fds;
int fd, i, err; int fd, i, err;
__u32 done; __u32 done;
if (!ctx->file_table) if (check_add_overflow(up->offset, nr_args, &done))
return -ENXIO;
if (!nr_args)
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
if (up.resv)
return -EINVAL;
if (check_add_overflow(up.offset, nr_args, &done))
return -EOVERFLOW; return -EOVERFLOW;
if (done > ctx->nr_user_files) if (done > ctx->nr_user_files)
return -EINVAL; return -EINVAL;
done = 0; done = 0;
fds = u64_to_user_ptr(up.fds); fds = u64_to_user_ptr(up->fds);
while (nr_args) { while (nr_args) {
struct fixed_file_table *table; struct fixed_file_table *table;
unsigned index; unsigned index;
...@@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4481,16 +5729,16 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
err = -EFAULT; err = -EFAULT;
break; break;
} }
i = array_index_nospec(up.offset, ctx->nr_user_files); i = array_index_nospec(up->offset, ctx->nr_user_files);
table = &ctx->file_table[i >> IORING_FILE_TABLE_SHIFT]; table = &ctx->file_data->table[i >> IORING_FILE_TABLE_SHIFT];
index = i & IORING_FILE_TABLE_MASK; index = i & IORING_FILE_TABLE_MASK;
if (table->files[index]) { if (table->files[index]) {
io_sqe_file_unregister(ctx, i); file = io_file_from_index(ctx, index);
table->files[index] = NULL; table->files[index] = NULL;
if (io_queue_file_removal(data, file))
ref_switch = true;
} }
if (fd != -1) { if (fd != -1) {
struct file *file;
file = fget(fd); file = fget(fd);
if (!file) { if (!file) {
err = -EBADF; err = -EBADF;
...@@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg, ...@@ -4516,11 +5764,32 @@ static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
} }
nr_args--; nr_args--;
done++; done++;
up.offset++; up->offset++;
}
if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
percpu_ref_put(&data->refs);
percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
} }
return done ? done : err; return done ? done : err;
} }
static int io_sqe_files_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
struct io_uring_files_update up;
if (!ctx->file_data)
return -ENXIO;
if (!nr_args)
return -EINVAL;
if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
if (up.resv)
return -EINVAL;
return __io_sqe_files_update(ctx, &up, nr_args);
}
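For context, a minimal userspace sketch of the path __io_sqe_files_update() serves: an io_uring_register(2) call with IORING_REGISTER_FILES_UPDATE. It is a hedged illustration, not part of this series; it assumes a ring fd from io_uring_setup(2), a set of files previously registered with IORING_REGISTER_FILES, and uses a raw syscall() since libc provides no wrapper. The slot offset and fd array are illustrative.

/* Sketch: replace `count` registered file slots starting at `offset`.
 * An fd of -1 in new_fds simply clears that slot. Error handling is minimal.
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int update_ring_files(int ring_fd, int *new_fds, unsigned count,
			     unsigned offset)
{
	struct io_uring_files_update up;

	memset(&up, 0, sizeof(up));		/* up.resv must be zero */
	up.offset = offset;			/* first slot to replace */
	up.fds = (unsigned long) new_fds;	/* pointer to __s32 fd array */

	/* returns the number of slots updated, or -1 with errno set */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES_UPDATE, &up, count);
}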
static void io_put_work(struct io_wq_work *work) static void io_put_work(struct io_wq_work *work)
{ {
...@@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work) ...@@ -4536,11 +5805,56 @@ static void io_get_work(struct io_wq_work *work)
refcount_inc(&req->refs); refcount_inc(&req->refs);
} }
static int io_sq_offload_start(struct io_ring_ctx *ctx, static int io_init_wq_offload(struct io_ring_ctx *ctx,
struct io_uring_params *p) struct io_uring_params *p)
{ {
struct io_wq_data data; struct io_wq_data data;
unsigned concurrency; struct fd f;
struct io_ring_ctx *ctx_attach;
unsigned int concurrency;
int ret = 0;
data.user = ctx->user;
data.get_work = io_get_work;
data.put_work = io_put_work;
if (!(p->flags & IORING_SETUP_ATTACH_WQ)) {
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
}
return ret;
}
f = fdget(p->wq_fd);
if (!f.file)
return -EBADF;
if (f.file->f_op != &io_uring_fops) {
ret = -EINVAL;
goto out_fput;
}
ctx_attach = f.file->private_data;
/* @io_wq is protected by holding the fd */
if (!io_wq_get(ctx_attach->io_wq, &data)) {
ret = -EINVAL;
goto out_fput;
}
ctx->io_wq = ctx_attach->io_wq;
out_fput:
fdput(f);
return ret;
}
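A hedged userspace sketch of what io_init_wq_offload() enables: creating a second, "sibling" ring that attaches to the io-wq backend of an existing ring by setting IORING_SETUP_ATTACH_WQ and passing the first ring's fd in params.wq_fd. IORING_SETUP_CLAMP (also new in this series) could be OR'ed in to have an oversized entries value clamped rather than rejected. Raw syscalls are used; names are illustrative.

/* Sketch: create a ring that shares the io-wq worker pool of an existing
 * ring instead of spawning its own. Assumes a kernel with
 * IORING_SETUP_ATTACH_WQ support (5.6+).
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int setup_sibling_ring(int existing_ring_fd, unsigned entries)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_ATTACH_WQ;	/* share the io-wq... */
	p.wq_fd = existing_ring_fd;		/* ...of this ring */

	/* returns a new ring fd, or -1 with errno set */
	return syscall(__NR_io_uring_setup, entries, &p);
}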
static int io_sq_offload_start(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
int ret; int ret;
init_waitqueue_head(&ctx->sqo_wait); init_waitqueue_head(&ctx->sqo_wait);
...@@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx, ...@@ -4584,20 +5898,9 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
goto err; goto err;
} }
data.mm = ctx->sqo_mm; ret = io_init_wq_offload(ctx, p);
data.user = ctx->user; if (ret)
data.creds = ctx->creds;
data.get_work = io_get_work;
data.put_work = io_put_work;
/* Do QD, or 4 * CPUS, whatever is smallest */
concurrency = min(ctx->sq_entries, 4 * num_online_cpus());
ctx->io_wq = io_wq_create(concurrency, &data);
if (IS_ERR(ctx->io_wq)) {
ret = PTR_ERR(ctx->io_wq);
ctx->io_wq = NULL;
goto err; goto err;
}
return 0; return 0;
err: err:
...@@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on) ...@@ -4975,6 +6278,17 @@ static int io_uring_fasync(int fd, struct file *file, int on)
return fasync_helper(fd, file, on, &ctx->cq_fasync); return fasync_helper(fd, file, on, &ctx->cq_fasync);
} }
static int io_remove_personalities(int id, void *p, void *data)
{
struct io_ring_ctx *ctx = data;
const struct cred *cred;
cred = idr_remove(&ctx->personality_idr, id);
if (cred)
put_cred(cred);
return 0;
}
static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{ {
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
...@@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) ...@@ -4991,6 +6305,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
/* if we failed setting up the ctx, we might not have any rings */ /* if we failed setting up the ctx, we might not have any rings */
if (ctx->rings) if (ctx->rings)
io_cqring_overflow_flush(ctx, true); io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
wait_for_completion(&ctx->completions[0]); wait_for_completion(&ctx->completions[0]);
io_ring_ctx_free(ctx); io_ring_ctx_free(ctx);
} }
...@@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, ...@@ -5157,7 +6472,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
} else if (to_submit) { } else if (to_submit) {
struct mm_struct *cur_mm; struct mm_struct *cur_mm;
to_submit = min(to_submit, ctx->sq_entries);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
/* already have mm, so io_submit_sqes() won't try to grab it */ /* already have mm, so io_submit_sqes() won't try to grab it */
cur_mm = ctx->sqo_mm; cur_mm = ctx->sqo_mm;
...@@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) ...@@ -5273,7 +6587,6 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx)
#if defined(CONFIG_UNIX) #if defined(CONFIG_UNIX)
ctx->ring_sock->file = file; ctx->ring_sock->file = file;
ctx->ring_sock->sk->sk_user_data = ctx;
#endif #endif
fd_install(ret, file); fd_install(ret, file);
return ret; return ret;
...@@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ...@@ -5292,8 +6605,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
bool account_mem; bool account_mem;
int ret; int ret;
if (!entries || entries > IORING_MAX_ENTRIES) if (!entries)
return -EINVAL; return -EINVAL;
if (entries > IORING_MAX_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
entries = IORING_MAX_ENTRIES;
}
/* /*
* Use twice as many entries for the CQ ring. It's possible for the * Use twice as many entries for the CQ ring. It's possible for the
...@@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ...@@ -5310,8 +6628,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
* to a power-of-two, if it isn't already. We do NOT impose * to a power-of-two, if it isn't already. We do NOT impose
* any cq vs sq ring sizing. * any cq vs sq ring sizing.
*/ */
if (p->cq_entries < p->sq_entries || p->cq_entries > IORING_MAX_CQ_ENTRIES) if (p->cq_entries < p->sq_entries)
return -EINVAL; return -EINVAL;
if (p->cq_entries > IORING_MAX_CQ_ENTRIES) {
if (!(p->flags & IORING_SETUP_CLAMP))
return -EINVAL;
p->cq_entries = IORING_MAX_CQ_ENTRIES;
}
p->cq_entries = roundup_pow_of_two(p->cq_entries); p->cq_entries = roundup_pow_of_two(p->cq_entries);
} else { } else {
p->cq_entries = 2 * p->sq_entries; p->cq_entries = 2 * p->sq_entries;
...@@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) ...@@ -5376,7 +6699,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
goto err; goto err;
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
IORING_FEAT_SUBMIT_STABLE; IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
IORING_FEAT_CUR_PERSONALITY;
trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
return ret; return ret;
err: err:
...@@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) ...@@ -5403,7 +6727,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
} }
if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL |
IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE)) IORING_SETUP_SQ_AFF | IORING_SETUP_CQSIZE |
IORING_SETUP_CLAMP | IORING_SETUP_ATTACH_WQ))
return -EINVAL; return -EINVAL;
ret = io_uring_create(entries, &p); ret = io_uring_create(entries, &p);
...@@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, ...@@ -5422,6 +6747,84 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries,
return io_uring_setup(entries, params); return io_uring_setup(entries, params);
} }
static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
{
struct io_uring_probe *p;
size_t size;
int i, ret;
size = struct_size(p, ops, nr_args);
if (size == SIZE_MAX)
return -EOVERFLOW;
p = kzalloc(size, GFP_KERNEL);
if (!p)
return -ENOMEM;
ret = -EFAULT;
if (copy_from_user(p, arg, size))
goto out;
ret = -EINVAL;
if (memchr_inv(p, 0, size))
goto out;
p->last_op = IORING_OP_LAST - 1;
if (nr_args > IORING_OP_LAST)
nr_args = IORING_OP_LAST;
for (i = 0; i < nr_args; i++) {
p->ops[i].op = i;
if (!io_op_defs[i].not_supported)
p->ops[i].flags = IO_URING_OP_SUPPORTED;
}
p->ops_len = i;
ret = 0;
if (copy_to_user(arg, p, size))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
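A hedged userspace counterpart to io_probe(): the probe buffer must be zero-filled (memchr_inv() above rejects anything else) and nr_args is capped at 256 by the register path further down. The structures come from the updated <linux/io_uring.h> in this series; everything else here is illustrative.

/* Sketch: ask the kernel which opcodes this io_uring instance supports. */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>

static void probe_opcodes(int ring_fd)
{
	unsigned nr = 256;	/* upper bound enforced by the kernel */
	size_t len = sizeof(struct io_uring_probe) +
		     nr * sizeof(struct io_uring_probe_op);
	struct io_uring_probe *p = calloc(1, len);	/* must be zeroed */
	int i;

	if (!p)
		return;
	if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_PROBE,
		    p, nr) < 0) {
		free(p);
		return;
	}
	for (i = 0; i < p->ops_len; i++)
		if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
			printf("opcode %d supported\n", p->ops[i].op);
	free(p);
}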
static int io_register_personality(struct io_ring_ctx *ctx)
{
const struct cred *creds = get_current_cred();
int id;
id = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1,
USHRT_MAX, GFP_KERNEL);
if (id < 0)
put_cred(creds);
return id;
}
static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
const struct cred *old_creds;
old_creds = idr_remove(&ctx->personality_idr, id);
if (old_creds) {
put_cred(old_creds);
return 0;
}
return -EINVAL;
}
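A hedged sketch of the userspace flow these two helpers back: register a personality while running with the credentials to be captured, remember the returned id, and later stamp sqe->personality (the new __u16 field added to the SQE in this series) so the request is issued with those creds rather than the submitter's. The sqe pointer is assumed to come from whatever SQ management the application already has.

/* Sketch: capture the caller's current credentials as a personality and
 * apply it to a prepared SQE. Personality id 0 means "use the ring's
 * default credentials".
 */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int register_current_creds(int ring_fd)
{
	/* returns a personality id >= 1, or -1 with errno set */
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_PERSONALITY, NULL, 0);
}

static void sqe_use_personality(struct io_uring_sqe *sqe, int id)
{
	sqe->personality = id;
	/* to drop it later:
	 * syscall(__NR_io_uring_register, ring_fd,
	 *         IORING_UNREGISTER_PERSONALITY, NULL, id);
	 */
}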
static bool io_register_op_must_quiesce(int op)
{
switch (op) {
case IORING_UNREGISTER_FILES:
case IORING_REGISTER_FILES_UPDATE:
case IORING_REGISTER_PROBE:
case IORING_REGISTER_PERSONALITY:
case IORING_UNREGISTER_PERSONALITY:
return false;
default:
return true;
}
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args) void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock) __releases(ctx->uring_lock)
...@@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -5437,18 +6840,26 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
if (percpu_ref_is_dying(&ctx->refs)) if (percpu_ref_is_dying(&ctx->refs))
return -ENXIO; return -ENXIO;
if (io_register_op_must_quiesce(opcode)) {
percpu_ref_kill(&ctx->refs); percpu_ref_kill(&ctx->refs);
/* /*
* Drop uring mutex before waiting for references to exit. If another * Drop uring mutex before waiting for references to exit. If
* thread is currently inside io_uring_enter() it might need to grab * another thread is currently inside io_uring_enter() it might
* the uring_lock to make progress. If we hold it here across the drain * need to grab the uring_lock to make progress. If we hold it
* wait, then we can deadlock. It's safe to drop the mutex here, since * here across the drain wait, then we can deadlock. It's safe
* no new references will come in after we've killed the percpu ref. * to drop the mutex here, since no new references will come in
* after we've killed the percpu ref.
*/ */
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
wait_for_completion(&ctx->completions[0]); ret = wait_for_completion_interruptible(&ctx->completions[0]);
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
if (ret) {
percpu_ref_resurrect(&ctx->refs);
ret = -EINTR;
goto out;
}
}
switch (opcode) { switch (opcode) {
case IORING_REGISTER_BUFFERS: case IORING_REGISTER_BUFFERS:
...@@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -5473,10 +6884,17 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
ret = io_sqe_files_update(ctx, arg, nr_args); ret = io_sqe_files_update(ctx, arg, nr_args);
break; break;
case IORING_REGISTER_EVENTFD: case IORING_REGISTER_EVENTFD:
case IORING_REGISTER_EVENTFD_ASYNC:
ret = -EINVAL; ret = -EINVAL;
if (nr_args != 1) if (nr_args != 1)
break; break;
ret = io_eventfd_register(ctx, arg); ret = io_eventfd_register(ctx, arg);
if (ret)
break;
if (opcode == IORING_REGISTER_EVENTFD_ASYNC)
ctx->eventfd_async = 1;
else
ctx->eventfd_async = 0;
break; break;
case IORING_UNREGISTER_EVENTFD: case IORING_UNREGISTER_EVENTFD:
ret = -EINVAL; ret = -EINVAL;
...@@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -5484,14 +6902,35 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break; break;
ret = io_eventfd_unregister(ctx); ret = io_eventfd_unregister(ctx);
break; break;
case IORING_REGISTER_PROBE:
ret = -EINVAL;
if (!arg || nr_args > 256)
break;
ret = io_probe(ctx, arg, nr_args);
break;
case IORING_REGISTER_PERSONALITY:
ret = -EINVAL;
if (arg || nr_args)
break;
ret = io_register_personality(ctx);
break;
case IORING_UNREGISTER_PERSONALITY:
ret = -EINVAL;
if (arg)
break;
ret = io_unregister_personality(ctx, nr_args);
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
} }
if (io_register_op_must_quiesce(opcode)) {
/* bring the ctx back to life */ /* bring the ctx back to life */
reinit_completion(&ctx->completions[0]);
percpu_ref_reinit(&ctx->refs); percpu_ref_reinit(&ctx->refs);
out:
reinit_completion(&ctx->completions[0]);
}
return ret; return ret;
} }
...@@ -5524,6 +6963,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ...@@ -5524,6 +6963,7 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
static int __init io_uring_init(void) static int __init io_uring_init(void)
{ {
BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
return 0; return 0;
}; };
......
...@@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path); ...@@ -958,7 +958,7 @@ EXPORT_SYMBOL(open_with_fake_path);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE)) #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC) #define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
static inline struct open_how build_open_how(int flags, umode_t mode) inline struct open_how build_open_how(int flags, umode_t mode)
{ {
struct open_how how = { struct open_how how = {
.flags = flags & VALID_OPEN_FLAGS, .flags = flags & VALID_OPEN_FLAGS,
...@@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode) ...@@ -974,8 +974,7 @@ static inline struct open_how build_open_how(int flags, umode_t mode)
return how; return how;
} }
static inline int build_open_flags(const struct open_how *how, inline int build_open_flags(const struct open_how *how, struct open_flags *op)
struct open_flags *op)
{ {
int flags = how->flags; int flags = how->flags;
int lookup_flags = 0; int lookup_flags = 0;
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <asm/unistd.h> #include <asm/unistd.h>
#include "internal.h"
/** /**
* generic_fillattr - Fill in the basic attributes from the inode struct * generic_fillattr - Fill in the basic attributes from the inode struct
* @inode: Inode to use as the source * @inode: Inode to use as the source
...@@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat, ...@@ -150,6 +152,23 @@ int vfs_statx_fd(unsigned int fd, struct kstat *stat,
} }
EXPORT_SYMBOL(vfs_statx_fd); EXPORT_SYMBOL(vfs_statx_fd);
inline int vfs_stat_set_lookup_flags(unsigned *lookup_flags, int flags)
{
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
return -EINVAL;
*lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
if (flags & AT_SYMLINK_NOFOLLOW)
*lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
*lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
*lookup_flags |= LOOKUP_EMPTY;
return 0;
}
/** /**
* vfs_statx - Get basic and extra attributes by filename * vfs_statx - Get basic and extra attributes by filename
* @dfd: A file descriptor representing the base dir for a relative filename * @dfd: A file descriptor representing the base dir for a relative filename
...@@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags, ...@@ -170,19 +189,10 @@ int vfs_statx(int dfd, const char __user *filename, int flags,
{ {
struct path path; struct path path;
int error = -EINVAL; int error = -EINVAL;
unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; unsigned lookup_flags;
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | if (vfs_stat_set_lookup_flags(&lookup_flags, flags))
AT_EMPTY_PATH | KSTAT_QUERY_FLAGS)) != 0)
return -EINVAL; return -EINVAL;
if (flags & AT_SYMLINK_NOFOLLOW)
lookup_flags &= ~LOOKUP_FOLLOW;
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
if (flags & AT_EMPTY_PATH)
lookup_flags |= LOOKUP_EMPTY;
retry: retry:
error = user_path_at(dfd, filename, lookup_flags, &path); error = user_path_at(dfd, filename, lookup_flags, &path);
if (error) if (error)
...@@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename, ...@@ -523,7 +533,7 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
} }
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */ #endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */
static noinline_for_stack int noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer) cp_statx(const struct kstat *stat, struct statx __user *buffer)
{ {
struct statx tmp; struct statx tmp;
......
...@@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file) ...@@ -61,6 +61,15 @@ static inline void eventpoll_release(struct file *file)
eventpoll_release_file(file); eventpoll_release_file(file);
} }
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
bool nonblock);
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
return op != EPOLL_CTL_DEL;
}
#else #else
static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_init_file(struct file *file) {}
......
...@@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t, ...@@ -2323,6 +2323,7 @@ extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf, bool downgrade); struct list_head *uf, bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t, extern int do_munmap(struct mm_struct *, unsigned long, size_t,
struct list_head *uf); struct list_head *uf);
extern int do_madvise(unsigned long start, size_t len_in, int behavior);
static inline unsigned long static inline unsigned long
do_mmap_pgoff(struct file *file, unsigned long addr, do_mmap_pgoff(struct file *file, unsigned long addr,
......
...@@ -210,15 +210,17 @@ static inline void percpu_ref_get(struct percpu_ref *ref) ...@@ -210,15 +210,17 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
} }
/** /**
* percpu_ref_tryget - try to increment a percpu refcount * percpu_ref_tryget_many - try to increment a percpu refcount
* @ref: percpu_ref to try-get * @ref: percpu_ref to try-get
* @nr: number of references to get
* *
* Increment a percpu refcount unless its count already reached zero. * Increment a percpu refcount by @nr unless its count already reached zero.
* Returns %true on success; %false on failure. * Returns %true on success; %false on failure.
* *
* This function is safe to call as long as @ref is between init and exit. * This function is safe to call as long as @ref is between init and exit.
*/ */
static inline bool percpu_ref_tryget(struct percpu_ref *ref) static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
unsigned long nr)
{ {
unsigned long __percpu *percpu_count; unsigned long __percpu *percpu_count;
bool ret; bool ret;
...@@ -226,10 +228,10 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) ...@@ -226,10 +228,10 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
rcu_read_lock(); rcu_read_lock();
if (__ref_is_percpu(ref, &percpu_count)) { if (__ref_is_percpu(ref, &percpu_count)) {
this_cpu_inc(*percpu_count); this_cpu_add(*percpu_count, nr);
ret = true; ret = true;
} else { } else {
ret = atomic_long_inc_not_zero(&ref->count); ret = atomic_long_add_unless(&ref->count, nr, 0);
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -237,6 +239,20 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) ...@@ -237,6 +239,20 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
return ret; return ret;
} }
/**
* percpu_ref_tryget - try to increment a percpu refcount
* @ref: percpu_ref to try-get
*
* Increment a percpu refcount unless its count already reached zero.
* Returns %true on success; %false on failure.
*
* This function is safe to call as long as @ref is between init and exit.
*/
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
return percpu_ref_tryget_many(ref, 1);
}
/** /**
* percpu_ref_tryget_live - try to increment a live percpu refcount * percpu_ref_tryget_live - try to increment a live percpu refcount
* @ref: percpu_ref to try-get * @ref: percpu_ref to try-get
......
...@@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete, ...@@ -320,6 +320,7 @@ TRACE_EVENT(io_uring_complete,
* io_uring_submit_sqe - called before submitting one SQE * io_uring_submit_sqe - called before submitting one SQE
* *
* @ctx: pointer to a ring context structure * @ctx: pointer to a ring context structure
* @opcode: opcode of request
* @user_data: user data associated with the request * @user_data: user data associated with the request
* @force_nonblock: whether or not this is a non-blocking submission * @force_nonblock: whether or not this is a non-blocking submission
* @sq_thread: true if sq_thread has submitted this SQE * @sq_thread: true if sq_thread has submitted this SQE
...@@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete, ...@@ -329,12 +330,14 @@ TRACE_EVENT(io_uring_complete,
*/ */
TRACE_EVENT(io_uring_submit_sqe, TRACE_EVENT(io_uring_submit_sqe,
TP_PROTO(void *ctx, u64 user_data, bool force_nonblock, bool sq_thread), TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock,
bool sq_thread),
TP_ARGS(ctx, user_data, force_nonblock, sq_thread), TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread),
TP_STRUCT__entry ( TP_STRUCT__entry (
__field( void *, ctx ) __field( void *, ctx )
__field( u8, opcode )
__field( u64, user_data ) __field( u64, user_data )
__field( bool, force_nonblock ) __field( bool, force_nonblock )
__field( bool, sq_thread ) __field( bool, sq_thread )
...@@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe, ...@@ -342,13 +345,15 @@ TRACE_EVENT(io_uring_submit_sqe,
TP_fast_assign( TP_fast_assign(
__entry->ctx = ctx; __entry->ctx = ctx;
__entry->opcode = opcode;
__entry->user_data = user_data; __entry->user_data = user_data;
__entry->force_nonblock = force_nonblock; __entry->force_nonblock = force_nonblock;
__entry->sq_thread = sq_thread; __entry->sq_thread = sq_thread;
), ),
TP_printk("ring %p, user data 0x%llx, non block %d, sq_thread %d", TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d",
__entry->ctx, (unsigned long long) __entry->user_data, __entry->ctx, __entry->opcode,
(unsigned long long) __entry->user_data,
__entry->force_nonblock, __entry->sq_thread) __entry->force_nonblock, __entry->sq_thread)
); );
......
...@@ -34,21 +34,43 @@ struct io_uring_sqe { ...@@ -34,21 +34,43 @@ struct io_uring_sqe {
__u32 timeout_flags; __u32 timeout_flags;
__u32 accept_flags; __u32 accept_flags;
__u32 cancel_flags; __u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
}; };
__u64 user_data; /* data to be passed back at completion time */ __u64 user_data; /* data to be passed back at completion time */
union { union {
__u16 buf_index; /* index into fixed buffers, if used */ struct {
/* index into fixed buffers, if used */
__u16 buf_index;
/* personality to use, if used */
__u16 personality;
};
__u64 __pad2[3]; __u64 __pad2[3];
}; };
}; };
enum {
IOSQE_FIXED_FILE_BIT,
IOSQE_IO_DRAIN_BIT,
IOSQE_IO_LINK_BIT,
IOSQE_IO_HARDLINK_BIT,
IOSQE_ASYNC_BIT,
};
/* /*
* sqe->flags * sqe->flags
*/ */
#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ /* use fixed fileset */
#define IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ #define IOSQE_FIXED_FILE (1U << IOSQE_FIXED_FILE_BIT)
#define IOSQE_IO_LINK (1U << 2) /* links next sqe */ /* issue after inflight IO */
#define IOSQE_IO_HARDLINK (1U << 3) /* like LINK, but stronger */ #define IOSQE_IO_DRAIN (1U << IOSQE_IO_DRAIN_BIT)
/* links next sqe */
#define IOSQE_IO_LINK (1U << IOSQE_IO_LINK_BIT)
/* like LINK, but stronger */
#define IOSQE_IO_HARDLINK (1U << IOSQE_IO_HARDLINK_BIT)
/* always go async */
#define IOSQE_ASYNC (1U << IOSQE_ASYNC_BIT)
/* /*
* io_uring_setup() flags * io_uring_setup() flags
...@@ -57,6 +79,8 @@ struct io_uring_sqe { ...@@ -57,6 +79,8 @@ struct io_uring_sqe {
#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ #define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */
#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */
#define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */ #define IORING_SETUP_CQSIZE (1U << 3) /* app defines CQ size */
#define IORING_SETUP_CLAMP (1U << 4) /* clamp SQ/CQ ring sizes */
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
enum { enum {
IORING_OP_NOP, IORING_OP_NOP,
...@@ -76,6 +100,19 @@ enum { ...@@ -76,6 +100,19 @@ enum {
IORING_OP_ASYNC_CANCEL, IORING_OP_ASYNC_CANCEL,
IORING_OP_LINK_TIMEOUT, IORING_OP_LINK_TIMEOUT,
IORING_OP_CONNECT, IORING_OP_CONNECT,
IORING_OP_FALLOCATE,
IORING_OP_OPENAT,
IORING_OP_CLOSE,
IORING_OP_FILES_UPDATE,
IORING_OP_STATX,
IORING_OP_READ,
IORING_OP_WRITE,
IORING_OP_FADVISE,
IORING_OP_MADVISE,
IORING_OP_SEND,
IORING_OP_RECV,
IORING_OP_OPENAT2,
IORING_OP_EPOLL_CTL,
/* this goes last, obviously */ /* this goes last, obviously */
IORING_OP_LAST, IORING_OP_LAST,
...@@ -153,7 +190,8 @@ struct io_uring_params { ...@@ -153,7 +190,8 @@ struct io_uring_params {
__u32 sq_thread_cpu; __u32 sq_thread_cpu;
__u32 sq_thread_idle; __u32 sq_thread_idle;
__u32 features; __u32 features;
__u32 resv[4]; __u32 wq_fd;
__u32 resv[3];
struct io_sqring_offsets sq_off; struct io_sqring_offsets sq_off;
struct io_cqring_offsets cq_off; struct io_cqring_offsets cq_off;
}; };
...@@ -164,6 +202,8 @@ struct io_uring_params { ...@@ -164,6 +202,8 @@ struct io_uring_params {
#define IORING_FEAT_SINGLE_MMAP (1U << 0) #define IORING_FEAT_SINGLE_MMAP (1U << 0)
#define IORING_FEAT_NODROP (1U << 1) #define IORING_FEAT_NODROP (1U << 1)
#define IORING_FEAT_SUBMIT_STABLE (1U << 2) #define IORING_FEAT_SUBMIT_STABLE (1U << 2)
#define IORING_FEAT_RW_CUR_POS (1U << 3)
#define IORING_FEAT_CUR_PERSONALITY (1U << 4)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
...@@ -175,6 +215,10 @@ struct io_uring_params { ...@@ -175,6 +215,10 @@ struct io_uring_params {
#define IORING_REGISTER_EVENTFD 4 #define IORING_REGISTER_EVENTFD 4
#define IORING_UNREGISTER_EVENTFD 5 #define IORING_UNREGISTER_EVENTFD 5
#define IORING_REGISTER_FILES_UPDATE 6 #define IORING_REGISTER_FILES_UPDATE 6
#define IORING_REGISTER_EVENTFD_ASYNC 7
#define IORING_REGISTER_PROBE 8
#define IORING_REGISTER_PERSONALITY 9
#define IORING_UNREGISTER_PERSONALITY 10
struct io_uring_files_update { struct io_uring_files_update {
__u32 offset; __u32 offset;
...@@ -182,4 +226,21 @@ struct io_uring_files_update { ...@@ -182,4 +226,21 @@ struct io_uring_files_update {
__aligned_u64 /* __s32 * */ fds; __aligned_u64 /* __s32 * */ fds;
}; };
#define IO_URING_OP_SUPPORTED (1U << 0)
struct io_uring_probe_op {
__u8 op;
__u8 resv;
__u16 flags; /* IO_URING_OP_* flags */
__u32 resv2;
};
struct io_uring_probe {
__u8 last_op; /* last opcode supported */
__u8 ops_len; /* length of ops[] array below */
__u16 resv;
__u32 resv2[3];
struct io_uring_probe_op ops[0];
};
#endif #endif
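To tie the new UAPI together, a hedged sketch using one of the new opcodes from userspace: a non-vectored IORING_OP_READ forced to punt with IOSQE_ASYNC. It assumes a liburing recent enough to provide io_uring_prep_read() and that the kernel carries this series (an older kernel would complete the request with -EINVAL); the file path is purely illustrative.

/* Sketch: read the first 4KB of a file with the new non-vectored READ
 * opcode, requesting async punt via IOSQE_ASYNC. Error handling is
 * abbreviated.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];
	int fd;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;
	fd = open("/etc/hostname", O_RDONLY);	/* illustrative path */
	if (fd < 0)
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
	io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);	/* always go async */

	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read returned %d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
}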
...@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior) ...@@ -1044,7 +1044,7 @@ madvise_behavior_valid(int behavior)
* -EBADF - map exists, but area maps something that isn't a file. * -EBADF - map exists, but area maps something that isn't a file.
* -EAGAIN - a kernel resource was temporarily unavailable. * -EAGAIN - a kernel resource was temporarily unavailable.
*/ */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int do_madvise(unsigned long start, size_t len_in, int behavior)
{ {
unsigned long end, tmp; unsigned long end, tmp;
struct vm_area_struct *vma, *prev; struct vm_area_struct *vma, *prev;
...@@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) ...@@ -1141,3 +1141,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return error; return error;
} }
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
return do_madvise(start, len_in, behavior);
}