Commit 3a4d319a authored by Linus Torvalds

Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - NAPI fixes and cleanups (Pavel, Olivier)

 - Add support for absolute timeouts (Pavel)

 - Fixes for io-wq/sqpoll affinities (Felix)

 - Efficiency improvements for dealing with huge pages (Chenliang)

 - Support for a minwait mode, where the application essentially has two
   timeouts - one smaller one that defines the batch timeout, and the
   overall larger one similar to what we had before. This enables
   efficient use of batching based on count + timeout, while still
   working well during periods of less intensive workloads (see the
   sketch after this list)

 - Use ITER_UBUF for single-segment sends

 - Add support for incremental buffer consumption. Right now each
   operation will always consume a full buffer. With incremental
   consumption, a recv/read operation only consumes the part of the
   buffer that it needs to satisfy the operation

 - Add support for GCOV for io_uring, to help maintain a high
   test-to-code coverage ratio

 - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly
   converted to a blocking retry

 - Add support for cloning registered buffers from one ring to another

 - Misc cleanups (Anuj, me)
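
To make the new wait interfaces concrete, here is a minimal userspace
sketch (not part of this series) that registers CLOCK_BOOTTIME as the
ring clock and then waits with both a batch timeout and an absolute
overall deadline. It relies only on the raw syscalls and the uapi
additions shown further down in this diff; the wait_batch() wrapper
itself is made up for illustration, the one-off clock registration is
shown inline for brevity, and error handling is omitted:

    #include <linux/io_uring.h>
    #include <linux/time_types.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static int wait_batch(int ring_fd, unsigned int want,
                          unsigned int batch_usec,
                          const struct __kernel_timespec *deadline)
    {
        struct io_uring_clock_register clk = { .clockid = CLOCK_BOOTTIME };
        struct io_uring_getevents_arg arg = {
            /* smaller batch timeout, in usec; advertised via
             * IORING_FEAT_MIN_TIMEOUT */
            .min_wait_usec = batch_usec,
            /* overall timeout, absolute due to IORING_ENTER_ABS_TIMER */
            .ts = (unsigned long long)(uintptr_t)deadline,
        };

        /* have absolute timeouts interpreted against CLOCK_BOOTTIME */
        if (syscall(__NR_io_uring_register, ring_fd, IORING_REGISTER_CLOCK,
                    &clk, 0) < 0)
            return -1;

        return syscall(__NR_io_uring_enter, ring_fd, 0, want,
                       IORING_ENTER_GETEVENTS | IORING_ENTER_EXT_ARG |
                       IORING_ENTER_ABS_TIMER, &arg, sizeof(arg));
    }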

* tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits)
  io_uring: add IORING_REGISTER_COPY_BUFFERS method
  io_uring/register: provide helper to get io_ring_ctx from 'fd'
  io_uring/rsrc: add reference count to struct io_mapped_ubuf
  io_uring/rsrc: clear 'slot' entry upfront
  io_uring/io-wq: inherit cpuset of cgroup in io worker
  io_uring/io-wq: do not allow pinning outside of cpuset
  io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common()
  io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN
  io_uring/sqpoll: do not allow pinning outside of cpuset
  io_uring/eventfd: move refs to refcount_t
  io_uring: remove unused rsrc_put_fn
  io_uring: add new line after variable declaration
  io_uring: add GCOV_PROFILE_URING Kconfig option
  io_uring/kbuf: add support for incremental buffer consumption
  io_uring/kbuf: pass in 'len' argument for buffer commit
  Revert "io_uring: Require zeroed sqe->len on provided-buffers send"
  io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h
  io_uring/kbuf: add io_kbuf_commit() helper
  io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg
  io_uring: wire up min batch wake timeout
  ...
parents 69a3a0a4 7cc2a6ea
...@@ -239,6 +239,9 @@ struct io_ring_ctx { ...@@ -239,6 +239,9 @@ struct io_ring_ctx {
struct io_rings *rings; struct io_rings *rings;
struct percpu_ref refs; struct percpu_ref refs;
clockid_t clockid;
enum tk_offsets clock_offset;
enum task_work_notify_mode notify_method; enum task_work_notify_mode notify_method;
unsigned sq_thread_idle; unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
......
...@@ -440,11 +440,21 @@ struct io_uring_cqe { ...@@ -440,11 +440,21 @@ struct io_uring_cqe {
* IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv
* IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct
* them from sends. * them from sends.
* IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
* more completions. In other words, the buffer is being
* partially consumed, and will be used by the kernel for
* more completions. This is only set for buffers used via
* the incremental buffer consumption, as provided by
* a ring buffer setup with IOU_PBUF_RING_INC. For any
* other provided buffer type, all completions with a
* buffer passed back is automatically returned to the
* application.
*/ */
#define IORING_CQE_F_BUFFER (1U << 0) #define IORING_CQE_F_BUFFER (1U << 0)
#define IORING_CQE_F_MORE (1U << 1) #define IORING_CQE_F_MORE (1U << 1)
#define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2)
#define IORING_CQE_F_NOTIF (1U << 3) #define IORING_CQE_F_NOTIF (1U << 3)
#define IORING_CQE_F_BUF_MORE (1U << 4)
#define IORING_CQE_BUFFER_SHIFT 16 #define IORING_CQE_BUFFER_SHIFT 16
...@@ -507,6 +517,7 @@ struct io_cqring_offsets { ...@@ -507,6 +517,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2) #define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3) #define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4) #define IORING_ENTER_REGISTERED_RING (1U << 4)
#define IORING_ENTER_ABS_TIMER (1U << 5)
/* /*
* Passed in for io_uring_setup(2). Copied back with updated info on success * Passed in for io_uring_setup(2). Copied back with updated info on success
...@@ -542,6 +553,7 @@ struct io_uring_params { ...@@ -542,6 +553,7 @@ struct io_uring_params {
#define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_LINKED_FILE (1U << 12)
#define IORING_FEAT_REG_REG_RING (1U << 13) #define IORING_FEAT_REG_REG_RING (1U << 13)
#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) #define IORING_FEAT_RECVSEND_BUNDLE (1U << 14)
#define IORING_FEAT_MIN_TIMEOUT (1U << 15)
/* /*
* io_uring_register(2) opcodes and arguments * io_uring_register(2) opcodes and arguments
...@@ -595,6 +607,11 @@ enum io_uring_register_op { ...@@ -595,6 +607,11 @@ enum io_uring_register_op {
IORING_REGISTER_NAPI = 27, IORING_REGISTER_NAPI = 27,
IORING_UNREGISTER_NAPI = 28, IORING_UNREGISTER_NAPI = 28,
IORING_REGISTER_CLOCK = 29,
/* copy registered buffers from source ring to current ring */
IORING_REGISTER_COPY_BUFFERS = 30,
/* this goes last */ /* this goes last */
IORING_REGISTER_LAST, IORING_REGISTER_LAST,
...@@ -675,6 +692,21 @@ struct io_uring_restriction { ...@@ -675,6 +692,21 @@ struct io_uring_restriction {
__u32 resv2[3]; __u32 resv2[3];
}; };
struct io_uring_clock_register {
__u32 clockid;
__u32 __resv[3];
};
enum {
IORING_REGISTER_SRC_REGISTERED = 1,
};
struct io_uring_copy_buffers {
__u32 src_fd;
__u32 flags;
__u32 pad[6];
};
struct io_uring_buf { struct io_uring_buf {
__u64 addr; __u64 addr;
__u32 len; __u32 len;
...@@ -707,9 +739,17 @@ struct io_uring_buf_ring { ...@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
* mmap(2) with the offset set as: * mmap(2) with the offset set as:
* IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
* to get a virtual mapping for the ring. * to get a virtual mapping for the ring.
* IOU_PBUF_RING_INC: If set, buffers consumed from this buffer ring can be
* consumed incrementally. Normally one (or more) buffers
* are fully consumed. With incremental consumptions, it's
* feasible to register big ranges of buffers, and each
* use of it will consume only as much as it needs. This
* requires that both the kernel and application keep
* track of where the current read/recv index is at.
*/ */
enum io_uring_register_pbuf_ring_flags { enum io_uring_register_pbuf_ring_flags {
IOU_PBUF_RING_MMAP = 1, IOU_PBUF_RING_MMAP = 1,
IOU_PBUF_RING_INC = 2,
}; };
/* argument for IORING_(UN)REGISTER_PBUF_RING */ /* argument for IORING_(UN)REGISTER_PBUF_RING */
...@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op { ...@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
struct io_uring_getevents_arg { struct io_uring_getevents_arg {
__u64 sigmask; __u64 sigmask;
__u32 sigmask_sz; __u32 sigmask_sz;
__u32 pad; __u32 min_wait_usec;
__u64 ts; __u64 ts;
}; };
......
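
For reference, a rough application-side sketch of how the incremental
consumption flag and the new CQE flag above are intended to be used. It
assumes the io_uring_buf_ring at 'br' is already mmap'ed, page aligned
and populated with one large buffer, and that consume_data() and
buffer_returned() are hypothetical application hooks:

    #include <linux/io_uring.h>
    #include <stdint.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* hypothetical application hooks, not part of io_uring */
    void consume_data(unsigned int bid, int bytes);
    void buffer_returned(unsigned int bid);

    static int setup_inc_ring(int ring_fd, struct io_uring_buf_ring *br)
    {
        struct io_uring_buf_reg reg = {
            .ring_addr    = (unsigned long long)(uintptr_t)br,
            .ring_entries = 1,    /* one big buffer can back many receives */
            .bgid         = 0,
            .flags        = IOU_PBUF_RING_INC,
        };

        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_PBUF_RING, &reg, 1);
    }

    static void handle_cqe(struct io_uring_cqe *cqe)
    {
        if (cqe->res < 0 || !(cqe->flags & IORING_CQE_F_BUFFER))
            return;

        consume_data(cqe->flags >> IORING_CQE_BUFFER_SHIFT, cqe->res);

        /*
         * With IOU_PBUF_RING_INC the kernel keeps the unconsumed tail of
         * the buffer and signals that via IORING_CQE_F_BUF_MORE; only once
         * that flag is clear is the buffer back with the application.
         */
        if (!(cqe->flags & IORING_CQE_F_BUF_MORE))
            buffer_returned(cqe->flags >> IORING_CQE_BUFFER_SHIFT);
    }
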
...@@ -1687,6 +1687,19 @@ config IO_URING ...@@ -1687,6 +1687,19 @@ config IO_URING
applications to submit and complete IO through submission and applications to submit and complete IO through submission and
completion rings that are shared between the kernel and application. completion rings that are shared between the kernel and application.
config GCOV_PROFILE_URING
bool "Enable GCOV profiling on the io_uring subsystem"
depends on GCOV_KERNEL
help
Enable GCOV profiling on the io_uring subsystem, to facilitate
code coverage testing.
If unsure, say N.
Note that this will have a negative impact on the performance of
the io_uring subsystem, hence this should only be enabled for
specific test purposes.
config ADVISE_SYSCALLS config ADVISE_SYSCALLS
bool "Enable madvise/fadvise syscalls" if EXPERT bool "Enable madvise/fadvise syscalls" if EXPERT
default y default y
......
...@@ -2,6 +2,10 @@ ...@@ -2,6 +2,10 @@
# #
# Makefile for io_uring # Makefile for io_uring
ifdef CONFIG_GCOV_PROFILE_URING
GCOV_PROFILE := y
endif
obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
tctx.o filetable.o rw.o net.o poll.o \ tctx.o filetable.o rw.o net.o poll.o \
eventfd.o uring_cmd.o openclose.o \ eventfd.o uring_cmd.o openclose.o \
......
...@@ -15,7 +15,7 @@ struct io_ev_fd { ...@@ -15,7 +15,7 @@ struct io_ev_fd {
struct eventfd_ctx *cq_ev_fd; struct eventfd_ctx *cq_ev_fd;
unsigned int eventfd_async: 1; unsigned int eventfd_async: 1;
struct rcu_head rcu; struct rcu_head rcu;
atomic_t refs; refcount_t refs;
atomic_t ops; atomic_t ops;
}; };
...@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) ...@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)
eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
io_eventfd_free(rcu); io_eventfd_free(rcu);
} }
...@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) ...@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
*/ */
if (unlikely(!ev_fd)) if (unlikely(!ev_fd))
return; return;
if (!atomic_inc_not_zero(&ev_fd->refs)) if (!refcount_inc_not_zero(&ev_fd->refs))
return; return;
if (ev_fd->eventfd_async && !io_wq_current_is_worker()) if (ev_fd->eventfd_async && !io_wq_current_is_worker())
goto out; goto out;
...@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx) ...@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
} }
} }
out: out:
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free); call_rcu(&ev_fd->rcu, io_eventfd_free);
} }
...@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
if (IS_ERR(ev_fd->cq_ev_fd)) { if (IS_ERR(ev_fd->cq_ev_fd)) {
int ret = PTR_ERR(ev_fd->cq_ev_fd); int ret = PTR_ERR(ev_fd->cq_ev_fd);
kfree(ev_fd); kfree(ev_fd);
return ret; return ret;
} }
...@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, ...@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
ev_fd->eventfd_async = eventfd_async; ev_fd->eventfd_async = eventfd_async;
ctx->has_evfd = true; ctx->has_evfd = true;
atomic_set(&ev_fd->refs, 1); refcount_set(&ev_fd->refs, 1);
atomic_set(&ev_fd->ops, 0); atomic_set(&ev_fd->ops, 0);
rcu_assign_pointer(ctx->io_ev_fd, ev_fd); rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
return 0; return 0;
...@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) ...@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
if (ev_fd) { if (ev_fd) {
ctx->has_evfd = false; ctx->has_evfd = false;
rcu_assign_pointer(ctx->io_ev_fd, NULL); rcu_assign_pointer(ctx->io_ev_fd, NULL);
if (atomic_dec_and_test(&ev_fd->refs)) if (refcount_dec_and_test(&ev_fd->refs))
call_rcu(&ev_fd->rcu, io_eventfd_free); call_rcu(&ev_fd->rcu, io_eventfd_free);
return 0; return 0;
} }
......
...@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) ...@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
cqe->user_data, cqe->res, cqe->flags); cqe->user_data, cqe->res, cqe->flags);
} }
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
#ifdef CONFIG_NET_RX_BUSY_POLL
if (ctx->napi_enabled) {
seq_puts(m, "NAPI:\tenabled\n");
seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
if (ctx->napi_prefer_busy_poll)
seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
else
seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
} else {
seq_puts(m, "NAPI:\tdisabled\n");
}
#endif
} }
#endif #endif
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/rculist_nulls.h> #include <linux/rculist_nulls.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/task_work.h> #include <linux/task_work.h>
#include <linux/audit.h> #include <linux/audit.h>
#include <linux/mmu_context.h> #include <linux/mmu_context.h>
...@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) ...@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL)) if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
goto err; goto err;
cpumask_copy(wq->cpu_mask, cpu_possible_mask); cpuset_cpus_allowed(data->task, wq->cpu_mask);
wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
wq->acct[IO_WQ_ACCT_UNBOUND].max_workers = wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
task_rlimit(current, RLIMIT_NPROC); task_rlimit(current, RLIMIT_NPROC);
...@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node) ...@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)
int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask) int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
{ {
cpumask_var_t allowed_mask;
int ret = 0;
if (!tctx || !tctx->io_wq) if (!tctx || !tctx->io_wq)
return -EINVAL; return -EINVAL;
if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
return -ENOMEM;
rcu_read_lock(); rcu_read_lock();
if (mask) cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
cpumask_copy(tctx->io_wq->cpu_mask, mask); if (mask) {
else if (cpumask_subset(mask, allowed_mask))
cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask); cpumask_copy(tctx->io_wq->cpu_mask, mask);
else
ret = -EINVAL;
} else {
cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
}
rcu_read_unlock(); rcu_read_unlock();
return 0; free_cpumask_var(allowed_mask);
return ret;
} }
/* /*
......
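
The affinity change above, in userspace terms: a mask passed to
IORING_REGISTER_IOWQ_AFF (the pre-existing opcode, only the cpuset
restriction is new) now has to be a subset of the task's cpuset. A
hedged sketch, with the CPU number picked arbitrarily:

    #define _GNU_SOURCE
    #include <linux/io_uring.h>
    #include <sched.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /*
     * Pin the io-wq workers of 'ring_fd' to CPU 2. With this series the
     * request fails with EINVAL if the mask falls outside the task's
     * cpuset, instead of silently pinning outside of it.
     */
    static int pin_iowq(int ring_fd)
    {
        cpu_set_t mask;

        CPU_ZERO(&mask);
        CPU_SET(2, &mask);
        return syscall(__NR_io_uring_register, ring_fd,
                       IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
    }
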
...@@ -39,8 +39,12 @@ struct io_wait_queue { ...@@ -39,8 +39,12 @@ struct io_wait_queue {
struct wait_queue_entry wq; struct wait_queue_entry wq;
struct io_ring_ctx *ctx; struct io_ring_ctx *ctx;
unsigned cq_tail; unsigned cq_tail;
unsigned cq_min_tail;
unsigned nr_timeouts; unsigned nr_timeouts;
int hit_timeout;
ktime_t min_timeout;
ktime_t timeout; ktime_t timeout;
struct hrtimer t;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
ktime_t napi_busy_poll_dt; ktime_t napi_busy_poll_dt;
...@@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req) ...@@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
return false; return false;
} }
static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
{
if (ctx->clockid == CLOCK_MONOTONIC)
return ktime_get();
return ktime_get_with_offset(ctx->clock_offset);
}
enum { enum {
IO_CHECK_CQ_OVERFLOW_BIT, IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT, IO_CHECK_CQ_DROPPED_BIT,
......
...@@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) ...@@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
return true; return true;
} }
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
{ {
/* /*
* We can add this buffer back to two lists: * We can add this buffer back to two lists:
...@@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags) ...@@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock); spin_lock(&ctx->completion_lock);
__io_put_kbuf_list(req, &ctx->io_buffers_comp); __io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock); spin_unlock(&ctx->completion_lock);
} else { } else {
lockdep_assert_held(&req->ctx->uring_lock); lockdep_assert_held(&req->ctx->uring_lock);
__io_put_kbuf_list(req, &req->ctx->io_buffers_cache); __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
} }
} }
...@@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, ...@@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
return 1; return 1;
} }
static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
__u16 head, __u16 mask)
{
return &br->bufs[head & mask];
}
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl, struct io_buffer_list *bl,
unsigned int issue_flags) unsigned int issue_flags)
...@@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, ...@@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
* the transfer completes (or if we get -EAGAIN and must poll of * the transfer completes (or if we get -EAGAIN and must poll of
* retry). * retry).
*/ */
req->flags &= ~REQ_F_BUFFERS_COMMIT; io_kbuf_commit(req, bl, *len, 1);
req->buf_list = NULL; req->buf_list = NULL;
bl->head++;
} }
return u64_to_user_ptr(buf->addr); return u64_to_user_ptr(buf->addr);
} }
...@@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, ...@@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
bl = io_buffer_get_list(ctx, req->buf_index); bl = io_buffer_get_list(ctx, req->buf_index);
if (likely(bl)) { if (likely(bl)) {
if (bl->is_buf_ring) if (bl->flags & IOBL_BUF_RING)
ret = io_ring_buffer_select(req, len, bl, issue_flags); ret = io_ring_buffer_select(req, len, bl, issue_flags);
else else
ret = io_provided_buffer_select(req, len, bl); ret = io_provided_buffer_select(req, len, bl);
...@@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, ...@@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
buf = io_ring_head_to_buf(br, head, bl->mask); buf = io_ring_head_to_buf(br, head, bl->mask);
if (arg->max_len) { if (arg->max_len) {
u32 len = READ_ONCE(buf->len); u32 len = READ_ONCE(buf->len);
size_t needed;
if (unlikely(!len)) if (unlikely(!len))
return -ENOBUFS; return -ENOBUFS;
needed = (arg->max_len + len - 1) / len; /*
needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); * Limit incremental buffers to 1 segment. No point trying
if (nr_avail > needed) * to peek ahead and map more than we need, when the buffers
nr_avail = needed; * themselves should be large when setup with
* IOU_PBUF_RING_INC.
*/
if (bl->flags & IOBL_INC) {
nr_avail = 1;
} else {
size_t needed;
needed = (arg->max_len + len - 1) / len;
needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
if (nr_avail > needed)
nr_avail = needed;
}
} }
/* /*
...@@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, ...@@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
req->buf_index = buf->bid; req->buf_index = buf->bid;
do { do {
/* truncate end piece, if needed */ u32 len = buf->len;
if (buf->len > arg->max_len)
buf->len = arg->max_len; /* truncate end piece, if needed, for non partial buffers */
if (len > arg->max_len) {
len = arg->max_len;
if (!(bl->flags & IOBL_INC))
buf->len = len;
}
iov->iov_base = u64_to_user_ptr(buf->addr); iov->iov_base = u64_to_user_ptr(buf->addr);
iov->iov_len = buf->len; iov->iov_len = len;
iov++; iov++;
arg->out_len += buf->len; arg->out_len += len;
arg->max_len -= buf->len; arg->max_len -= len;
if (!arg->max_len) if (!arg->max_len)
break; break;
...@@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, ...@@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
if (unlikely(!bl)) if (unlikely(!bl))
goto out_unlock; goto out_unlock;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl); ret = io_ring_buffers_peek(req, arg, bl);
/* /*
* Don't recycle these buffers if we need to go through poll. * Don't recycle these buffers if we need to go through poll.
...@@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, ...@@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
* committed them, they cannot be put back in the queue. * committed them, they cannot be put back in the queue.
*/ */
if (ret > 0) { if (ret > 0) {
req->flags |= REQ_F_BL_NO_RECYCLE; req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
req->buf_list->head += ret; io_kbuf_commit(req, bl, arg->out_len, ret);
} }
} else { } else {
ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
...@@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) ...@@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
if (unlikely(!bl)) if (unlikely(!bl))
return -ENOENT; return -ENOENT;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = io_ring_buffers_peek(req, arg, bl); ret = io_ring_buffers_peek(req, arg, bl);
if (ret > 0) if (ret > 0)
req->flags |= REQ_F_BUFFERS_COMMIT; req->flags |= REQ_F_BUFFERS_COMMIT;
...@@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, ...@@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (!nbufs) if (!nbufs)
return 0; return 0;
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head; i = bl->buf_ring->tail - bl->head;
if (bl->buf_nr_pages) { if (bl->buf_nr_pages) {
int j; int j;
if (!bl->is_mmap) { if (!(bl->flags & IOBL_MMAP)) {
for (j = 0; j < bl->buf_nr_pages; j++) for (j = 0; j < bl->buf_nr_pages; j++)
unpin_user_page(bl->buf_pages[j]); unpin_user_page(bl->buf_pages[j]);
} }
io_pages_unmap(bl->buf_ring, &bl->buf_pages, io_pages_unmap(bl->buf_ring, &bl->buf_pages,
&bl->buf_nr_pages, bl->is_mmap); &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
bl->is_mmap = 0; bl->flags &= ~IOBL_MMAP;
} }
/* make sure it's seen as empty */ /* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list); INIT_LIST_HEAD(&bl->buf_list);
bl->is_buf_ring = 0; bl->flags &= ~IOBL_BUF_RING;
return i; return i;
} }
...@@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) ...@@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
if (bl) { if (bl) {
ret = -EINVAL; ret = -EINVAL;
/* can't use provide/remove buffers command on mapped buffers */ /* can't use provide/remove buffers command on mapped buffers */
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
ret = __io_remove_buffers(ctx, bl, p->nbufs); ret = __io_remove_buffers(ctx, bl, p->nbufs);
} }
io_ring_submit_unlock(ctx, issue_flags); io_ring_submit_unlock(ctx, issue_flags);
...@@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) ...@@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
} }
} }
/* can't add buffers via this command for a mapped buffer ring */ /* can't add buffers via this command for a mapped buffer ring */
if (bl->is_buf_ring) { if (bl->flags & IOBL_BUF_RING) {
ret = -EINVAL; ret = -EINVAL;
goto err; goto err;
} }
...@@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, ...@@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
bl->buf_pages = pages; bl->buf_pages = pages;
bl->buf_nr_pages = nr_pages; bl->buf_nr_pages = nr_pages;
bl->buf_ring = br; bl->buf_ring = br;
bl->is_buf_ring = 1; bl->flags |= IOBL_BUF_RING;
bl->is_mmap = 0; bl->flags &= ~IOBL_MMAP;
return 0; return 0;
error_unpin: error_unpin:
unpin_user_pages(pages, nr_pages); unpin_user_pages(pages, nr_pages);
...@@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, ...@@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
return -ENOMEM; return -ENOMEM;
} }
bl->is_buf_ring = 1; bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
bl->is_mmap = 1;
return 0; return 0;
} }
...@@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (reg.resv[0] || reg.resv[1] || reg.resv[2]) if (reg.resv[0] || reg.resv[1] || reg.resv[2])
return -EINVAL; return -EINVAL;
if (reg.flags & ~IOU_PBUF_RING_MMAP) if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL; return -EINVAL;
if (!(reg.flags & IOU_PBUF_RING_MMAP)) { if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
if (!reg.ring_addr) if (!reg.ring_addr)
...@@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (bl) { if (bl) {
/* if mapped buffer ring OR classic exists, don't allow */ /* if mapped buffer ring OR classic exists, don't allow */
if (bl->is_buf_ring || !list_empty(&bl->buf_list)) if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST; return -EEXIST;
} else { } else {
free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
...@@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!ret) { if (!ret) {
bl->nr_entries = reg.ring_entries; bl->nr_entries = reg.ring_entries;
bl->mask = reg.ring_entries - 1; bl->mask = reg.ring_entries - 1;
if (reg.flags & IOU_PBUF_RING_INC)
bl->flags |= IOBL_INC;
io_buffer_add_list(ctx, bl, reg.bgid); io_buffer_add_list(ctx, bl, reg.bgid);
return 0; return 0;
...@@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) ...@@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, reg.bgid); bl = io_buffer_get_list(ctx, reg.bgid);
if (!bl) if (!bl)
return -ENOENT; return -ENOENT;
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL; return -EINVAL;
xa_erase(&ctx->io_bl_xa, bl->bgid); xa_erase(&ctx->io_bl_xa, bl->bgid);
...@@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) ...@@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
bl = io_buffer_get_list(ctx, buf_status.buf_group); bl = io_buffer_get_list(ctx, buf_status.buf_group);
if (!bl) if (!bl)
return -ENOENT; return -ENOENT;
if (!bl->is_buf_ring) if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL; return -EINVAL;
buf_status.head = bl->head; buf_status.head = bl->head;
...@@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, ...@@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
bl = xa_load(&ctx->io_bl_xa, bgid); bl = xa_load(&ctx->io_bl_xa, bgid);
/* must be a mmap'able buffer ring and have pages */ /* must be a mmap'able buffer ring and have pages */
ret = false; ret = false;
if (bl && bl->is_mmap) if (bl && bl->flags & IOBL_MMAP)
ret = atomic_inc_not_zero(&bl->refs); ret = atomic_inc_not_zero(&bl->refs);
rcu_read_unlock(); rcu_read_unlock();
......
...@@ -4,6 +4,16 @@ ...@@ -4,6 +4,16 @@
#include <uapi/linux/io_uring.h> #include <uapi/linux/io_uring.h>
enum {
/* ring mapped provided buffers */
IOBL_BUF_RING = 1,
/* ring mapped provided buffers, but mmap'ed by application */
IOBL_MMAP = 2,
/* buffers are consumed incrementally rather than always fully */
IOBL_INC = 4,
};
struct io_buffer_list { struct io_buffer_list {
/* /*
* If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not, * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
...@@ -25,12 +35,9 @@ struct io_buffer_list { ...@@ -25,12 +35,9 @@ struct io_buffer_list {
__u16 head; __u16 head;
__u16 mask; __u16 mask;
atomic_t refs; __u16 flags;
/* ring mapped provided buffers */ atomic_t refs;
__u8 is_buf_ring;
/* ring mapped provided buffers, but mmap'ed by application */
__u8 is_mmap;
}; };
struct io_buffer { struct io_buffer {
...@@ -52,8 +59,8 @@ struct buf_sel_arg { ...@@ -52,8 +59,8 @@ struct buf_sel_arg {
struct iovec *iovs; struct iovec *iovs;
size_t out_len; size_t out_len;
size_t max_len; size_t max_len;
int nr_iovs; unsigned short nr_iovs;
int mode; unsigned short mode;
}; };
void __user *io_buffer_select(struct io_kiocb *req, size_t *len, void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
...@@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); ...@@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
...@@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) ...@@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false; return false;
} }
static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) /* Mapped buffer ring, return io_uring_buf from head */
#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)]
static inline bool io_kbuf_commit(struct io_kiocb *req,
struct io_buffer_list *bl, int len, int nr)
{
if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
return true;
req->flags &= ~REQ_F_BUFFERS_COMMIT;
if (unlikely(len < 0))
return true;
if (bl->flags & IOBL_INC) {
struct io_uring_buf *buf;
buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
if (WARN_ON_ONCE(len > buf->len))
len = buf->len;
buf->len -= len;
if (buf->len) {
buf->addr += len;
return false;
}
}
bl->head += nr;
return true;
}
static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
{ {
struct io_buffer_list *bl = req->buf_list; struct io_buffer_list *bl = req->buf_list;
bool ret = true;
if (bl) { if (bl) {
if (req->flags & REQ_F_BUFFERS_COMMIT) { ret = io_kbuf_commit(req, bl, len, nr);
bl->head += nr;
req->flags &= ~REQ_F_BUFFERS_COMMIT;
}
req->buf_index = bl->bgid; req->buf_index = bl->bgid;
} }
req->flags &= ~REQ_F_BUFFER_RING; req->flags &= ~REQ_F_BUFFER_RING;
return ret;
} }
static inline void __io_put_kbuf_list(struct io_kiocb *req, static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
struct list_head *list) struct list_head *list)
{ {
if (req->flags & REQ_F_BUFFER_RING) { if (req->flags & REQ_F_BUFFER_RING) {
__io_put_kbuf_ring(req, 1); __io_put_kbuf_ring(req, len, 1);
} else { } else {
req->buf_index = req->kbuf->bgid; req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list); list_add(&req->kbuf->list, list);
...@@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req) ...@@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return; return;
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp); /* len == 0 is fine here, non-ring will always drop all of it */
__io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp);
} }
static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
unsigned issue_flags) int nbufs, unsigned issue_flags)
{ {
unsigned int ret; unsigned int ret;
...@@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, ...@@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
return 0; return 0;
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING) if (req->flags & REQ_F_BUFFER_RING) {
__io_put_kbuf_ring(req, nbufs); if (!__io_put_kbuf_ring(req, len, nbufs))
else ret |= IORING_CQE_F_BUF_MORE;
__io_put_kbuf(req, issue_flags); } else {
__io_put_kbuf(req, len, issue_flags);
}
return ret; return ret;
} }
static inline unsigned int io_put_kbuf(struct io_kiocb *req, static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
unsigned issue_flags) unsigned issue_flags)
{ {
return __io_put_kbufs(req, 1, issue_flags); return __io_put_kbufs(req, len, 1, issue_flags);
} }
static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
unsigned issue_flags) int nbufs, unsigned issue_flags)
{ {
return __io_put_kbufs(req, nbufs, issue_flags); return __io_put_kbufs(req, len, nbufs, issue_flags);
} }
#endif #endif
...@@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) ...@@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
return 0; return 0;
} }
/*
* __io_napi_adjust_timeout() - adjust busy loop timeout
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
* @ts: pointer to timespec or NULL
*
* Adjust the busy loop timeout according to timespec and busy poll timeout.
* If the specified NAPI timeout is bigger than the wait timeout, then adjust
* the NAPI timeout accordingly.
*/
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
ktime_t to_wait)
{
ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
if (to_wait)
poll_dt = min(poll_dt, to_wait);
iowq->napi_busy_poll_dt = poll_dt;
}
/* /*
* __io_napi_busy_loop() - execute busy poll loop * __io_napi_busy_loop() - execute busy poll loop
* @ctx: pointer to io-uring context structure * @ctx: pointer to io-uring context structure
...@@ -299,10 +278,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow ...@@ -299,10 +278,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
*/ */
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{ {
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); if (ctx->flags & IORING_SETUP_SQPOLL)
return;
if (!(ctx->flags & IORING_SETUP_SQPOLL)) iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
io_napi_blocking_busy_loop(ctx, iowq); if (iowq->timeout != KTIME_MAX) {
ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
}
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
io_napi_blocking_busy_loop(ctx, iowq);
} }
/* /*
......
...@@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); ...@@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq, ktime_t to_wait);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
...@@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) ...@@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
return !list_empty(&ctx->napi_list); return !list_empty(&ctx->napi_list);
} }
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t to_wait)
{
if (!io_napi(ctx))
return;
__io_napi_adjust_timeout(ctx, iowq, to_wait);
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq) struct io_wait_queue *iowq)
{ {
...@@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx) ...@@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
static inline void io_napi_add(struct io_kiocb *req) static inline void io_napi_add(struct io_kiocb *req)
{ {
} }
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t to_wait)
{
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq) struct io_wait_queue *iowq)
{ {
......
...@@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ...@@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
sr->buf_group = req->buf_index; sr->buf_group = req->buf_index;
req->buf_list = NULL; req->buf_list = NULL;
} }
if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
return -EINVAL;
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
if (req->ctx->compat) if (req->ctx->compat)
...@@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, ...@@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
unsigned int cflags; unsigned int cflags;
if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
cflags = io_put_kbuf(req, issue_flags); cflags = io_put_kbuf(req, *ret, issue_flags);
goto finish; goto finish;
} }
cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);
if (bundle_finished || req->flags & REQ_F_BL_EMPTY) if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
goto finish; goto finish;
...@@ -599,7 +597,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ...@@ -599,7 +597,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (io_do_buffer_select(req)) { if (io_do_buffer_select(req)) {
struct buf_sel_arg arg = { struct buf_sel_arg arg = {
.iovs = &kmsg->fast_iov, .iovs = &kmsg->fast_iov,
.max_len = INT_MAX, .max_len = min_not_zero(sr->len, INT_MAX),
.nr_iovs = 1, .nr_iovs = 1,
}; };
...@@ -618,14 +616,23 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ...@@ -618,14 +616,23 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(ret < 0)) if (unlikely(ret < 0))
return ret; return ret;
sr->len = arg.out_len;
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
arg.out_len);
if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
kmsg->free_iov_nr = ret; kmsg->free_iov_nr = ret;
kmsg->free_iov = arg.iovs; kmsg->free_iov = arg.iovs;
req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_NEED_CLEANUP;
} }
sr->len = arg.out_len;
if (ret == 1) {
sr->buf = arg.iovs[0].iov_base;
ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
&kmsg->msg.msg_iter);
if (unlikely(ret))
return ret;
} else {
iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
arg.iovs, ret, arg.out_len);
}
} }
/* /*
...@@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, ...@@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= IORING_CQE_F_SOCK_NONEMPTY; cflags |= IORING_CQE_F_SOCK_NONEMPTY;
if (sr->flags & IORING_RECVSEND_BUNDLE) { if (sr->flags & IORING_RECVSEND_BUNDLE) {
cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
issue_flags); issue_flags);
/* bundle with no more immediate buffers, we're done */ /* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY) if (req->flags & REQ_F_BL_EMPTY)
goto finish; goto finish;
} else { } else {
cflags |= io_put_kbuf(req, issue_flags); cflags |= io_put_kbuf(req, *ret, issue_flags);
} }
/* /*
......
...@@ -335,6 +335,31 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, ...@@ -335,6 +335,31 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return ret; return ret;
} }
static int io_register_clock(struct io_ring_ctx *ctx,
struct io_uring_clock_register __user *arg)
{
struct io_uring_clock_register reg;
if (copy_from_user(&reg, arg, sizeof(reg)))
return -EFAULT;
if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
return -EINVAL;
switch (reg.clockid) {
case CLOCK_MONOTONIC:
ctx->clock_offset = 0;
break;
case CLOCK_BOOTTIME:
ctx->clock_offset = TK_OFFS_BOOT;
break;
default:
return -EINVAL;
}
ctx->clockid = reg.clockid;
return 0;
}
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args) void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock) __releases(ctx->uring_lock)
...@@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break; break;
ret = io_unregister_napi(ctx, arg); ret = io_unregister_napi(ctx, arg);
break; break;
case IORING_REGISTER_CLOCK:
ret = -EINVAL;
if (!arg || nr_args)
break;
ret = io_register_clock(ctx, arg);
break;
case IORING_REGISTER_COPY_BUFFERS:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_copy_buffers(ctx, arg);
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
...@@ -519,21 +556,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, ...@@ -519,21 +556,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
return ret; return ret;
} }
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, /*
void __user *, arg, unsigned int, nr_args) * Given an 'fd' value, return the ctx associated with if. If 'registered' is
* true, then the registered index is used. Otherwise, the normal fd table.
* Caller must call fput() on the returned file, unless it's an ERR_PTR.
*/
struct file *io_uring_register_get_file(int fd, bool registered)
{ {
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct file *file; struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING); if (registered) {
opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
if (use_registered_ring) {
/* /*
* Ring fd has been registered via IORING_REGISTER_RING_FDS, we * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
* need only dereference our task private array to find it. * need only dereference our task private array to find it.
...@@ -541,27 +573,44 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ...@@ -541,27 +573,44 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
struct io_uring_task *tctx = current->io_uring; struct io_uring_task *tctx = current->io_uring;
if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX)) if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
return -EINVAL; return ERR_PTR(-EINVAL);
fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
file = tctx->registered_rings[fd]; file = tctx->registered_rings[fd];
if (unlikely(!file))
return -EBADF;
} else { } else {
file = fget(fd); file = fget(fd);
if (unlikely(!file))
return -EBADF;
ret = -EOPNOTSUPP;
if (!io_is_uring_fops(file))
goto out_fput;
} }
if (unlikely(!file))
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
void __user *, arg, unsigned int, nr_args)
{
struct io_ring_ctx *ctx;
long ret = -EBADF;
struct file *file;
bool use_registered_ring;
use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
if (opcode >= IORING_REGISTER_LAST)
return -EINVAL;
file = io_uring_register_get_file(fd, use_registered_ring);
if (IS_ERR(file))
return PTR_ERR(file);
ctx = file->private_data; ctx = file->private_data;
mutex_lock(&ctx->uring_lock); mutex_lock(&ctx->uring_lock);
ret = __io_uring_register(ctx, opcode, arg, nr_args); ret = __io_uring_register(ctx, opcode, arg, nr_args);
mutex_unlock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock);
trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
out_fput:
if (!use_registered_ring) if (!use_registered_ring)
fput(file); fput(file);
return ret; return ret;
......
...@@ -4,5 +4,6 @@ ...@@ -4,5 +4,6 @@
int io_eventfd_unregister(struct io_ring_ctx *ctx); int io_eventfd_unregister(struct io_ring_ctx *ctx);
int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id); int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
struct file *io_uring_register_get_file(int fd, bool registered);
#endif #endif
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include "openclose.h" #include "openclose.h"
#include "rsrc.h" #include "rsrc.h"
#include "memmap.h" #include "memmap.h"
#include "register.h"
struct io_rsrc_update { struct io_rsrc_update {
struct file *file; struct file *file;
...@@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo ...@@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
struct io_mapped_ubuf *imu = *slot; struct io_mapped_ubuf *imu = *slot;
unsigned int i; unsigned int i;
*slot = NULL;
if (imu != &dummy_ubuf) { if (imu != &dummy_ubuf) {
if (!refcount_dec_and_test(&imu->refs))
return;
for (i = 0; i < imu->nr_bvecs; i++) for (i = 0; i < imu->nr_bvecs; i++)
unpin_user_page(imu->bvec[i].bv_page); unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages) if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages); io_unaccount_mem(ctx, imu->acct_pages);
kvfree(imu); kvfree(imu);
} }
*slot = NULL;
} }
static void io_rsrc_put_work(struct io_rsrc_node *node) static void io_rsrc_put_work(struct io_rsrc_node *node)
...@@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, ...@@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret; return ret;
} }
static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data, int nr_folios)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
/* Store head pages only*/
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
GFP_KERNEL);
if (!new_array)
return false;
new_array[0] = compound_head(page_array[0]);
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
if (data->nr_pages_head > 1)
unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
j = data->nr_pages_head;
nr_pages_left -= data->nr_pages_head;
for (i = 1; i < nr_folios; i++) {
unsigned int nr_unpin;
new_array[i] = page_array[j];
nr_unpin = min_t(unsigned int, nr_pages_left - 1,
data->nr_pages_mid - 1);
if (nr_unpin)
unpin_user_pages(&page_array[j+1], nr_unpin);
j += data->nr_pages_mid;
nr_pages_left -= data->nr_pages_mid;
}
kvfree(page_array);
*pages = new_array;
*nr_pages = nr_folios;
return true;
}
static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
struct io_imu_folio_data *data)
{
struct page **page_array = *pages;
struct folio *folio = page_folio(page_array[0]);
unsigned int count = 1, nr_folios = 1;
int i;
if (*nr_pages <= 1)
return false;
data->nr_pages_mid = folio_nr_pages(folio);
if (data->nr_pages_mid == 1)
return false;
data->folio_shift = folio_shift(folio);
/*
* Check if pages are contiguous inside a folio, and all folios have
* the same page count except for the head and tail.
*/
for (i = 1; i < *nr_pages; i++) {
if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++;
continue;
}
if (nr_folios == 1) {
if (folio_page_idx(folio, page_array[i-1]) !=
data->nr_pages_mid - 1)
return false;
data->nr_pages_head = count;
} else if (count != data->nr_pages_mid) {
return false;
}
folio = page_folio(page_array[i]);
if (folio_size(folio) != (1UL << data->folio_shift) ||
folio_page_idx(folio, page_array[i]) != 0)
return false;
count = 1;
nr_folios++;
}
if (nr_folios == 1)
data->nr_pages_head = count;
return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
struct io_mapped_ubuf **pimu, struct io_mapped_ubuf **pimu,
struct page **last_hpage) struct page **last_hpage)
...@@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
unsigned long off; unsigned long off;
size_t size; size_t size;
int ret, nr_pages, i; int ret, nr_pages, i;
struct folio *folio = NULL; struct io_imu_folio_data data;
bool coalesced;
*pimu = (struct io_mapped_ubuf *)&dummy_ubuf; *pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
if (!iov->iov_base) if (!iov->iov_base)
...@@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done; goto done;
} }
/* If it's a huge page, try to coalesce them into a single bvec entry */ /* If it's huge page(s), try to coalesce them into fewer bvec entries */
if (nr_pages > 1) { coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
folio = page_folio(pages[0]);
for (i = 1; i < nr_pages; i++) {
/*
* Pages must be consecutive and on the same folio for
* this to work
*/
if (page_folio(pages[i]) != folio ||
pages[i] != pages[i - 1] + 1) {
folio = NULL;
break;
}
}
if (folio) {
/*
* The pages are bound to the folio, it doesn't
* actually unpin them but drops all but one reference,
* which is usually put down by io_buffer_unmap().
* Note, needs a better helper.
*/
unpin_user_pages(&pages[1], nr_pages - 1);
nr_pages = 1;
}
}
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu) if (!imu)
...@@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, ...@@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
goto done; goto done;
} }
off = (unsigned long) iov->iov_base & ~PAGE_MASK;
size = iov->iov_len; size = iov->iov_len;
/* store original address for later verification */ /* store original address for later verification */
imu->ubuf = (unsigned long) iov->iov_base; imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len; imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages; imu->nr_bvecs = nr_pages;
imu->folio_shift = PAGE_SHIFT;
imu->folio_mask = PAGE_MASK;
if (coalesced) {
imu->folio_shift = data.folio_shift;
imu->folio_mask = ~((1UL << data.folio_shift) - 1);
}
refcount_set(&imu->refs, 1);
off = (unsigned long) iov->iov_base & ~imu->folio_mask;
*pimu = imu; *pimu = imu;
ret = 0; ret = 0;
if (folio) {
bvec_set_page(&imu->bvec[0], pages[0], size, off);
goto done;
}
for (i = 0; i < nr_pages; i++) { for (i = 0; i < nr_pages; i++) {
size_t vec_len; size_t vec_len;
vec_len = min_t(size_t, size, PAGE_SIZE - off); vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
bvec_set_page(&imu->bvec[i], pages[i], vec_len, off); bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
off = 0; off = 0;
size -= vec_len; size -= vec_len;
...@@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter, ...@@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* we know that: * we know that:
* *
* 1) it's a BVEC iter, we set it up * 1) it's a BVEC iter, we set it up
* 2) all bvecs are PAGE_SIZE in size, except potentially the * 2) all bvecs are the same in size, except potentially the
* first and last bvec * first and last bvec
* *
* So just find our index, and adjust the iterator afterwards. * So just find our index, and adjust the iterator afterwards.
* If the offset is within the first bvec (or the whole first * If the offset is within the first bvec (or the whole first
* bvec, just use iov_iter_advance(). This makes it easier * bvec, just use iov_iter_advance(). This makes it easier
* since we can just skip the first segment, which may not * since we can just skip the first segment, which may not
* be PAGE_SIZE aligned. * be folio_size aligned.
*/ */
const struct bio_vec *bvec = imu->bvec; const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) { if (offset < bvec->bv_len) {
/*
* Note, huge pages buffers consists of one large
* bvec entry and should always go this way. The other
* branch doesn't expect non PAGE_SIZE'd chunks.
*/
iter->bvec = bvec; iter->bvec = bvec;
iter->count -= offset; iter->count -= offset;
iter->iov_offset = offset; iter->iov_offset = offset;
...@@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter, ...@@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
/* skip first vec */ /* skip first vec */
offset -= bvec->bv_len; offset -= bvec->bv_len;
seg_skip = 1 + (offset >> PAGE_SHIFT); seg_skip = 1 + (offset >> imu->folio_shift);
iter->bvec = bvec + seg_skip; iter->bvec = bvec + seg_skip;
iter->nr_segs -= seg_skip; iter->nr_segs -= seg_skip;
iter->count -= bvec->bv_len + offset; iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ~PAGE_MASK; iter->iov_offset = offset & ~imu->folio_mask;
} }
} }
return 0; return 0;
} }
static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
{
struct io_mapped_ubuf **user_bufs;
struct io_rsrc_data *data;
int i, ret, nbufs;
/*
* Drop our own lock here. We'll setup the data we need and reference
* the source buffers, then re-grab, check, and assign at the end.
*/
mutex_unlock(&ctx->uring_lock);
mutex_lock(&src_ctx->uring_lock);
ret = -ENXIO;
nbufs = src_ctx->nr_user_bufs;
if (!nbufs)
goto out_unlock;
ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
if (ret)
goto out_unlock;
ret = -ENOMEM;
user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
if (!user_bufs)
goto out_free_data;
for (i = 0; i < nbufs; i++) {
struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
refcount_inc(&src->refs);
user_bufs[i] = src;
}
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
if (!ctx->user_bufs) {
ctx->user_bufs = user_bufs;
ctx->buf_data = data;
ctx->nr_user_bufs = nbufs;
return 0;
}
/* someone raced setting up buffers, dump ours */
for (i = 0; i < nbufs; i++)
io_buffer_unmap(ctx, &user_bufs[i]);
io_rsrc_data_free(data);
kfree(user_bufs);
return -EBUSY;
out_free_data:
io_rsrc_data_free(data);
out_unlock:
mutex_unlock(&src_ctx->uring_lock);
mutex_lock(&ctx->uring_lock);
return ret;
}
/*
* Copy the registered buffers from the source ring whose file descriptor
* is given in the src_fd to the current ring. This is identical to registering
* the buffers with ctx, except faster as mappings already exist.
*
* Since the memory is already accounted once, don't account it again.
*/
int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_copy_buffers buf;
bool registered_src;
struct file *file;
int ret;
if (ctx->user_bufs || ctx->nr_user_bufs)
return -EBUSY;
if (copy_from_user(&buf, arg, sizeof(buf)))
return -EFAULT;
if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
return -EINVAL;
if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
return -EINVAL;
registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
ret = io_copy_buffers(ctx, file->private_data);
if (!registered_src)
fput(file);
return ret;
}
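From userspace this handler is reached through io_uring_register(2). The sketch below is a hedged illustration, not the authoritative usage: the local copy_buffers_arg struct mirrors what the handler reads above (src_fd, flags, a zeroed pad area whose exact size is an assumption here), the opcode name IORING_REGISTER_COPY_BUFFERS and an nr_args of 1 are assumed to match the uapi additions in this series, and liburing is only used to set the two rings up.

#include <liburing.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

/* Local mirror of the uapi struct used above; layout is an assumption,
 * the real definition lives in <linux/io_uring.h> from this series. */
struct copy_buffers_arg {
	__u32 src_fd;
	__u32 flags;		/* e.g. IORING_REGISTER_SRC_REGISTERED */
	__u32 pad[6];		/* reserved, must be zero */
};

int main(void)
{
	struct io_uring src, dst;
	static char mem[64 * 1024];
	struct iovec iov = { .iov_base = mem, .iov_len = sizeof(mem) };
	struct copy_buffers_arg arg;

	if (io_uring_queue_init(8, &src, 0) || io_uring_queue_init(8, &dst, 0))
		return 1;
	/* register buffers on the source ring the normal way */
	if (io_uring_register_buffers(&src, &iov, 1))
		return 1;

	memset(&arg, 0, sizeof(arg));
	arg.src_fd = src.ring_fd;	/* plain ring fd, no SRC_REGISTERED flag */

	/* nr_args of 1 is an assumption about the register-opcode contract */
	if (syscall(__NR_io_uring_register, dst.ring_fd,
		    IORING_REGISTER_COPY_BUFFERS, &arg, 1) < 0) {
		perror("IORING_REGISTER_COPY_BUFFERS");
		return 1;
	}
	printf("registered buffers copied to the second ring\n");
	return 0;
}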
...@@ -22,8 +22,6 @@ struct io_rsrc_put {
};
};
-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
struct io_rsrc_data {
struct io_ring_ctx *ctx;
...@@ -46,10 +44,21 @@ struct io_mapped_ubuf {
u64 ubuf;
u64 ubuf_end;
unsigned int nr_bvecs;
+unsigned int folio_shift;
unsigned long acct_pages;
+unsigned long folio_mask;
+refcount_t refs;
struct bio_vec bvec[] __counted_by(nr_bvecs);
};
struct io_imu_folio_data {
/* Head folio can be partially included in the fixed buf */
unsigned int nr_pages_head;
/* For non-head/tail folios, has to be fully included */
unsigned int nr_pages_mid;
unsigned int folio_shift;
};
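As a rough illustration of what these fields describe (one plausible reading based on the comments above, not taken from the registration code itself): when a buffer begins partway into a large folio, nr_pages_head counts how many of that folio's pages the buffer actually covers, while every interior folio contributes a full nr_pages_mid pages. A tiny arithmetic sketch with made-up numbers:

#include <stdio.h>

/* Hypothetical worked example of the head/mid page counts, assuming
 * 4K base pages and 2M folios; numbers are illustrative only.
 */
int main(void)
{
	unsigned int page_shift = 12;			/* 4K pages */
	unsigned int folio_shift = 21;			/* 2M folios */
	unsigned long pages_per_folio = 1UL << (folio_shift - page_shift);
	unsigned long start_page_in_folio = 3;		/* buffer starts 3 pages in */

	unsigned long nr_pages_head = pages_per_folio - start_page_in_folio;
	unsigned long nr_pages_mid = pages_per_folio;	/* middle folios are whole */

	printf("head folio contributes %lu pages, each middle folio %lu\n",
	       nr_pages_head, nr_pages_mid);
	return 0;
}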
void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
...@@ -59,6 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
struct io_mapped_ubuf *imu,
u64 buf_addr, size_t len);
+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
...
...@@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req)
static bool __io_complete_rw_common(struct io_kiocb *req, long res)
{
if (unlikely(res != req->cqe.res)) {
-if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
-    io_rw_should_reissue(req)) {
+if (res == -EAGAIN && io_rw_should_reissue(req)) {
/*
* Reissue will start accounting again, finish the
* current cycle.
...@@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-req->cqe.flags |= io_put_kbuf(req, 0);
+req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);
io_req_rw_cleanup(req, 0);
io_req_task_complete(req, ts);
...@@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
*/
io_req_io_end(req);
io_req_set_res(req, final_ret,
-io_put_kbuf(req, issue_flags));
+io_put_kbuf(req, ret, issue_flags));
io_req_rw_cleanup(req, issue_flags);
return IOU_OK;
}
...@@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
ret = io_iter_do_read(rw, &io->iter);
/*
* Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
* issue, even though they should be returning -EAGAIN. To be safe,
* retry from blocking context for either.
*/
if (ret == -EOPNOTSUPP && force_nonblock)
ret = -EAGAIN;
if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
req->flags &= ~REQ_F_REISSUE;
/* If we can poll, just do that. */
...@@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
* Put our buffer and post a CQE. If we fail to post a CQE, then
* jump to the termination path. This request is then done.
*/
-cflags = io_put_kbuf(req, issue_flags);
+cflags = io_put_kbuf(req, ret, issue_flags);
rw->len = 0; /* similarly to above, reset len to 0 */
if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
...@@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
if (!smp_load_acquire(&req->iopoll_completed))
break;
nr_events++;
-req->cqe.flags = io_put_kbuf(req, 0);
+req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
if (req->opcode != IORING_OP_URING_CMD)
io_req_rw_cleanup(req, 0);
}
...
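The -EOPNOTSUPP handling added in __io_read() above boils down to a simple rule: when a nonblocking attempt reports the operation as unsupported, treat it exactly like -EAGAIN and fall back to a blocking retry. A generic sketch of that shape follows; it is plain userspace C using negative errno return values, with do_io() as a hypothetical stand-in for whatever performs the I/O, not kernel code.

#include <errno.h>

/*
 * Generic shape of the fallback: issue nonblocking first; if the backend
 * would block (-EAGAIN), or claims the nonblocking variant is unsupported
 * (-EOPNOTSUPP), retry once in blocking context.
 */
static long issue_with_fallback(long (*do_io)(void *arg, int nonblock), void *arg)
{
	long ret = do_io(arg, 1);		/* nonblocking attempt */

	if (ret == -EOPNOTSUPP)
		ret = -EAGAIN;			/* same treatment as "would block" */
	if (ret == -EAGAIN)
		ret = do_io(arg, 0);		/* blocking retry */
	return ret;
}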
...@@ -10,6 +10,7 @@
#include <linux/slab.h>
#include <linux/audit.h>
#include <linux/security.h>
+#include <linux/cpuset.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
...@@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;
-if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
const struct cred *creds = NULL;
if (ctx->sq_creds != current_cred())
...@@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
return 0;
if (p->flags & IORING_SETUP_SQ_AFF) {
+struct cpumask allowed_mask;
int cpu = p->sq_thread_cpu;
ret = -EINVAL;
-if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+cpuset_cpus_allowed(current, &allowed_mask);
+if (!cpumask_test_cpu(cpu, &allowed_mask))
goto err_sqpoll;
sqd->sq_cpu = cpu;
} else {
...
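Since the SQPOLL pin target must now fall inside the task's allowed CPU mask, an application that uses IORING_SETUP_SQ_AFF may want to pick sq_thread_cpu from its own affinity mask rather than hard-coding one. The sketch below is one way to do that from userspace, assuming liburing for ring setup and glibc's sched_getaffinity(); it is illustrative, not part of this series.

#define _GNU_SOURCE
#include <liburing.h>
#include <sched.h>
#include <stdio.h>

/* Pick the first CPU we are actually allowed to run on and use it as the
 * SQPOLL pin target, so the kernel-side cpuset check above is satisfied.
 */
int main(void)
{
	struct io_uring_params p = { 0 };
	struct io_uring ring;
	cpu_set_t set;
	int cpu = -1, ret;

	if (sched_getaffinity(0, sizeof(set), &set))
		return 1;
	for (int i = 0; i < CPU_SETSIZE; i++) {
		if (CPU_ISSET(i, &set)) {
			cpu = i;
			break;
		}
	}
	if (cpu < 0)
		return 1;

	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = cpu;
	p.sq_thread_idle = 1000;	/* ms before the poll thread sleeps */

	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %d\n", ret);
		return 1;
	}
	printf("SQPOLL thread pinned to CPU %d\n", cpu);
	io_uring_queue_exit(&ring);
	return 0;
}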