Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe: - NAPI fixes and cleanups (Pavel, Olivier) - Add support for absolute timeouts (Pavel) - Fixes for io-wq/sqpoll affinities (Felix) - Efficiency improvements for dealing with huge pages (Chenliang) - Support for a minwait mode, where the application essentially has two timouts - one smaller one that defines the batch timeout, and the overall large one similar to what we had before. This enables efficient use of batching based on count + timeout, while still working well with periods of less intensive workloads - Use ITER_UBUF for single segment sends - Add support for incremental buffer consumption. Right now each operation will always consume a full buffer. With incremental consumption, a recv/read operation only consumes the part of the buffer that it needs to satisfy the operation - Add support for GCOV for io_uring, to help retain a high coverage of test to code ratio - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly converted to a blocking retry - Add support for cloning registered buffers from one ring to another - Misc cleanups (Anuj, me) * tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits) io_uring: add IORING_REGISTER_COPY_BUFFERS method io_uring/register: provide helper to get io_ring_ctx from 'fd' io_uring/rsrc: add reference count to struct io_mapped_ubuf io_uring/rsrc: clear 'slot' entry upfront io_uring/io-wq: inherit cpuset of cgroup in io worker io_uring/io-wq: do not allow pinning outside of cpuset io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common() io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN io_uring/sqpoll: do not allow pinning outside of cpuset io_uring/eventfd: move refs to refcount_t io_uring: remove unused rsrc_put_fn io_uring: add new line after variable declaration io_uring: add GCOV_PROFILE_URING Kconfig option io_uring/kbuf: add support for incremental buffer consumption io_uring/kbuf: pass in 'len' argument for buffer commit Revert "io_uring: Require zeroed sqe->len on provided-buffers send" io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h io_uring/kbuf: add io_kbuf_commit() helper io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg io_uring: wire up min batch wake timeout ...

Merge tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - NAPI fixes and cleanups (Pavel, Olivier) - Add support for absolute timeouts (Pavel) - Fixes for io-wq/sqpoll affinities (Felix) - Efficiency improvements for dealing with huge pages (Chenliang) - Support for a minwait mode, where the application essentially has two timouts - one smaller one that defines the batch timeout, and the overall large one similar to what we had before. This enables efficient use of batching based on count + timeout, while still working well with periods of less intensive workloads - Use ITER_UBUF for single segment sends - Add support for incremental buffer consumption. Right now each operation will always consume a full buffer. With incremental consumption, a recv/read operation only consumes the part of the buffer that it needs to satisfy the operation - Add support for GCOV for io_uring, to help retain a high coverage of test to code ratio - Fix regression with ocfs2, where an odd -EOPNOTSUPP wasn't correctly converted to a blocking retry - Add support for cloning registered buffers from one ring to another - Misc cleanups (Anuj, me) * tag 'for-6.12/io_uring-20240913' of git://git.kernel.dk/linux: (35 commits) io_uring: add IORING_REGISTER_COPY_BUFFERS method io_uring/register: provide helper to get io_ring_ctx from 'fd' io_uring/rsrc: add reference count to struct io_mapped_ubuf io_uring/rsrc: clear 'slot' entry upfront io_uring/io-wq: inherit cpuset of cgroup in io worker io_uring/io-wq: do not allow pinning outside of cpuset io_uring/rw: drop -EOPNOTSUPP check in __io_complete_rw_common() io_uring/rw: treat -EOPNOTSUPP for IOCB_NOWAIT like -EAGAIN io_uring/sqpoll: do not allow pinning outside of cpuset io_uring/eventfd: move refs to refcount_t io_uring: remove unused rsrc_put_fn io_uring: add new line after variable declaration io_uring: add GCOV_PROFILE_URING Kconfig option io_uring/kbuf: add support for incremental buffer consumption io_uring/kbuf: pass in 'len' argument for buffer commit Revert "io_uring: Require zeroed sqe->len on provided-buffers send" io_uring/kbuf: move io_ring_head_to_buf() to kbuf.h io_uring/kbuf: add io_kbuf_commit() helper io_uring/kbuf: shrink nr_iovs/mode in struct buf_sel_arg io_uring: wire up min batch wake timeout ...
3a4d319a · Linus Torvalds · 69a3a0a4 · 7cc2a6ea · 3a4d319a · 3a4d319a
Commit 3a4d319a authored Sep 16, 2024 by Linus Torvalds
20 changed files
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -239,6 +239,9 @@ struct io_ring_ctx {
 		struct io_rings		*rings;
 		struct percpu_ref	refs;

+		clockid_t		clockid;
+		enum tk_offsets		clock_offset;
+
 		enum task_work_notify_mode	notify_method;
 		unsigned			sq_thread_idle;
 	} ____cacheline_aligned_in_smp;

--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -440,11 +440,21 @@ struct io_uring_cqe {
 * IORING_CQE_F_SOCK_NONEMPTY	If set, more data to read after socket recv
 * IORING_CQE_F_NOTIF	Set for notification CQEs. Can be used to distinct
 * 			them from sends.
+ * IORING_CQE_F_BUF_MORE If set, the buffer ID set in the completion will get
+ *			more completions. In other words, the buffer is being
+ *			partially consumed, and will be used by the kernel for
+ *			more completions. This is only set for buffers used via
+ *			the incremental buffer consumption, as provided by
+ *			a ring buffer setup with IOU_PBUF_RING_INC. For any
+ *			other provided buffer type, all completions with a
+ *			buffer passed back is automatically returned to the
+ *			application.
 */
 #define IORING_CQE_F_BUFFER		(1U << 0)
 #define IORING_CQE_F_MORE		(1U << 1)
 #define IORING_CQE_F_SOCK_NONEMPTY	(1U << 2)
 #define IORING_CQE_F_NOTIF		(1U << 3)
+#define IORING_CQE_F_BUF_MORE		(1U << 4)

 #define IORING_CQE_BUFFER_SHIFT		16

@@ -507,6 +517,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_SQ_WAIT		(1U << 2)
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
+#define IORING_ENTER_ABS_TIMER		(1U << 5)

 /*
 * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -542,6 +553,7 @@ struct io_uring_params {
 #define IORING_FEAT_LINKED_FILE		(1U << 12)
 #define IORING_FEAT_REG_REG_RING	(1U << 13)
 #define IORING_FEAT_RECVSEND_BUNDLE	(1U << 14)
+#define IORING_FEAT_MIN_TIMEOUT		(1U << 15)

 /*
 * io_uring_register(2) opcodes and arguments
@@ -595,6 +607,11 @@ enum io_uring_register_op {
 	IORING_REGISTER_NAPI			= 27,
 	IORING_UNREGISTER_NAPI			= 28,

+	IORING_REGISTER_CLOCK			= 29,
+
+	/* copy registered buffers from source ring to current ring */
+	IORING_REGISTER_COPY_BUFFERS		= 30,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,

@@ -675,6 +692,21 @@ struct io_uring_restriction {
 	__u32 resv2[3];
 };

+struct io_uring_clock_register {
+	__u32	clockid;
+	__u32	__resv[3];
+};
+
+enum {
+	IORING_REGISTER_SRC_REGISTERED = 1,
+};
+
+struct io_uring_copy_buffers {
+	__u32	src_fd;
+	__u32	flags;
+	__u32	pad[6];
+};
+
 struct io_uring_buf {
 	__u64	addr;
 	__u32	len;
@@ -707,9 +739,17 @@ struct io_uring_buf_ring {
 *			mmap(2) with the offset set as:
 *			IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
 *			to get a virtual mapping for the ring.
+ * IOU_PBUF_RING_INC:	If set, buffers consumed from this buffer ring can be
+ *			consumed incrementally. Normally one (or more) buffers
+ *			are fully consumed. With incremental consumptions, it's
+ *			feasible to register big ranges of buffers, and each
+ *			use of it will consume only as much as it needs. This
+ *			requires that both the kernel and application keep
+ *			track of where the current read/recv index is at.
 */
 enum io_uring_register_pbuf_ring_flags {
 	IOU_PBUF_RING_MMAP	= 1,
+	IOU_PBUF_RING_INC	= 2,
 };

 /* argument for IORING_(UN)REGISTER_PBUF_RING */
@@ -758,7 +798,7 @@ enum io_uring_register_restriction_op {
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
-	__u32	pad;
+	__u32	min_wait_usec;
 	__u64	ts;
 };


--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1687,6 +1687,19 @@ config IO_URING
 	  applications to submit and complete IO through submission and
 	  completion rings that are shared between the kernel and application.

+config GCOV_PROFILE_URING
+	bool "Enable GCOV profiling on the io_uring subsystem"
+	depends on GCOV_KERNEL
+	help
+	  Enable GCOV profiling on the io_uring subsystem, to facilitate
+	  code coverage testing.
+
+	  If unsure, say N.
+
+	  Note that this will have a negative impact on the performance of
+	  the io_uring subsystem, hence this should only be enabled for
+	  specific test purposes.
+
 config ADVISE_SYSCALLS
 	bool "Enable madvise/fadvise syscalls" if EXPERT
 	default y

--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -2,6 +2,10 @@
 #
 # Makefile for io_uring

+ifdef CONFIG_GCOV_PROFILE_URING
+GCOV_PROFILE := y
+endif
+
 obj-$(CONFIG_IO_URING)		+= io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					tctx.o filetable.o rw.o net.o poll.o \
 					eventfd.o uring_cmd.o openclose.o \

--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -15,7 +15,7 @@ struct io_ev_fd {
 	struct eventfd_ctx	*cq_ev_fd;
 	unsigned int		eventfd_async: 1;
 	struct rcu_head		rcu;
-	atomic_t		refs;
+	refcount_t		refs;
 	atomic_t		ops;
 };

@@ -37,7 +37,7 @@ static void io_eventfd_do_signal(struct rcu_head *rcu)

 	eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);

-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		io_eventfd_free(rcu);
 }

@@ -63,7 +63,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 	 */
 	if (unlikely(!ev_fd))
 		return;
-	if (!atomic_inc_not_zero(&ev_fd->refs))
+	if (!refcount_inc_not_zero(&ev_fd->refs))
 		return;
 	if (ev_fd->eventfd_async && !io_wq_current_is_worker())
 		goto out;
@@ -77,7 +77,7 @@ void io_eventfd_signal(struct io_ring_ctx *ctx)
 		}
 	}
 out:
-	if (atomic_dec_and_test(&ev_fd->refs))
+	if (refcount_dec_and_test(&ev_fd->refs))
 		call_rcu(&ev_fd->rcu, io_eventfd_free);
 }

@@ -126,6 +126,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,
 	ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd);
 	if (IS_ERR(ev_fd->cq_ev_fd)) {
 		int ret = PTR_ERR(ev_fd->cq_ev_fd);
+
 		kfree(ev_fd);
 		return ret;
 	}
@@ -136,7 +137,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg,

 	ev_fd->eventfd_async = eventfd_async;
 	ctx->has_evfd = true;
-	atomic_set(&ev_fd->refs, 1);
+	refcount_set(&ev_fd->refs, 1);
 	atomic_set(&ev_fd->ops, 0);
 	rcu_assign_pointer(ctx->io_ev_fd, ev_fd);
 	return 0;
@@ -151,7 +152,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx)
 	if (ev_fd) {
 		ctx->has_evfd = false;
 		rcu_assign_pointer(ctx->io_ev_fd, NULL);
-		if (atomic_dec_and_test(&ev_fd->refs))
+		if (refcount_dec_and_test(&ev_fd->refs))
 			call_rcu(&ev_fd->rcu, io_eventfd_free);
 		return 0;
 	}

--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -221,7 +221,19 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 			   cqe->user_data, cqe->res, cqe->flags);

 	}
-
 	spin_unlock(&ctx->completion_lock);
+
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	if (ctx->napi_enabled) {
+		seq_puts(m, "NAPI:\tenabled\n");
+		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+		if (ctx->napi_prefer_busy_poll)
+			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+		else
+			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+	} else {
+		seq_puts(m, "NAPI:\tdisabled\n");
+	}
+#endif
 }
 #endif
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/rculist_nulls.h>
 #include <linux/cpu.h>
+#include <linux/cpuset.h>
 #include <linux/task_work.h>
 #include <linux/audit.h>
 #include <linux/mmu_context.h>
@@ -1167,7 +1168,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)

 	if (!alloc_cpumask_var(&wq->cpu_mask, GFP_KERNEL))
 		goto err;
-	cpumask_copy(wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(data->task, wq->cpu_mask);
 	wq->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
 	wq->acct[IO_WQ_ACCT_UNBOUND].max_workers =
 				task_rlimit(current, RLIMIT_NPROC);
@@ -1322,17 +1323,29 @@ static int io_wq_cpu_offline(unsigned int cpu, struct hlist_node *node)

 int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask)
 {
+	cpumask_var_t allowed_mask;
+	int ret = 0;
+
 	if (!tctx || !tctx->io_wq)
 		return -EINVAL;

+	if (!alloc_cpumask_var(&allowed_mask, GFP_KERNEL))
+		return -ENOMEM;
+
 	rcu_read_lock();
-	if (mask)
-		cpumask_copy(tctx->io_wq->cpu_mask, mask);
-	else
-		cpumask_copy(tctx->io_wq->cpu_mask, cpu_possible_mask);
+	cpuset_cpus_allowed(tctx->io_wq->task, allowed_mask);
+	if (mask) {
+		if (cpumask_subset(mask, allowed_mask))
+			cpumask_copy(tctx->io_wq->cpu_mask, mask);
+		else
+			ret = -EINVAL;
+	} else {
+		cpumask_copy(tctx->io_wq->cpu_mask, allowed_mask);
+	}
 	rcu_read_unlock();

-	return 0;
+	free_cpumask_var(allowed_mask);
+	return ret;
 }

 /*

--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -39,8 +39,12 @@ struct io_wait_queue {
 	struct wait_queue_entry wq;
 	struct io_ring_ctx *ctx;
 	unsigned cq_tail;
+	unsigned cq_min_tail;
 	unsigned nr_timeouts;
+	int hit_timeout;
+	ktime_t min_timeout;
 	ktime_t timeout;
+	struct hrtimer t;

 #ifdef CONFIG_NET_RX_BUSY_POLL
 	ktime_t napi_busy_poll_dt;
@@ -437,6 +441,14 @@ static inline bool io_file_can_poll(struct io_kiocb *req)
 	return false;
 }

+static inline ktime_t io_get_time(struct io_ring_ctx *ctx)
+{
+	if (ctx->clockid == CLOCK_MONOTONIC)
+		return ktime_get();
+
+	return ktime_get_with_offset(ctx->clock_offset);
+}
+
 enum {
 	IO_CHECK_CQ_OVERFLOW_BIT,
 	IO_CHECK_CQ_DROPPED_BIT,

--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -70,7 +70,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
 	return true;
 }

-void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
+void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags)
 {
 	/*
 	 * We can add this buffer back to two lists:
@@ -88,12 +88,12 @@ void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
 		struct io_ring_ctx *ctx = req->ctx;

 		spin_lock(&ctx->completion_lock);
-		__io_put_kbuf_list(req, &ctx->io_buffers_comp);
+		__io_put_kbuf_list(req, len, &ctx->io_buffers_comp);
 		spin_unlock(&ctx->completion_lock);
 	} else {
 		lockdep_assert_held(&req->ctx->uring_lock);

-		__io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
+		__io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache);
 	}
 }

@@ -132,12 +132,6 @@ static int io_provided_buffers_select(struct io_kiocb *req, size_t *len,
 	return 1;
 }

-static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br,
-						__u16 head, __u16 mask)
-{
-	return &br->bufs[head & mask];
-}
-
 static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 					  struct io_buffer_list *bl,
 					  unsigned int issue_flags)
@@ -171,9 +165,8 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
 		 * the transfer completes (or if we get -EAGAIN and must poll of
 		 * retry).
 		 */
-		req->flags &= ~REQ_F_BUFFERS_COMMIT;
+		io_kbuf_commit(req, bl, *len, 1);
 		req->buf_list = NULL;
-		bl->head++;
 	}
 	return u64_to_user_ptr(buf->addr);
 }
@@ -189,7 +182,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len,

 	bl = io_buffer_get_list(ctx, req->buf_index);
 	if (likely(bl)) {
-		if (bl->is_buf_ring)
+		if (bl->flags & IOBL_BUF_RING)
 			ret = io_ring_buffer_select(req, len, bl, issue_flags);
 		else
 			ret = io_provided_buffer_select(req, len, bl);
@@ -219,14 +212,25 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 	buf = io_ring_head_to_buf(br, head, bl->mask);
 	if (arg->max_len) {
 		u32 len = READ_ONCE(buf->len);
-		size_t needed;

 		if (unlikely(!len))
 			return -ENOBUFS;
-		needed = (arg->max_len + len - 1) / len;
-		needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
-		if (nr_avail > needed)
-			nr_avail = needed;
+		/*
+		 * Limit incremental buffers to 1 segment. No point trying
+		 * to peek ahead and map more than we need, when the buffers
+		 * themselves should be large when setup with
+		 * IOU_PBUF_RING_INC.
+		 */
+		if (bl->flags & IOBL_INC) {
+			nr_avail = 1;
+		} else {
+			size_t needed;
+
+			needed = (arg->max_len + len - 1) / len;
+			needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT);
+			if (nr_avail > needed)
+				nr_avail = needed;
+		}
 	}

 	/*
@@ -251,16 +255,21 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,

 	req->buf_index = buf->bid;
 	do {
-		/* truncate end piece, if needed */
-		if (buf->len > arg->max_len)
-			buf->len = arg->max_len;
+		u32 len = buf->len;
+
+		/* truncate end piece, if needed, for non partial buffers */
+		if (len > arg->max_len) {
+			len = arg->max_len;
+			if (!(bl->flags & IOBL_INC))
+				buf->len = len;
+		}

 		iov->iov_base = u64_to_user_ptr(buf->addr);
-		iov->iov_len = buf->len;
+		iov->iov_len = len;
 		iov++;

-		arg->out_len += buf->len;
-		arg->max_len -= buf->len;
+		arg->out_len += len;
+		arg->max_len -= len;
 		if (!arg->max_len)
 			break;

@@ -287,7 +296,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 	if (unlikely(!bl))
 		goto out_unlock;

-	if (bl->is_buf_ring) {
+	if (bl->flags & IOBL_BUF_RING) {
 		ret = io_ring_buffers_peek(req, arg, bl);
 		/*
 		 * Don't recycle these buffers if we need to go through poll.
@@ -297,8 +306,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
 		 * committed them, they cannot be put back in the queue.
 		 */
 		if (ret > 0) {
-			req->flags |= REQ_F_BL_NO_RECYCLE;
-			req->buf_list->head += ret;
+			req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
+			io_kbuf_commit(req, bl, arg->out_len, ret);
 		}
 	} else {
 		ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs);
@@ -320,7 +329,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg)
 	if (unlikely(!bl))
 		return -ENOENT;

-	if (bl->is_buf_ring) {
+	if (bl->flags & IOBL_BUF_RING) {
 		ret = io_ring_buffers_peek(req, arg, bl);
 		if (ret > 0)
 			req->flags |= REQ_F_BUFFERS_COMMIT;
@@ -340,22 +349,22 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
 	if (!nbufs)
 		return 0;

-	if (bl->is_buf_ring) {
+	if (bl->flags & IOBL_BUF_RING) {
 		i = bl->buf_ring->tail - bl->head;
 		if (bl->buf_nr_pages) {
 			int j;

-			if (!bl->is_mmap) {
+			if (!(bl->flags & IOBL_MMAP)) {
 				for (j = 0; j < bl->buf_nr_pages; j++)
 					unpin_user_page(bl->buf_pages[j]);
 			}
 			io_pages_unmap(bl->buf_ring, &bl->buf_pages,
-					&bl->buf_nr_pages, bl->is_mmap);
-			bl->is_mmap = 0;
+					&bl->buf_nr_pages, bl->flags & IOBL_MMAP);
+			bl->flags &= ~IOBL_MMAP;
 		}
 		/* make sure it's seen as empty */
 		INIT_LIST_HEAD(&bl->buf_list);
-		bl->is_buf_ring = 0;
+		bl->flags &= ~IOBL_BUF_RING;
 		return i;
 	}

@@ -442,7 +451,7 @@ int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags)
 	if (bl) {
 		ret = -EINVAL;
 		/* can't use provide/remove buffers command on mapped buffers */
-		if (!bl->is_buf_ring)
+		if (!(bl->flags & IOBL_BUF_RING))
 			ret = __io_remove_buffers(ctx, bl, p->nbufs);
 	}
 	io_ring_submit_unlock(ctx, issue_flags);
@@ -589,7 +598,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
 		}
 	}
 	/* can't add buffers via this command for a mapped buffer ring */
-	if (bl->is_buf_ring) {
+	if (bl->flags & IOBL_BUF_RING) {
 		ret = -EINVAL;
 		goto err;
 	}
@@ -641,8 +650,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
 	bl->buf_pages = pages;
 	bl->buf_nr_pages = nr_pages;
 	bl->buf_ring = br;
-	bl->is_buf_ring = 1;
-	bl->is_mmap = 0;
+	bl->flags |= IOBL_BUF_RING;
+	bl->flags &= ~IOBL_MMAP;
 	return 0;
 error_unpin:
 	unpin_user_pages(pages, nr_pages);
@@ -665,8 +674,7 @@ static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
 		return -ENOMEM;
 	}

-	bl->is_buf_ring = 1;
-	bl->is_mmap = 1;
+	bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
 	return 0;
 }

@@ -683,7 +691,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)

 	if (reg.resv[0] || reg.resv[1] || reg.resv[2])
 		return -EINVAL;
-	if (reg.flags & ~IOU_PBUF_RING_MMAP)
+	if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
 		return -EINVAL;
 	if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
 		if (!reg.ring_addr)
@@ -705,7 +713,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (bl) {
 		/* if mapped buffer ring OR classic exists, don't allow */
-		if (bl->is_buf_ring || !list_empty(&bl->buf_list))
+		if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
 			return -EEXIST;
 	} else {
 		free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
@@ -721,6 +729,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	if (!ret) {
 		bl->nr_entries = reg.ring_entries;
 		bl->mask = reg.ring_entries - 1;
+		if (reg.flags & IOU_PBUF_RING_INC)
+			bl->flags |= IOBL_INC;

 		io_buffer_add_list(ctx, bl, reg.bgid);
 		return 0;
@@ -747,7 +757,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, reg.bgid);
 	if (!bl)
 		return -ENOENT;
-	if (!bl->is_buf_ring)
+	if (!(bl->flags & IOBL_BUF_RING))
 		return -EINVAL;

 	xa_erase(&ctx->io_bl_xa, bl->bgid);
@@ -771,7 +781,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
 	bl = io_buffer_get_list(ctx, buf_status.buf_group);
 	if (!bl)
 		return -ENOENT;
-	if (!bl->is_buf_ring)
+	if (!(bl->flags & IOBL_BUF_RING))
 		return -EINVAL;

 	buf_status.head = bl->head;
@@ -802,7 +812,7 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
 	bl = xa_load(&ctx->io_bl_xa, bgid);
 	/* must be a mmap'able buffer ring and have pages */
 	ret = false;
-	if (bl && bl->is_mmap)
+	if (bl && bl->flags & IOBL_MMAP)
 		ret = atomic_inc_not_zero(&bl->refs);
 	rcu_read_unlock();


--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -4,6 +4,16 @@

 #include <uapi/linux/io_uring.h>

+enum {
+	/* ring mapped provided buffers */
+	IOBL_BUF_RING	= 1,
+	/* ring mapped provided buffers, but mmap'ed by application */
+	IOBL_MMAP	= 2,
+	/* buffers are consumed incrementally rather than always fully */
+	IOBL_INC	= 4,
+
+};
+
 struct io_buffer_list {
 	/*
 	 * If ->buf_nr_pages is set, then buf_pages/buf_ring are used. If not,
@@ -25,12 +35,9 @@ struct io_buffer_list {
 	__u16 head;
 	__u16 mask;

-	atomic_t refs;
+	__u16 flags;

-	/* ring mapped provided buffers */
-	__u8 is_buf_ring;
-	/* ring mapped provided buffers, but mmap'ed by application */
-	__u8 is_mmap;
+	atomic_t refs;
 };

 struct io_buffer {
@@ -52,8 +59,8 @@ struct buf_sel_arg {
 	struct iovec *iovs;
 	size_t out_len;
 	size_t max_len;
-	int nr_iovs;
-	int mode;
+	unsigned short nr_iovs;
+	unsigned short mode;
 };

 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
@@ -73,7 +80,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg);
 int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);

-void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
+void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags);

 bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);

@@ -117,25 +124,55 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
 	return false;
 }

-static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr)
+/* Mapped buffer ring, return io_uring_buf from head */
+#define io_ring_head_to_buf(br, head, mask)	&(br)->bufs[(head) & (mask)]
+
+static inline bool io_kbuf_commit(struct io_kiocb *req,
+				  struct io_buffer_list *bl, int len, int nr)
+{
+	if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT)))
+		return true;
+
+	req->flags &= ~REQ_F_BUFFERS_COMMIT;
+
+	if (unlikely(len < 0))
+		return true;
+
+	if (bl->flags & IOBL_INC) {
+		struct io_uring_buf *buf;
+
+		buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask);
+		if (WARN_ON_ONCE(len > buf->len))
+			len = buf->len;
+		buf->len -= len;
+		if (buf->len) {
+			buf->addr += len;
+			return false;
+		}
+	}
+
+	bl->head += nr;
+	return true;
+}
+
+static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr)
 {
 	struct io_buffer_list *bl = req->buf_list;
+	bool ret = true;

 	if (bl) {
-		if (req->flags & REQ_F_BUFFERS_COMMIT) {
-			bl->head += nr;
-			req->flags &= ~REQ_F_BUFFERS_COMMIT;
-		}
+		ret = io_kbuf_commit(req, bl, len, nr);
 		req->buf_index = bl->bgid;
 	}
 	req->flags &= ~REQ_F_BUFFER_RING;
+	return ret;
 }

-static inline void __io_put_kbuf_list(struct io_kiocb *req,
+static inline void __io_put_kbuf_list(struct io_kiocb *req, int len,
 				      struct list_head *list)
 {
 	if (req->flags & REQ_F_BUFFER_RING) {
-		__io_put_kbuf_ring(req, 1);
+		__io_put_kbuf_ring(req, len, 1);
 	} else {
 		req->buf_index = req->kbuf->bgid;
 		list_add(&req->kbuf->list, list);
@@ -150,11 +187,12 @@ static inline void io_kbuf_drop(struct io_kiocb *req)
 	if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
 		return;

-	__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
+	/* len == 0 is fine here, non-ring will always drop all of it */
+	__io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp);
 }

-static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
-					  unsigned issue_flags)
+static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len,
+					  int nbufs, unsigned issue_flags)
 {
 	unsigned int ret;

@@ -162,22 +200,24 @@ static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs,
 		return 0;

 	ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
-	if (req->flags & REQ_F_BUFFER_RING)
-		__io_put_kbuf_ring(req, nbufs);
-	else
-		__io_put_kbuf(req, issue_flags);
+	if (req->flags & REQ_F_BUFFER_RING) {
+		if (!__io_put_kbuf_ring(req, len, nbufs))
+			ret |= IORING_CQE_F_BUF_MORE;
+	} else {
+		__io_put_kbuf(req, len, issue_flags);
+	}
 	return ret;
 }

-static inline unsigned int io_put_kbuf(struct io_kiocb *req,
+static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len,
 				       unsigned issue_flags)
 {
-	return __io_put_kbufs(req, 1, issue_flags);
+	return __io_put_kbufs(req, len, 1, issue_flags);
 }

-static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs,
-					unsigned issue_flags)
+static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len,
+					int nbufs, unsigned issue_flags)
 {
-	return __io_put_kbufs(req, nbufs, issue_flags);
+	return __io_put_kbufs(req, len, nbufs, issue_flags);
 }
 #endif
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -269,27 +269,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 	return 0;
 }

-/*
- * __io_napi_adjust_timeout() - adjust busy loop timeout
- * @ctx: pointer to io-uring context structure
- * @iowq: pointer to io wait queue
- * @ts: pointer to timespec or NULL
- *
- * Adjust the busy loop timeout according to timespec and busy poll timeout.
- * If the specified NAPI timeout is bigger than the wait timeout, then adjust
- * the NAPI timeout accordingly.
- */
-void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
-			      ktime_t to_wait)
-{
-	ktime_t poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
-
-	if (to_wait)
-		poll_dt = min(poll_dt, to_wait);
-
-	iowq->napi_busy_poll_dt = poll_dt;
-}
-
 /*
 * __io_napi_busy_loop() - execute busy poll loop
 * @ctx: pointer to io-uring context structure
@@ -299,10 +278,18 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow
 */
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
 {
-	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+	if (ctx->flags & IORING_SETUP_SQPOLL)
+		return;

-	if (!(ctx->flags & IORING_SETUP_SQPOLL))
-		io_napi_blocking_busy_loop(ctx, iowq);
+	iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt);
+	if (iowq->timeout != KTIME_MAX) {
+		ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx));
+
+		iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt);
+	}
+
+	iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
+	io_napi_blocking_busy_loop(ctx, iowq);
 }

 /*

--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -17,8 +17,6 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);

 void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);

-void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
-		struct io_wait_queue *iowq, ktime_t to_wait);
 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);

@@ -27,15 +25,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
 	return !list_empty(&ctx->napi_list);
 }

-static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq,
-					  ktime_t to_wait)
-{
-	if (!io_napi(ctx))
-		return;
-	__io_napi_adjust_timeout(ctx, iowq, to_wait);
-}
-
 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {
@@ -86,11 +75,6 @@ static inline bool io_napi(struct io_ring_ctx *ctx)
 static inline void io_napi_add(struct io_kiocb *req)
 {
 }
-static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
-					  struct io_wait_queue *iowq,
-					  ktime_t to_wait)
-{
-}
 static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
 				     struct io_wait_queue *iowq)
 {

--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -434,8 +434,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		sr->buf_group = req->buf_index;
 		req->buf_list = NULL;
 	}
-	if (req->flags & REQ_F_BUFFER_SELECT && sr->len)
-		return -EINVAL;

 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
@@ -499,11 +497,11 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret,
 	unsigned int cflags;

 	if (!(sr->flags & IORING_RECVSEND_BUNDLE)) {
-		cflags = io_put_kbuf(req, issue_flags);
+		cflags = io_put_kbuf(req, *ret, issue_flags);
 		goto finish;
 	}

-	cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags);
+	cflags = io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), issue_flags);

 	if (bundle_finished || req->flags & REQ_F_BL_EMPTY)
 		goto finish;
@@ -599,7 +597,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 	if (io_do_buffer_select(req)) {
 		struct buf_sel_arg arg = {
 			.iovs = &kmsg->fast_iov,
-			.max_len = INT_MAX,
+			.max_len = min_not_zero(sr->len, INT_MAX),
 			.nr_iovs = 1,
 		};

@@ -618,14 +616,23 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
 		if (unlikely(ret < 0))
 			return ret;

-		sr->len = arg.out_len;
-		iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret,
-				arg.out_len);
 		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
 			kmsg->free_iov_nr = ret;
 			kmsg->free_iov = arg.iovs;
 			req->flags |= REQ_F_NEED_CLEANUP;
 		}
+		sr->len = arg.out_len;
+
+		if (ret == 1) {
+			sr->buf = arg.iovs[0].iov_base;
+			ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len,
+						&kmsg->msg.msg_iter);
+			if (unlikely(ret))
+				return ret;
+		} else {
+			iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE,
+					arg.iovs, ret, arg.out_len);
+		}
 	}

 	/*
@@ -835,13 +842,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;

 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
-		cflags |= io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret),
+		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
 				      issue_flags);
 		/* bundle with no more immediate buffers, we're done */
 		if (req->flags & REQ_F_BL_EMPTY)
 			goto finish;
 	} else {
-		cflags |= io_put_kbuf(req, issue_flags);
+		cflags |= io_put_kbuf(req, *ret, issue_flags);
 	}

 	/*

--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -335,6 +335,31 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }

+static int io_register_clock(struct io_ring_ctx *ctx,
+			     struct io_uring_clock_register __user *arg)
+{
+	struct io_uring_clock_register reg;
+
+	if (copy_from_user(&reg, arg, sizeof(reg)))
+		return -EFAULT;
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+
+	switch (reg.clockid) {
+	case CLOCK_MONOTONIC:
+		ctx->clock_offset = 0;
+		break;
+	case CLOCK_BOOTTIME:
+		ctx->clock_offset = TK_OFFS_BOOT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	ctx->clockid = reg.clockid;
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -511,6 +536,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_unregister_napi(ctx, arg);
 		break;
+	case IORING_REGISTER_CLOCK:
+		ret = -EINVAL;
+		if (!arg || nr_args)
+			break;
+		ret = io_register_clock(ctx, arg);
+		break;
+	case IORING_REGISTER_COPY_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_copy_buffers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -519,21 +556,16 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 	return ret;
 }

-SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
-		void __user *, arg, unsigned int, nr_args)
+/*
+ * Given an 'fd' value, return the ctx associated with if. If 'registered' is
+ * true, then the registered index is used. Otherwise, the normal fd table.
+ * Caller must call fput() on the returned file, unless it's an ERR_PTR.
+ */
+struct file *io_uring_register_get_file(int fd, bool registered)
 {
-	struct io_ring_ctx *ctx;
-	long ret = -EBADF;
 	struct file *file;
-	bool use_registered_ring;

-	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
-	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
-
-	if (opcode >= IORING_REGISTER_LAST)
-		return -EINVAL;
-
-	if (use_registered_ring) {
+	if (registered) {
 		/*
 		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
 		 * need only dereference our task private array to find it.
@@ -541,27 +573,44 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 		struct io_uring_task *tctx = current->io_uring;

 		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
-			return -EINVAL;
+			return ERR_PTR(-EINVAL);
 		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
 		file = tctx->registered_rings[fd];
-		if (unlikely(!file))
-			return -EBADF;
 	} else {
 		file = fget(fd);
-		if (unlikely(!file))
-			return -EBADF;
-		ret = -EOPNOTSUPP;
-		if (!io_is_uring_fops(file))
-			goto out_fput;
 	}

+	if (unlikely(!file))
+		return ERR_PTR(-EBADF);
+	if (io_is_uring_fops(file))
+		return file;
+	fput(file);
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
+		void __user *, arg, unsigned int, nr_args)
+{
+	struct io_ring_ctx *ctx;
+	long ret = -EBADF;
+	struct file *file;
+	bool use_registered_ring;
+
+	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
+	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;
+
+	if (opcode >= IORING_REGISTER_LAST)
+		return -EINVAL;
+
+	file = io_uring_register_get_file(fd, use_registered_ring);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
 	ctx = file->private_data;

 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
 	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
-out_fput:
 	if (!use_registered_ring)
 		fput(file);
 	return ret;

--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -4,5 +4,6 @@

 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
+struct file *io_uring_register_get_file(int fd, bool registered);

 #endif
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -17,6 +17,7 @@
 #include "openclose.h"
 #include "rsrc.h"
 #include "memmap.h"
+#include "register.h"

 struct io_rsrc_update {
 	struct file			*file;
@@ -114,14 +115,16 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;

+	*slot = NULL;
 	if (imu != &dummy_ubuf) {
+		if (!refcount_dec_and_test(&imu->refs))
+			return;
 		for (i = 0; i < imu->nr_bvecs; i++)
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
 		kvfree(imu);
 	}
-	*slot = NULL;
 }

 static void io_rsrc_put_work(struct io_rsrc_node *node)
@@ -855,6 +858,98 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
 	return ret;
 }

+static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
+				struct io_imu_folio_data *data, int nr_folios)
+{
+	struct page **page_array = *pages, **new_array = NULL;
+	int nr_pages_left = *nr_pages, i, j;
+
+	/* Store head pages only*/
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
+					GFP_KERNEL);
+	if (!new_array)
+		return false;
+
+	new_array[0] = compound_head(page_array[0]);
+	/*
+	 * The pages are bound to the folio, it doesn't
+	 * actually unpin them but drops all but one reference,
+	 * which is usually put down by io_buffer_unmap().
+	 * Note, needs a better helper.
+	 */
+	if (data->nr_pages_head > 1)
+		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
+
+	j = data->nr_pages_head;
+	nr_pages_left -= data->nr_pages_head;
+	for (i = 1; i < nr_folios; i++) {
+		unsigned int nr_unpin;
+
+		new_array[i] = page_array[j];
+		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
+					data->nr_pages_mid - 1);
+		if (nr_unpin)
+			unpin_user_pages(&page_array[j+1], nr_unpin);
+		j += data->nr_pages_mid;
+		nr_pages_left -= data->nr_pages_mid;
+	}
+	kvfree(page_array);
+	*pages = new_array;
+	*nr_pages = nr_folios;
+	return true;
+}
+
+static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
+					 struct io_imu_folio_data *data)
+{
+	struct page **page_array = *pages;
+	struct folio *folio = page_folio(page_array[0]);
+	unsigned int count = 1, nr_folios = 1;
+	int i;
+
+	if (*nr_pages <= 1)
+		return false;
+
+	data->nr_pages_mid = folio_nr_pages(folio);
+	if (data->nr_pages_mid == 1)
+		return false;
+
+	data->folio_shift = folio_shift(folio);
+	/*
+	 * Check if pages are contiguous inside a folio, and all folios have
+	 * the same page count except for the head and tail.
+	 */
+	for (i = 1; i < *nr_pages; i++) {
+		if (page_folio(page_array[i]) == folio &&
+			page_array[i] == page_array[i-1] + 1) {
+			count++;
+			continue;
+		}
+
+		if (nr_folios == 1) {
+			if (folio_page_idx(folio, page_array[i-1]) !=
+				data->nr_pages_mid - 1)
+				return false;
+
+			data->nr_pages_head = count;
+		} else if (count != data->nr_pages_mid) {
+			return false;
+		}
+
+		folio = page_folio(page_array[i]);
+		if (folio_size(folio) != (1UL << data->folio_shift) ||
+			folio_page_idx(folio, page_array[i]) != 0)
+			return false;
+
+		count = 1;
+		nr_folios++;
+	}
+	if (nr_folios == 1)
+		data->nr_pages_head = count;
+
+	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+}
+
 static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 				  struct io_mapped_ubuf **pimu,
 				  struct page **last_hpage)
@@ -864,7 +959,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
-	struct folio *folio = NULL;
+	struct io_imu_folio_data data;
+	bool coalesced;

 	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
 	if (!iov->iov_base)
@@ -879,31 +975,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}

-	/* If it's a huge page, try to coalesce them into a single bvec entry */
-	if (nr_pages > 1) {
-		folio = page_folio(pages[0]);
-		for (i = 1; i < nr_pages; i++) {
-			/*
-			 * Pages must be consecutive and on the same folio for
-			 * this to work
-			 */
-			if (page_folio(pages[i]) != folio ||
-			    pages[i] != pages[i - 1] + 1) {
-				folio = NULL;
-				break;
-			}
-		}
-		if (folio) {
-			/*
-			 * The pages are bound to the folio, it doesn't
-			 * actually unpin them but drops all but one reference,
-			 * which is usually put down by io_buffer_unmap().
-			 * Note, needs a better helper.
-			 */
-			unpin_user_pages(&pages[1], nr_pages - 1);
-			nr_pages = 1;
-		}
-	}
+	/* If it's huge page(s), try to coalesce them into fewer bvec entries */
+	coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);

 	imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
 	if (!imu)
@@ -915,23 +988,26 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 	}

-	off = (unsigned long) iov->iov_base & ~PAGE_MASK;
 	size = iov->iov_len;
 	/* store original address for later verification */
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	imu->folio_shift = PAGE_SHIFT;
+	imu->folio_mask = PAGE_MASK;
+	if (coalesced) {
+		imu->folio_shift = data.folio_shift;
+		imu->folio_mask = ~((1UL << data.folio_shift) - 1);
+	}
+	refcount_set(&imu->refs, 1);
+	off = (unsigned long) iov->iov_base & ~imu->folio_mask;
 	*pimu = imu;
 	ret = 0;

-	if (folio) {
-		bvec_set_page(&imu->bvec[0], pages[0], size, off);
-		goto done;
-	}
 	for (i = 0; i < nr_pages; i++) {
 		size_t vec_len;

-		vec_len = min_t(size_t, size, PAGE_SIZE - off);
+		vec_len = min_t(size_t, size, (1UL << imu->folio_shift) - off);
 		bvec_set_page(&imu->bvec[i], pages[i], vec_len, off);
 		off = 0;
 		size -= vec_len;
@@ -1042,23 +1118,18 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 		 * we know that:
 		 *
 		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are PAGE_SIZE in size, except potentially the
+		 * 2) all bvecs are the same in size, except potentially the
 		 *    first and last bvec
 		 *
 		 * So just find our index, and adjust the iterator afterwards.
 		 * If the offset is within the first bvec (or the whole first
 		 * bvec, just use iov_iter_advance(). This makes it easier
 		 * since we can just skip the first segment, which may not
-		 * be PAGE_SIZE aligned.
+		 * be folio_size aligned.
 		 */
 		const struct bio_vec *bvec = imu->bvec;

 		if (offset < bvec->bv_len) {
-			/*
-			 * Note, huge pages buffers consists of one large
-			 * bvec entry and should always go this way. The other
-			 * branch doesn't expect non PAGE_SIZE'd chunks.
-			 */
 			iter->bvec = bvec;
 			iter->count -= offset;
 			iter->iov_offset = offset;
@@ -1067,14 +1138,104 @@ int io_import_fixed(int ddir, struct iov_iter *iter,

 			/* skip first vec */
 			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> PAGE_SHIFT);
+			seg_skip = 1 + (offset >> imu->folio_shift);

 			iter->bvec = bvec + seg_skip;
 			iter->nr_segs -= seg_skip;
 			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ~PAGE_MASK;
+			iter->iov_offset = offset & ~imu->folio_mask;
 		}
 	}

 	return 0;
 }
+
+static int io_copy_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
+{
+	struct io_mapped_ubuf **user_bufs;
+	struct io_rsrc_data *data;
+	int i, ret, nbufs;
+
+	/*
+	 * Drop our own lock here. We'll setup the data we need and reference
+	 * the source buffers, then re-grab, check, and assign at the end.
+	 */
+	mutex_unlock(&ctx->uring_lock);
+
+	mutex_lock(&src_ctx->uring_lock);
+	ret = -ENXIO;
+	nbufs = src_ctx->nr_user_bufs;
+	if (!nbufs)
+		goto out_unlock;
+	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+	if (ret)
+		goto out_unlock;
+
+	ret = -ENOMEM;
+	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+	if (!user_bufs)
+		goto out_free_data;
+
+	for (i = 0; i < nbufs; i++) {
+		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+
+		refcount_inc(&src->refs);
+		user_bufs[i] = src;
+	}
+
+	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
+	mutex_unlock(&src_ctx->uring_lock);
+	mutex_lock(&ctx->uring_lock);
+	if (!ctx->user_bufs) {
+		ctx->user_bufs = user_bufs;
+		ctx->buf_data = data;
+		ctx->nr_user_bufs = nbufs;
+		return 0;
+	}
+
+	/* someone raced setting up buffers, dump ours */
+	for (i = 0; i < nbufs; i++)
+		io_buffer_unmap(ctx, &user_bufs[i]);
+	io_rsrc_data_free(data);
+	kfree(user_bufs);
+	return -EBUSY;
+out_free_data:
+	io_rsrc_data_free(data);
+out_unlock:
+	mutex_unlock(&src_ctx->uring_lock);
+	mutex_lock(&ctx->uring_lock);
+	return ret;
+}
+
+/*
+ * Copy the registered buffers from the source ring whose file descriptor
+ * is given in the src_fd to the current ring. This is identical to registering
+ * the buffers with ctx, except faster as mappings already exist.
+ *
+ * Since the memory is already accounted once, don't account it again.
+ */
+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_copy_buffers buf;
+	bool registered_src;
+	struct file *file;
+	int ret;
+
+	if (ctx->user_bufs || ctx->nr_user_bufs)
+		return -EBUSY;
+	if (copy_from_user(&buf, arg, sizeof(buf)))
+		return -EFAULT;
+	if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED)
+		return -EINVAL;
+	if (memchr_inv(buf.pad, 0, sizeof(buf.pad)))
+		return -EINVAL;
+
+	registered_src = (buf.flags & IORING_REGISTER_SRC_REGISTERED) != 0;
+	file = io_uring_register_get_file(buf.src_fd, registered_src);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	ret = io_copy_buffers(ctx, file->private_data);
+	if (!registered_src)
+		fput(file);
+	return ret;
+}
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -22,8 +22,6 @@ struct io_rsrc_put {
 	};
 };

-typedef void (rsrc_put_fn)(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-
 struct io_rsrc_data {
 	struct io_ring_ctx		*ctx;

@@ -46,10 +44,21 @@ struct io_mapped_ubuf {
 	u64		ubuf;
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
+	unsigned int    folio_shift;
 	unsigned long	acct_pages;
+	unsigned long   folio_mask;
+	refcount_t	refs;
 	struct bio_vec	bvec[] __counted_by(nr_bvecs);
 };

+struct io_imu_folio_data {
+	/* Head folio can be partially included in the fixed buf */
+	unsigned int	nr_pages_head;
+	/* For non-head/tail folios, has to be fully included */
+	unsigned int	nr_pages_mid;
+	unsigned int	folio_shift;
+};
+
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
 void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
@@ -59,6 +68,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len);

+int io_register_copy_buffers(struct io_ring_ctx *ctx, void __user *arg);
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,

--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -467,8 +467,7 @@ static void io_req_io_end(struct io_kiocb *req)
 static bool __io_complete_rw_common(struct io_kiocb *req, long res)
 {
 	if (unlikely(res != req->cqe.res)) {
-		if ((res == -EAGAIN || res == -EOPNOTSUPP) &&
-		    io_rw_should_reissue(req)) {
+		if (res == -EAGAIN && io_rw_should_reissue(req)) {
 			/*
 			 * Reissue will start accounting again, finish the
 			 * current cycle.
@@ -511,7 +510,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 	io_req_io_end(req);

 	if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
-		req->cqe.flags |= io_put_kbuf(req, 0);
+		req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0);

 	io_req_rw_cleanup(req, 0);
 	io_req_task_complete(req, ts);
@@ -593,7 +592,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 			 */
 			io_req_io_end(req);
 			io_req_set_res(req, final_ret,
-				       io_put_kbuf(req, issue_flags));
+				       io_put_kbuf(req, ret, issue_flags));
 			io_req_rw_cleanup(req, issue_flags);
 			return IOU_OK;
 		}
@@ -855,6 +854,14 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)

 	ret = io_iter_do_read(rw, &io->iter);

+	/*
+	 * Some file systems like to return -EOPNOTSUPP for an IOCB_NOWAIT
+	 * issue, even though they should be returning -EAGAIN. To be safe,
+	 * retry from blocking context for either.
+	 */
+	if (ret == -EOPNOTSUPP && force_nonblock)
+		ret = -EAGAIN;
+
 	if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) {
 		req->flags &= ~REQ_F_REISSUE;
 		/* If we can poll, just do that. */
@@ -975,7 +982,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
 		 * Put our buffer and post a CQE. If we fail to post a CQE, then
 		 * jump to the termination path. This request is then done.
 		 */
-		cflags = io_put_kbuf(req, issue_flags);
+		cflags = io_put_kbuf(req, ret, issue_flags);
 		rw->len = 0; /* similarly to above, reset len to 0 */

 		if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) {
@@ -1167,7 +1174,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (!smp_load_acquire(&req->iopoll_completed))
 			break;
 		nr_events++;
-		req->cqe.flags = io_put_kbuf(req, 0);
+		req->cqe.flags = io_put_kbuf(req, req->cqe.res, 0);
 		if (req->opcode != IORING_OP_URING_CMD)
 			io_req_rw_cleanup(req, 0);
 	}

--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -10,6 +10,7 @@
 #include <linux/slab.h>
 #include <linux/audit.h>
 #include <linux/security.h>
+#include <linux/cpuset.h>
 #include <linux/io_uring.h>

 #include <uapi/linux/io_uring.h>
@@ -176,7 +177,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
 	if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE)
 		to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE;

-	if (!wq_list_empty(&ctx->iopoll_list) || to_submit) {
+	if (to_submit || !wq_list_empty(&ctx->iopoll_list)) {
 		const struct cred *creds = NULL;

 		if (ctx->sq_creds != current_cred())
@@ -460,10 +461,12 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 			return 0;

 		if (p->flags & IORING_SETUP_SQ_AFF) {
+			struct cpumask allowed_mask;
 			int cpu = p->sq_thread_cpu;

 			ret = -EINVAL;
-			if (cpu >= nr_cpu_ids || !cpu_online(cpu))
+			cpuset_cpus_allowed(current, &allowed_mask);
+			if (!cpumask_test_cpu(cpu, &allowed_mask))
 				goto err_sqpoll;
 			sqd->sq_cpu = cpu;
 		} else {