Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough

* for-5.19/io_uring: (85 commits) io_uring: don't clear req->kbuf when buffer selection is done io_uring: eliminate the need to track provided buffer ID separately io_uring: move provided buffer state closer to submit state io_uring: move provided and fixed buffers into the same io_kiocb area io_uring: abstract out provided buffer list selection io_uring: never call io_buffer_select() for a buffer re-select io_uring: get rid of hashed provided buffer groups io_uring: always use req->buf_index for the provided buffer group io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set io_uring: kill io_rw_buffer_select() wrapper io_uring: make io_buffer_select() return the user address directly io_uring: kill io_recv_buffer_select() wrapper io_uring: use 'sr' vs 'req->sr_msg' consistently io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg io_uring: check IOPOLL/ioprio support upfront io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread() io_uring: add IORING_SETUP_TASKRUN_FLAG io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used io_uring: set task_work notify method at init time io-wq: use __set_notify_signal() to wake workers ...

Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough
* for-5.19/io_uring: (85 commits) io_uring: don't clear req->kbuf when buffer selection is done io_uring: eliminate the need to track provided buffer ID separately io_uring: move provided buffer state closer to submit state io_uring: move provided and fixed buffers into the same io_kiocb area io_uring: abstract out provided buffer list selection io_uring: never call io_buffer_select() for a buffer re-select io_uring: get rid of hashed provided buffer groups io_uring: always use req->buf_index for the provided buffer group io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set io_uring: kill io_rw_buffer_select() wrapper io_uring: make io_buffer_select() return the user address directly io_uring: kill io_recv_buffer_select() wrapper io_uring: use 'sr' vs 'req->sr_msg' consistently io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg io_uring: check IOPOLL/ioprio support upfront io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread() io_uring: add IORING_SETUP_TASKRUN_FLAG io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used io_uring: set task_work notify method at init time io-wq: use __set_notify_signal() to wake workers ...
13086899 · Jens Axboe · c5eb0a61 · 7ccba24d · 13086899 · 13086899
Commit 13086899 authored May 09, 2022 by Jens Axboe
8 changed files
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
 static bool io_wq_worker_wake(struct io_worker *worker, void *data)
 {
-	set_notify_signal(worker->task);
+	__set_notify_signal(worker->task);
 	wake_up_process(worker->task);
 	return false;
 }
@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
 {
 	if (work && match->fn(work, match->data)) {
 		work->flags |= IO_WQ_WORK_CANCEL;
-		set_notify_signal(worker->task);
+		__set_notify_signal(worker->task);
 		return true;
 	}

--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
 struct io_wq_work {
 	struct io_wq_work_node list;
 	unsigned flags;
+	int cancel_seq;
 };
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)

--- a/fs/io_uring.c
+++ b/fs/io_uring.c
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -355,14 +355,23 @@ static inline void clear_notify_signal(void)
 	smp_mb__after_atomic();
 }
+/*
+ * Returns 'true' if kick_process() is needed to force a transition from
+ * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
+ */
+static inline bool __set_notify_signal(struct task_struct *task)
+{
+	return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+	       !wake_up_state(task, TASK_INTERRUPTIBLE);
+}
 /*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop().
 */
 static inline void set_notify_signal(struct task_struct *task)
 {
-	if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
+	if (__set_notify_signal(task))
-	    !wake_up_state(task, TASK_INTERRUPTIBLE))
 		kick_process(task);
 }

--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -17,6 +17,7 @@ enum task_work_notify_mode {
 	TWA_NONE,
 	TWA_RESUME,
 	TWA_SIGNAL,
+	TWA_SIGNAL_NO_IPI,
 };
 static inline bool task_work_pending(struct task_struct *task)

--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed,
 	),
 	TP_printk("ring %p, req %p, user_data 0x%llx, "
-		"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
+		  "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
 		  "len=%u, rw_flags=0x%x, buf_index=%d, "
 		  "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
 		  __entry->ctx, __entry->req, __entry->user_data,
@@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed,
 		  (unsigned long long) __entry->pad2, __entry->error)
 );
+/*
+ * io_uring_cqe_overflow - a CQE overflowed
+ *
+ * @ctx:		pointer to a ring context structure
+ * @user_data:		user data associated with the request
+ * @res:		CQE result
+ * @cflags:		CQE flags
+ * @ocqe:		pointer to the overflow cqe (if available)
+ *
+ */
+TRACE_EVENT(io_uring_cqe_overflow,
+	TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags,
+		 void *ocqe),
+	TP_ARGS(ctx, user_data, res, cflags, ocqe),
+	TP_STRUCT__entry (
+		__field(  void *,		ctx		)
+		__field(  unsigned long long,	user_data	)
+		__field(  s32,			res		)
+		__field(  u32,			cflags		)
+		__field(  void *,		ocqe		)
+	),
+	TP_fast_assign(
+		__entry->ctx		= ctx;
+		__entry->user_data	= user_data;
+		__entry->res		= res;
+		__entry->cflags		= cflags;
+		__entry->ocqe		= ocqe;
+	),
+	TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
+		  "overflow_cqe %p",
+		  __entry->ctx, __entry->user_data, __entry->res,
+		  __entry->cflags, __entry->ocqe)
+);
 #endif /* _TRACE_IO_URING_H */
 /* This part must be outside protection */

--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -102,6 +102,20 @@ enum {
 #define IORING_SETUP_ATTACH_WQ	(1U << 5)	/* attach to existing wq */
 #define IORING_SETUP_R_DISABLED	(1U << 6)	/* start with ring disabled */
 #define IORING_SETUP_SUBMIT_ALL	(1U << 7)	/* continue submit on error */
+/*
+ * Cooperative task running. When requests complete, they often require
+ * forcing the submitter to transition to the kernel to complete. If this
+ * flag is set, work will be done when the task transitions anyway, rather
+ * than force an inter-processor interrupt reschedule. This avoids interrupting
+ * a task running in userspace, and saves an IPI.
+ */
+#define IORING_SETUP_COOP_TASKRUN	(1U << 8)
+/*
+ * If COOP_TASKRUN is set, get notified if task work is available for
+ * running and a kernel transition would be needed to run it. This sets
+ * IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN.
+ */
+#define IORING_SETUP_TASKRUN_FLAG	(1U << 9)
 enum {
 	IORING_OP_NOP,
@@ -187,6 +201,28 @@ enum {
 #define IORING_POLL_UPDATE_EVENTS	(1U << 1)
 #define IORING_POLL_UPDATE_USER_DATA	(1U << 2)
+/*
+ * ASYNC_CANCEL flags.
+ *
+ * IORING_ASYNC_CANCEL_ALL	Cancel all requests that match the given key
+ * IORING_ASYNC_CANCEL_FD	Key off 'fd' for cancelation rather than the
+ *				request 'user_data'
+ * IORING_ASYNC_CANCEL_ANY	Match any request
+ */
+#define IORING_ASYNC_CANCEL_ALL	(1U << 0)
+#define IORING_ASYNC_CANCEL_FD	(1U << 1)
+#define IORING_ASYNC_CANCEL_ANY	(1U << 2)
+/*
+ * send/sendmsg and recv/recvmsg flags (sqe->addr2)
+ *
+ * IORING_RECVSEND_POLL_FIRST	If set, instead of first attempting to send
+ *				or receive and arm poll if that yields an
+ *				-EAGAIN result, arm poll upfront and skip
+ *				the initial transfer attempt.
+ */
+#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
 /*
 * IO completion data structure (Completion Queue Entry)
 */
@@ -236,6 +272,7 @@ struct io_sqring_offsets {
 */
 #define IORING_SQ_NEED_WAKEUP	(1U << 0) /* needs io_uring_enter wakeup */
 #define IORING_SQ_CQ_OVERFLOW	(1U << 1) /* CQ ring is overflown */
+#define IORING_SQ_TASKRUN	(1U << 2) /* task should enter the kernel */
 struct io_cqring_offsets {
 	__u32 head;

--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
- * is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the
+ * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
- * it will interrupt the targeted task and run the task_work. @TWA_RESUME
+ *
- * work is run only when the task exits the kernel and returns to user mode,
+ * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
- * or before entering guest mode. Fails if the @task is exiting/exited and thus
+ * task and run the task_work, regardless of whether the task is currently
- * it can't process this @work. Otherwise @work->func() will be called when the
+ * running in the kernel or userspace.
- * @task goes through one of the aforementioned transitions, or exits.
+ * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
+ * reschedule IPI to force the targeted task to reschedule and run task_work.
+ * This can be advantageous if there's no strict requirement that the
+ * task_work be run as soon as possible, just whenever the task enters the
+ * kernel anyway.
+ * @TWA_RESUME work is run only when the task exits the kernel and returns to
+ * user mode, or before entering guest mode.
+ *
+ * Fails if the @task is exiting/exited and thus it can't process this @work.
+ * Otherwise @work->func() will be called when the @task goes through one of
+ * the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
 	case TWA_SIGNAL:
 		set_notify_signal(task);
 		break;
+	case TWA_SIGNAL_NO_IPI:
+		__set_notify_signal(task);
+		break;
 	default:
 		WARN_ON_ONCE(1);
 		break;