Commit 13086899 authored by Jens Axboe

Merge branch 'for-5.19/io_uring' into for-5.19/io_uring-passthrough

* for-5.19/io_uring: (85 commits)
  io_uring: don't clear req->kbuf when buffer selection is done
  io_uring: eliminate the need to track provided buffer ID separately
  io_uring: move provided buffer state closer to submit state
  io_uring: move provided and fixed buffers into the same io_kiocb area
  io_uring: abstract out provided buffer list selection
  io_uring: never call io_buffer_select() for a buffer re-select
  io_uring: get rid of hashed provided buffer groups
  io_uring: always use req->buf_index for the provided buffer group
  io_uring: ignore ->buf_index if REQ_F_BUFFER_SELECT isn't set
  io_uring: kill io_rw_buffer_select() wrapper
  io_uring: make io_buffer_select() return the user address directly
  io_uring: kill io_recv_buffer_select() wrapper
  io_uring: use 'sr' vs 'req->sr_msg' consistently
  io_uring: add POLL_FIRST support for send/sendmsg and recv/recvmsg
  io_uring: check IOPOLL/ioprio support upfront
  io_uring: replace smp_mb() with smp_mb__after_atomic() in io_sq_thread()
  io_uring: add IORING_SETUP_TASKRUN_FLAG
  io_uring: use TWA_SIGNAL_NO_IPI if IORING_SETUP_COOP_TASKRUN is used
  io_uring: set task_work notify method at init time
  io-wq: use __set_notify_signal() to wake workers
  ...
parents c5eb0a61 7ccba24d
...@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe, ...@@ -871,7 +871,7 @@ static bool io_wq_for_each_worker(struct io_wqe *wqe,
static bool io_wq_worker_wake(struct io_worker *worker, void *data) static bool io_wq_worker_wake(struct io_worker *worker, void *data)
{ {
set_notify_signal(worker->task); __set_notify_signal(worker->task);
wake_up_process(worker->task); wake_up_process(worker->task);
return false; return false;
} }
...@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker, ...@@ -991,7 +991,7 @@ static bool __io_wq_worker_cancel(struct io_worker *worker,
{ {
if (work && match->fn(work, match->data)) { if (work && match->fn(work, match->data)) {
work->flags |= IO_WQ_WORK_CANCEL; work->flags |= IO_WQ_WORK_CANCEL;
set_notify_signal(worker->task); __set_notify_signal(worker->task);
return true; return true;
} }
......
...@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) ...@@ -155,6 +155,7 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
struct io_wq_work { struct io_wq_work {
struct io_wq_work_node list; struct io_wq_work_node list;
unsigned flags; unsigned flags;
int cancel_seq;
}; };
static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -355,14 +355,23 @@ static inline void clear_notify_signal(void) ...@@ -355,14 +355,23 @@ static inline void clear_notify_signal(void)
smp_mb__after_atomic(); smp_mb__after_atomic();
} }
/*
 * Set TIF_NOTIFY_SIGNAL on @task and, if the flag was not already set,
 * attempt to wake the task out of TASK_INTERRUPTIBLE.
 *
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
 * That is only the case when this call newly set the flag and
 * wake_up_state() returned false. If the flag was already set,
 * wake_up_state() is not called at all and 'false' is returned.
 *
 * Callers that do not need the task to be kicked may ignore the return
 * value.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
!wake_up_state(task, TASK_INTERRUPTIBLE);
}
/* /*
* Called to break out of interruptible wait loops, and enter the * Called to break out of interruptible wait loops, and enter the
* exit_to_user_mode_loop(). * exit_to_user_mode_loop().
*/ */
static inline void set_notify_signal(struct task_struct *task) static inline void set_notify_signal(struct task_struct *task)
{ {
if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && if (__set_notify_signal(task))
!wake_up_state(task, TASK_INTERRUPTIBLE))
kick_process(task); kick_process(task);
} }
......
...@@ -17,6 +17,7 @@ enum task_work_notify_mode { ...@@ -17,6 +17,7 @@ enum task_work_notify_mode {
TWA_NONE, TWA_NONE,
TWA_RESUME, TWA_RESUME,
TWA_SIGNAL, TWA_SIGNAL,
TWA_SIGNAL_NO_IPI,
}; };
static inline bool task_work_pending(struct task_struct *task) static inline bool task_work_pending(struct task_struct *task)
......
...@@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed, ...@@ -530,7 +530,7 @@ TRACE_EVENT(io_uring_req_failed,
), ),
TP_printk("ring %p, req %p, user_data 0x%llx, " TP_printk("ring %p, req %p, user_data 0x%llx, "
"op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "op %d, flags 0x%x, prio=%d, off=%llu, addr=%llu, "
"len=%u, rw_flags=0x%x, buf_index=%d, " "len=%u, rw_flags=0x%x, buf_index=%d, "
"personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d", "personality=%d, file_index=%d, pad=0x%llx/%llx, error=%d",
__entry->ctx, __entry->req, __entry->user_data, __entry->ctx, __entry->req, __entry->user_data,
...@@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed, ...@@ -543,6 +543,46 @@ TRACE_EVENT(io_uring_req_failed,
(unsigned long long) __entry->pad2, __entry->error) (unsigned long long) __entry->pad2, __entry->error)
); );
/*
 * io_uring_cqe_overflow - a CQE overflowed
 *
 * @ctx: pointer to a ring context structure
 * @user_data: user data associated with the request
 * @res: CQE result
 * @cflags: CQE flags
 * @ocqe: pointer to the overflow cqe (if available)
 *
 * Records all five arguments verbatim in the trace entry. The printed
 * record shows @user_data and @cflags in hexadecimal, @res as a signed
 * decimal, and @ctx and @ocqe as pointers.
 */
TRACE_EVENT(io_uring_cqe_overflow,
TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags,
void *ocqe),
TP_ARGS(ctx, user_data, res, cflags, ocqe),
TP_STRUCT__entry (
__field( void *, ctx )
__field( unsigned long long, user_data )
__field( s32, res )
__field( u32, cflags )
__field( void *, ocqe )
),
TP_fast_assign(
__entry->ctx = ctx;
__entry->user_data = user_data;
__entry->res = res;
__entry->cflags = cflags;
__entry->ocqe = ocqe;
),
TP_printk("ring %p, user_data 0x%llx, res %d, flags %x, "
"overflow_cqe %p",
__entry->ctx, __entry->user_data, __entry->res,
__entry->cflags, __entry->ocqe)
);
#endif /* _TRACE_IO_URING_H */ #endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
...@@ -102,6 +102,20 @@ enum { ...@@ -102,6 +102,20 @@ enum {
#define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */ #define IORING_SETUP_ATTACH_WQ (1U << 5) /* attach to existing wq */
#define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */ #define IORING_SETUP_R_DISABLED (1U << 6) /* start with ring disabled */
#define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */ #define IORING_SETUP_SUBMIT_ALL (1U << 7) /* continue submit on error */
/*
* Cooperative task running. When requests complete, they often require
* forcing the submitter to transition to the kernel to complete. If this
* flag is set, work will be done when the task transitions anyway, rather
* than force an inter-processor interrupt reschedule. This avoids interrupting
* a task running in userspace, and saves an IPI.
*/
#define IORING_SETUP_COOP_TASKRUN (1U << 8)
/*
* If COOP_TASKRUN is set, get notified if task work is available for
* running and a kernel transition would be needed to run it. This sets
* IORING_SQ_TASKRUN in the sq ring flags. Not valid without COOP_TASKRUN.
*/
#define IORING_SETUP_TASKRUN_FLAG (1U << 9)
enum { enum {
IORING_OP_NOP, IORING_OP_NOP,
...@@ -187,6 +201,28 @@ enum { ...@@ -187,6 +201,28 @@ enum {
#define IORING_POLL_UPDATE_EVENTS (1U << 1) #define IORING_POLL_UPDATE_EVENTS (1U << 1)
#define IORING_POLL_UPDATE_USER_DATA (1U << 2) #define IORING_POLL_UPDATE_USER_DATA (1U << 2)
/*
* ASYNC_CANCEL flags.
*
* IORING_ASYNC_CANCEL_ALL Cancel all requests that match the given key
* IORING_ASYNC_CANCEL_FD Key off 'fd' for cancelation rather than the
* request 'user_data'
* IORING_ASYNC_CANCEL_ANY Match any request
*/
#define IORING_ASYNC_CANCEL_ALL (1U << 0)
#define IORING_ASYNC_CANCEL_FD (1U << 1)
#define IORING_ASYNC_CANCEL_ANY (1U << 2)
/*
* send/sendmsg and recv/recvmsg flags (sqe->addr2)
*
* IORING_RECVSEND_POLL_FIRST If set, instead of first attempting to send
* or receive and arm poll if that yields an
* -EAGAIN result, arm poll upfront and skip
* the initial transfer attempt.
*/
#define IORING_RECVSEND_POLL_FIRST (1U << 0)
/* /*
* IO completion data structure (Completion Queue Entry) * IO completion data structure (Completion Queue Entry)
*/ */
...@@ -236,6 +272,7 @@ struct io_sqring_offsets { ...@@ -236,6 +272,7 @@ struct io_sqring_offsets {
*/ */
#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ #define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */
#define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */ #define IORING_SQ_CQ_OVERFLOW (1U << 1) /* CQ ring is overflown */
#define IORING_SQ_TASKRUN (1U << 2) /* task should enter the kernel */
struct io_cqring_offsets { struct io_cqring_offsets {
__u32 head; __u32 head;
......
...@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ ...@@ -12,12 +12,22 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* @notify: how to notify the targeted task * @notify: how to notify the targeted task
* *
* Queue @work for task_work_run() below and notify the @task if @notify * Queue @work for task_work_run() below and notify the @task if @notify
* is @TWA_RESUME or @TWA_SIGNAL. @TWA_SIGNAL works like signals, in that the * is @TWA_RESUME, @TWA_SIGNAL, or @TWA_SIGNAL_NO_IPI.
* it will interrupt the targeted task and run the task_work. @TWA_RESUME *
* work is run only when the task exits the kernel and returns to user mode, * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
* or before entering guest mode. Fails if the @task is exiting/exited and thus * task and run the task_work, regardless of whether the task is currently
* it can't process this @work. Otherwise @work->func() will be called when the * running in the kernel or userspace.
* @task goes through one of the aforementioned transitions, or exits. * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
* reschedule IPI to force the targeted task to reschedule and run task_work.
* This can be advantageous if there's no strict requirement that the
* task_work be run as soon as possible, just whenever the task enters the
* kernel anyway.
* @TWA_RESUME work is run only when the task exits the kernel and returns to
* user mode, or before entering guest mode.
*
* Fails if the @task is exiting/exited and thus it can't process this @work.
* Otherwise @work->func() will be called when the @task goes through one of
* the aforementioned transitions, or exits.
* *
* If the targeted task is exiting, then an error is returned and the work item * If the targeted task is exiting, then an error is returned and the work item
* is not queued. It's up to the caller to arrange for an alternative mechanism * is not queued. It's up to the caller to arrange for an alternative mechanism
...@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work, ...@@ -53,6 +63,9 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
case TWA_SIGNAL: case TWA_SIGNAL:
set_notify_signal(task); set_notify_signal(task);
break; break;
case TWA_SIGNAL_NO_IPI:
__set_notify_signal(task);
break;
default: default:
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment