Commit d2c84bdc authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux

Pull io_uring updates from Jens Axboe:

 - Make running of task_work internal loops more fair, and unify how the
   different methods deal with them (me)

 - Support for per-ring NAPI. The two minor networking patches are in a
   shared branch with netdev (Stefan)

 - Add support for truncate (Tony)

 - Export SQPOLL utilization stats (Xiaobing)

 - Multishot fixes (Pavel)

 - Fix for a race in manipulating the request flags via poll (Pavel)

 - Cleanup the multishot checking by making it generic, moving it out of
   opcode handlers (Pavel)

 - Various tweaks and cleanups (me, Kunwu, Alexander)

* tag 'for-6.9/io_uring-20240310' of git://git.kernel.dk/linux: (53 commits)
  io_uring: Fix sqpoll utilization check racing with dying sqpoll
  io_uring/net: dedup io_recv_finish req completion
  io_uring: refactor DEFER_TASKRUN multishot checks
  io_uring: fix mshot io-wq checks
  io_uring/net: add io_req_msg_cleanup() helper
  io_uring/net: simplify msghd->msg_inq checking
  io_uring/kbuf: rename REQ_F_PARTIAL_IO to REQ_F_BL_NO_RECYCLE
  io_uring/net: remove dependency on REQ_F_PARTIAL_IO for sr->done_io
  io_uring/net: correctly handle multishot recvmsg retry setup
  io_uring/net: clear REQ_F_BL_EMPTY in the multishot retry handler
  io_uring: fix io_queue_proc modifying req->flags
  io_uring: fix mshot read defer taskrun cqe posting
  io_uring/net: fix overflow check in io_recvmsg_mshot_prep()
  io_uring/net: correct the type of variable
  io_uring/sqpoll: statistics of the true utilization of sq threads
  io_uring/net: move recv/recvmsg flags out of retry loop
  io_uring/kbuf: flag request if buffer pool is empty after buffer pick
  io_uring/net: improve the usercopy for sendmsg/recvmsg
  io_uring/net: move receive multishot out of the generic msghdr path
  io_uring/net: unify how recvmsg and sendmsg copy in the msghdr
  ...
parents 0f1a8766 606559dc
......@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
long do_ftruncate(struct file *file, loff_t length, int small);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
......
......@@ -154,49 +154,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
long do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
struct fd f;
int error;
error = -EINVAL;
if (length < 0)
goto out;
error = -EBADF;
f = fdget(fd);
if (!f.file)
goto out;
/* explicitly opened as large or we are on 64-bit box */
if (f.file->f_flags & O_LARGEFILE)
if (file->f_flags & O_LARGEFILE)
small = 0;
dentry = f.file->f_path.dentry;
dentry = file->f_path.dentry;
inode = dentry->d_inode;
error = -EINVAL;
if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
goto out_putf;
if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
return -EINVAL;
error = -EINVAL;
/* Cannot ftruncate over 2^31 bytes without large file support */
if (small && length > MAX_NON_LFS)
goto out_putf;
return -EINVAL;
error = -EPERM;
/* Check IS_APPEND on real upper inode */
if (IS_APPEND(file_inode(f.file)))
goto out_putf;
if (IS_APPEND(file_inode(file)))
return -EPERM;
sb_start_write(inode->i_sb);
error = security_file_truncate(f.file);
error = security_file_truncate(file);
if (!error)
error = do_truncate(file_mnt_idmap(f.file), dentry, length,
ATTR_MTIME | ATTR_CTIME, f.file);
error = do_truncate(file_mnt_idmap(file), dentry, length,
ATTR_MTIME | ATTR_CTIME, file);
sb_end_write(inode->i_sb);
out_putf:
return error;
}
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
struct fd f;
int error;
if (length < 0)
return -EINVAL;
f = fdget(fd);
if (!f.file)
return -EBADF;
error = do_ftruncate(f.file, length, small);
fdput(f);
out:
return error;
}
......
......@@ -2,6 +2,7 @@
#define IO_URING_TYPES_H
#include <linux/blkdev.h>
#include <linux/hashtable.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
......@@ -240,12 +241,14 @@ struct io_ring_ctx {
unsigned int poll_activated: 1;
unsigned int drain_disabled: 1;
unsigned int compat: 1;
unsigned int iowq_limits_set : 1;
struct task_struct *submitter_task;
struct io_rings *rings;
struct percpu_ref refs;
enum task_work_notify_mode notify_method;
unsigned sq_thread_idle;
} ____cacheline_aligned_in_smp;
/* submission data */
......@@ -274,10 +277,20 @@ struct io_ring_ctx {
*/
struct io_rsrc_node *rsrc_node;
atomic_t cancel_seq;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
bool poll_multi_queue;
struct io_wq_work_list iopoll_list;
struct io_file_table file_table;
struct io_mapped_ubuf **user_bufs;
unsigned nr_user_files;
unsigned nr_user_bufs;
struct io_mapped_ubuf **user_bufs;
struct io_submit_state submit_state;
......@@ -288,15 +301,6 @@ struct io_ring_ctx {
struct io_alloc_cache apoll_cache;
struct io_alloc_cache netmsg_cache;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
* io_uring instances that don't use IORING_SETUP_SQPOLL.
* For SQPOLL, only the single threaded io_sq_thread() will
* manipulate the list, hence no extra locking is needed there.
*/
struct io_wq_work_list iopoll_list;
bool poll_multi_queue;
/*
* Any cancelable uring_cmd is added to this list in
* ->uring_cmd() by io_uring_cmd_insert_cancelable()
......@@ -343,8 +347,8 @@ struct io_ring_ctx {
spinlock_t completion_lock;
/* IRQ completion list, under ->completion_lock */
struct io_wq_work_list locked_free_list;
unsigned int locked_free_nr;
struct io_wq_work_list locked_free_list;
struct list_head io_buffers_comp;
struct list_head cq_overflow_list;
......@@ -366,9 +370,6 @@ struct io_ring_ctx {
unsigned int file_alloc_start;
unsigned int file_alloc_end;
struct xarray personalities;
u32 pers_next;
struct list_head io_buffers_cache;
/* deferred free list, protected by ->uring_lock */
......@@ -389,6 +390,9 @@ struct io_ring_ctx {
struct wait_queue_head rsrc_quiesce_wq;
unsigned rsrc_quiesce;
u32 pers_next;
struct xarray personalities;
/* hashed buffered write serialization */
struct io_wq_hash *hash_map;
......@@ -405,11 +409,22 @@ struct io_ring_ctx {
/* io-wq management, e.g. thread count */
u32 iowq_limits[2];
bool iowq_limits_set;
struct callback_head poll_wq_task_work;
struct list_head defer_list;
unsigned sq_thread_idle;
#ifdef CONFIG_NET_RX_BUSY_POLL
struct list_head napi_list; /* track busy poll napi_id */
spinlock_t napi_lock; /* napi_list lock */
/* napi busy poll default timeout */
unsigned int napi_busy_poll_to;
bool napi_prefer_busy_poll;
bool napi_enabled;
DECLARE_HASHTABLE(napi_ht, 4);
#endif
/* protected by ->completion_lock */
unsigned evfd_last_cq_tail;
......@@ -455,7 +470,6 @@ enum {
REQ_F_SKIP_LINK_CQES_BIT,
REQ_F_SINGLE_POLL_BIT,
REQ_F_DOUBLE_POLL_BIT,
REQ_F_PARTIAL_IO_BIT,
REQ_F_APOLL_MULTISHOT_BIT,
REQ_F_CLEAR_POLLIN_BIT,
REQ_F_HASH_LOCKED_BIT,
......@@ -463,75 +477,88 @@ enum {
REQ_F_SUPPORT_NOWAIT_BIT,
REQ_F_ISREG_BIT,
REQ_F_POLL_NO_LAZY_BIT,
REQ_F_CANCEL_SEQ_BIT,
REQ_F_CAN_POLL_BIT,
REQ_F_BL_EMPTY_BIT,
REQ_F_BL_NO_RECYCLE_BIT,
/* not a real bit, just to check we're not overflowing the space */
__REQ_F_LAST_BIT,
};
typedef u64 __bitwise io_req_flags_t;
#define IO_REQ_FLAG(bitno) ((__force io_req_flags_t) BIT_ULL((bitno)))
enum {
/* ctx owns file */
REQ_F_FIXED_FILE = BIT(REQ_F_FIXED_FILE_BIT),
REQ_F_FIXED_FILE = IO_REQ_FLAG(REQ_F_FIXED_FILE_BIT),
/* drain existing IO first */
REQ_F_IO_DRAIN = BIT(REQ_F_IO_DRAIN_BIT),
REQ_F_IO_DRAIN = IO_REQ_FLAG(REQ_F_IO_DRAIN_BIT),
/* linked sqes */
REQ_F_LINK = BIT(REQ_F_LINK_BIT),
REQ_F_LINK = IO_REQ_FLAG(REQ_F_LINK_BIT),
/* doesn't sever on completion < 0 */
REQ_F_HARDLINK = BIT(REQ_F_HARDLINK_BIT),
REQ_F_HARDLINK = IO_REQ_FLAG(REQ_F_HARDLINK_BIT),
/* IOSQE_ASYNC */
REQ_F_FORCE_ASYNC = BIT(REQ_F_FORCE_ASYNC_BIT),
REQ_F_FORCE_ASYNC = IO_REQ_FLAG(REQ_F_FORCE_ASYNC_BIT),
/* IOSQE_BUFFER_SELECT */
REQ_F_BUFFER_SELECT = BIT(REQ_F_BUFFER_SELECT_BIT),
REQ_F_BUFFER_SELECT = IO_REQ_FLAG(REQ_F_BUFFER_SELECT_BIT),
/* IOSQE_CQE_SKIP_SUCCESS */
REQ_F_CQE_SKIP = BIT(REQ_F_CQE_SKIP_BIT),
REQ_F_CQE_SKIP = IO_REQ_FLAG(REQ_F_CQE_SKIP_BIT),
/* fail rest of links */
REQ_F_FAIL = BIT(REQ_F_FAIL_BIT),
REQ_F_FAIL = IO_REQ_FLAG(REQ_F_FAIL_BIT),
/* on inflight list, should be cancelled and waited on exit reliably */
REQ_F_INFLIGHT = BIT(REQ_F_INFLIGHT_BIT),
REQ_F_INFLIGHT = IO_REQ_FLAG(REQ_F_INFLIGHT_BIT),
/* read/write uses file position */
REQ_F_CUR_POS = BIT(REQ_F_CUR_POS_BIT),
REQ_F_CUR_POS = IO_REQ_FLAG(REQ_F_CUR_POS_BIT),
/* must not punt to workers */
REQ_F_NOWAIT = BIT(REQ_F_NOWAIT_BIT),
REQ_F_NOWAIT = IO_REQ_FLAG(REQ_F_NOWAIT_BIT),
/* has or had linked timeout */
REQ_F_LINK_TIMEOUT = BIT(REQ_F_LINK_TIMEOUT_BIT),
REQ_F_LINK_TIMEOUT = IO_REQ_FLAG(REQ_F_LINK_TIMEOUT_BIT),
/* needs cleanup */
REQ_F_NEED_CLEANUP = BIT(REQ_F_NEED_CLEANUP_BIT),
REQ_F_NEED_CLEANUP = IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
/* already went through poll handler */
REQ_F_POLLED = BIT(REQ_F_POLLED_BIT),
REQ_F_POLLED = IO_REQ_FLAG(REQ_F_POLLED_BIT),
/* buffer already selected */
REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT),
REQ_F_BUFFER_SELECTED = IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
/* buffer selected from ring, needs commit */
REQ_F_BUFFER_RING = BIT(REQ_F_BUFFER_RING_BIT),
REQ_F_BUFFER_RING = IO_REQ_FLAG(REQ_F_BUFFER_RING_BIT),
/* caller should reissue async */
REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT),
REQ_F_REISSUE = IO_REQ_FLAG(REQ_F_REISSUE_BIT),
/* supports async reads/writes */
REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
REQ_F_SUPPORT_NOWAIT = IO_REQ_FLAG(REQ_F_SUPPORT_NOWAIT_BIT),
/* regular file */
REQ_F_ISREG = BIT(REQ_F_ISREG_BIT),
REQ_F_ISREG = IO_REQ_FLAG(REQ_F_ISREG_BIT),
/* has creds assigned */
REQ_F_CREDS = BIT(REQ_F_CREDS_BIT),
REQ_F_CREDS = IO_REQ_FLAG(REQ_F_CREDS_BIT),
/* skip refcounting if not set */
REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT),
REQ_F_REFCOUNT = IO_REQ_FLAG(REQ_F_REFCOUNT_BIT),
/* there is a linked timeout that has to be armed */
REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT),
REQ_F_ARM_LTIMEOUT = IO_REQ_FLAG(REQ_F_ARM_LTIMEOUT_BIT),
/* ->async_data allocated */
REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT),
REQ_F_ASYNC_DATA = IO_REQ_FLAG(REQ_F_ASYNC_DATA_BIT),
/* don't post CQEs while failing linked requests */
REQ_F_SKIP_LINK_CQES = BIT(REQ_F_SKIP_LINK_CQES_BIT),
REQ_F_SKIP_LINK_CQES = IO_REQ_FLAG(REQ_F_SKIP_LINK_CQES_BIT),
/* single poll may be active */
REQ_F_SINGLE_POLL = BIT(REQ_F_SINGLE_POLL_BIT),
REQ_F_SINGLE_POLL = IO_REQ_FLAG(REQ_F_SINGLE_POLL_BIT),
/* double poll may active */
REQ_F_DOUBLE_POLL = BIT(REQ_F_DOUBLE_POLL_BIT),
/* request has already done partial IO */
REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT),
REQ_F_DOUBLE_POLL = IO_REQ_FLAG(REQ_F_DOUBLE_POLL_BIT),
/* fast poll multishot mode */
REQ_F_APOLL_MULTISHOT = BIT(REQ_F_APOLL_MULTISHOT_BIT),
REQ_F_APOLL_MULTISHOT = IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
/* recvmsg special flag, clear EPOLLIN */
REQ_F_CLEAR_POLLIN = BIT(REQ_F_CLEAR_POLLIN_BIT),
REQ_F_CLEAR_POLLIN = IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
REQ_F_HASH_LOCKED = IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
/* don't use lazy poll wake for this request */
REQ_F_POLL_NO_LAZY = BIT(REQ_F_POLL_NO_LAZY_BIT),
REQ_F_POLL_NO_LAZY = IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
/* cancel sequence is set and valid */
REQ_F_CANCEL_SEQ = IO_REQ_FLAG(REQ_F_CANCEL_SEQ_BIT),
/* file is pollable */
REQ_F_CAN_POLL = IO_REQ_FLAG(REQ_F_CAN_POLL_BIT),
/* buffer list was empty after selection of buffer */
REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT),
/* don't recycle provided buffers for this request */
REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
};
typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
......@@ -592,15 +619,17 @@ struct io_kiocb {
* and after selection it points to the buffer ID itself.
*/
u16 buf_index;
unsigned int flags;
unsigned nr_tw;
/* REQ_F_* flags */
io_req_flags_t flags;
struct io_cqe cqe;
struct io_ring_ctx *ctx;
struct task_struct *task;
struct io_rsrc_node *rsrc_node;
union {
/* store used ubuf, so we can prevent reloading */
struct io_mapped_ubuf *imu;
......@@ -621,10 +650,12 @@ struct io_kiocb {
/* cache ->apoll->events */
__poll_t apoll_events;
};
struct io_rsrc_node *rsrc_node;
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
struct hlist_node hash_node;
/* internal polling, see IORING_FEAT_FAST_POLL */
......
......@@ -48,6 +48,10 @@ void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
void napi_busy_loop_rcu(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget);
#else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void)
{
......
......@@ -148,7 +148,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__field( void *, req )
__field( u64, user_data )
__field( u8, opcode )
__field( unsigned int, flags )
__field( unsigned long long, flags )
__field( struct io_wq_work *, work )
__field( int, rw )
......@@ -159,7 +159,7 @@ TRACE_EVENT(io_uring_queue_async_work,
__entry->ctx = req->ctx;
__entry->req = req;
__entry->user_data = req->cqe.user_data;
__entry->flags = req->flags;
__entry->flags = (__force unsigned long long) req->flags;
__entry->opcode = req->opcode;
__entry->work = &req->work;
__entry->rw = rw;
......@@ -167,10 +167,10 @@ TRACE_EVENT(io_uring_queue_async_work,
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%x, %s queue, work %p",
TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p",
__entry->ctx, __entry->req, __entry->user_data,
__get_str(op_str),
__entry->flags, __entry->rw ? "hashed" : "normal", __entry->work)
__get_str(op_str), __entry->flags,
__entry->rw ? "hashed" : "normal", __entry->work)
);
/**
......@@ -378,7 +378,7 @@ TRACE_EVENT(io_uring_submit_req,
__field( void *, req )
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u32, flags )
__field( unsigned long long, flags )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(req->opcode) )
......@@ -389,16 +389,16 @@ TRACE_EVENT(io_uring_submit_req,
__entry->req = req;
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
__entry->flags = (__force unsigned long long) req->flags;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, "
"sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __get_str(op_str),
__entry->flags, __entry->sq_thread)
__entry->user_data, __get_str(op_str), __entry->flags,
__entry->sq_thread)
);
/*
......@@ -602,29 +602,25 @@ TRACE_EVENT(io_uring_cqe_overflow,
*
* @tctx: pointer to a io_uring_task
* @count: how many functions it ran
* @loops: how many loops it ran
*
*/
TRACE_EVENT(io_uring_task_work_run,
TP_PROTO(void *tctx, unsigned int count, unsigned int loops),
TP_PROTO(void *tctx, unsigned int count),
TP_ARGS(tctx, count, loops),
TP_ARGS(tctx, count),
TP_STRUCT__entry (
__field( void *, tctx )
__field( unsigned int, count )
__field( unsigned int, loops )
),
TP_fast_assign(
__entry->tctx = tctx;
__entry->count = count;
__entry->loops = loops;
),
TP_printk("tctx %p, count %u, loops %u",
__entry->tctx, __entry->count, __entry->loops)
TP_printk("tctx %p, count %u", __entry->tctx, __entry->count)
);
TRACE_EVENT(io_uring_short_write,
......
......@@ -255,6 +255,7 @@ enum io_uring_op {
IORING_OP_FUTEX_WAKE,
IORING_OP_FUTEX_WAITV,
IORING_OP_FIXED_FD_INSTALL,
IORING_OP_FTRUNCATE,
/* this goes last, obviously */
IORING_OP_LAST,
......@@ -570,6 +571,10 @@ enum {
/* return status information for a buffer group */
IORING_REGISTER_PBUF_STATUS = 26,
/* set/clear busy poll settings */
IORING_REGISTER_NAPI = 27,
IORING_UNREGISTER_NAPI = 28,
/* this goes last */
IORING_REGISTER_LAST,
......@@ -703,6 +708,14 @@ struct io_uring_buf_status {
__u32 resv[8];
};
/* argument for IORING_(UN)REGISTER_NAPI */
struct io_uring_napi {
__u32 busy_poll_to;
__u8 prefer_busy_poll;
__u8 pad[3];
__u64 resv;
};
/*
* io_uring_restriction->opcode values
*/
......
......@@ -8,6 +8,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
statx.o net.o msg_ring.o timeout.o \
sqpoll.o fdinfo.o tctx.o poll.o \
cancel.o kbuf.o rsrc.o rw.o opdef.o \
notif.o waitid.o register.o
notif.o waitid.o register.o truncate.o
obj-$(CONFIG_IO_WQ) += io-wq.o
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o
......@@ -58,9 +58,8 @@ bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd)
return false;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
check_seq:
if (cd->seq == req->work.cancel_seq)
if (io_cancel_match_sequence(req, cd->seq))
return false;
req->work.cancel_seq = cd->seq;
}
return true;
......
......@@ -25,4 +25,14 @@ void init_hash_table(struct io_hash_table *table, unsigned size);
int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg);
bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd);
static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence)
{
if ((req->flags & REQ_F_CANCEL_SEQ) && sequence == req->work.cancel_seq)
return true;
req->flags |= REQ_F_CANCEL_SEQ;
req->work.cancel_seq = sequence;
return false;
}
#endif
......@@ -55,6 +55,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
struct io_ring_ctx *ctx = f->private_data;
struct io_overflow_cqe *ocqe;
struct io_rings *r = ctx->rings;
struct rusage sq_usage;
unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1;
unsigned int sq_head = READ_ONCE(r->sq.head);
unsigned int sq_tail = READ_ONCE(r->sq.tail);
......@@ -64,6 +65,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
unsigned int sq_shift = 0;
unsigned int sq_entries, cq_entries;
int sq_pid = -1, sq_cpu = -1;
u64 sq_total_time = 0, sq_work_time = 0;
bool has_lock;
unsigned int i;
......@@ -145,12 +147,24 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) {
struct io_sq_data *sq = ctx->sq_data;
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
/*
* sq->thread might be NULL if we raced with the sqpoll
* thread termination.
*/
if (sq->thread) {
sq_pid = sq->task_pid;
sq_cpu = sq->sq_cpu;
getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+ sq_usage.ru_stime.tv_usec);
sq_work_time = sq->work_time;
}
}
seq_printf(m, "SqThread:\t%d\n", sq_pid);
seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
struct file *f = io_file_from_index(&ctx->file_table, i);
......
......@@ -17,7 +17,7 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset);
int io_register_file_alloc_range(struct io_ring_ctx *ctx,
struct io_uring_file_index_range __user *arg);
unsigned int io_file_get_flags(struct file *file);
io_req_flags_t io_file_get_flags(struct file *file);
static inline void io_file_bitmap_clear(struct io_file_table *table, int bit)
{
......
This diff is collapsed.
......@@ -5,6 +5,7 @@
#include <linux/lockdep.h>
#include <linux/resume_user_mode.h>
#include <linux/kasan.h>
#include <linux/poll.h>
#include <linux/io_uring_types.h>
#include <uapi/linux/eventpoll.h>
#include "io-wq.h"
......@@ -34,6 +35,32 @@ enum {
IOU_STOP_MULTISHOT = -ECANCELED,
};
struct io_wait_queue {
struct wait_queue_entry wq;
struct io_ring_ctx *ctx;
unsigned cq_tail;
unsigned nr_timeouts;
ktime_t timeout;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_busy_poll_to;
bool napi_prefer_busy_poll;
#endif
};
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
int dist = READ_ONCE(ctx->rings->cq.tail) - (int) iowq->cq_tail;
/*
* Wake up if we have enough events, or if a timeout occurred since we
* started waiting. For timeouts, we always want to return to userspace,
* regardless of event count.
*/
return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts;
}
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow);
void io_req_cqe_overflow(struct io_kiocb *req);
int io_run_task_work_sig(struct io_ring_ctx *ctx);
......@@ -56,6 +83,8 @@ void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use);
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts);
void io_req_task_queue_fail(struct io_kiocb *req, int ret);
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts);
struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries);
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count);
void tctx_task_work(struct callback_head *cb);
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd);
int io_uring_alloc_task_context(struct task_struct *task,
......@@ -207,7 +236,7 @@ static inline void io_ring_submit_unlock(struct io_ring_ctx *ctx,
unsigned issue_flags)
{
lockdep_assert_held(&ctx->uring_lock);
if (issue_flags & IO_URING_F_UNLOCKED)
if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
mutex_unlock(&ctx->uring_lock);
}
......@@ -220,7 +249,7 @@ static inline void io_ring_submit_lock(struct io_ring_ctx *ctx,
* The only exception is when we've detached the request and issue it
* from an async worker thread, grab the lock for that case.
*/
if (issue_flags & IO_URING_F_UNLOCKED)
if (unlikely(issue_flags & IO_URING_F_UNLOCKED))
mutex_lock(&ctx->uring_lock);
lockdep_assert_held(&ctx->uring_lock);
}
......@@ -274,6 +303,8 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline int io_run_task_work(void)
{
bool ret = false;
/*
* Always check-and-clear the task_work notification signal. With how
* signaling works for task_work, we can find it set with nothing to
......@@ -285,18 +316,26 @@ static inline int io_run_task_work(void)
* PF_IO_WORKER never returns to userspace, so check here if we have
* notify work that needs processing.
*/
if (current->flags & PF_IO_WORKER &&
test_thread_flag(TIF_NOTIFY_RESUME)) {
__set_current_state(TASK_RUNNING);
resume_user_mode_work(NULL);
if (current->flags & PF_IO_WORKER) {
if (test_thread_flag(TIF_NOTIFY_RESUME)) {
__set_current_state(TASK_RUNNING);
resume_user_mode_work(NULL);
}
if (current->io_uring) {
unsigned int count = 0;
tctx_task_work_run(current->io_uring, UINT_MAX, &count);
if (count)
ret = true;
}
}
if (task_work_pending(current)) {
__set_current_state(TASK_RUNNING);
task_work_run();
return 1;
ret = true;
}
return 0;
return ret;
}
static inline bool io_task_work_pending(struct io_ring_ctx *ctx)
......@@ -398,4 +437,26 @@ static inline size_t uring_sqe_size(struct io_ring_ctx *ctx)
return 2 * sizeof(struct io_uring_sqe);
return sizeof(struct io_uring_sqe);
}
static inline bool io_file_can_poll(struct io_kiocb *req)
{
if (req->flags & REQ_F_CAN_POLL)
return true;
if (file_can_poll(req->file)) {
req->flags |= REQ_F_CAN_POLL;
return true;
}
return false;
}
enum {
IO_CHECK_CQ_OVERFLOW_BIT,
IO_CHECK_CQ_DROPPED_BIT,
};
static inline bool io_has_work(struct io_ring_ctx *ctx)
{
return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) ||
!llist_empty(&ctx->work_llist);
}
#endif
......@@ -81,15 +81,6 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
struct io_buffer_list *bl;
struct io_buffer *buf;
/*
* For legacy provided buffer mode, don't recycle if we already did
* IO to this buffer. For ring-mapped provided buffer mode, we should
* increment ring->head to explicitly monopolize the buffer to avoid
* multiple use.
*/
if (req->flags & REQ_F_PARTIAL_IO)
return false;
io_ring_submit_lock(ctx, issue_flags);
buf = req->kbuf;
......@@ -102,10 +93,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
return true;
}
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
{
unsigned int cflags;
/*
* We can add this buffer back to two lists:
*
......@@ -118,21 +107,17 @@ unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags)
* We migrate buffers from the comp_list to the issue cache list
* when we need one.
*/
if (req->flags & REQ_F_BUFFER_RING) {
/* no buffers to recycle for this case */
cflags = __io_put_kbuf_list(req, NULL);
} else if (issue_flags & IO_URING_F_UNLOCKED) {
if (issue_flags & IO_URING_F_UNLOCKED) {
struct io_ring_ctx *ctx = req->ctx;
spin_lock(&ctx->completion_lock);
cflags = __io_put_kbuf_list(req, &ctx->io_buffers_comp);
__io_put_kbuf_list(req, &ctx->io_buffers_comp);
spin_unlock(&ctx->completion_lock);
} else {
lockdep_assert_held(&req->ctx->uring_lock);
cflags = __io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
__io_put_kbuf_list(req, &req->ctx->io_buffers_cache);
}
return cflags;
}
static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
......@@ -145,6 +130,8 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len,
list_del(&kbuf->list);
if (*len == 0 || *len > kbuf->len)
*len = kbuf->len;
if (list_empty(&bl->buf_list))
req->flags |= REQ_F_BL_EMPTY;
req->flags |= REQ_F_BUFFER_SELECTED;
req->kbuf = kbuf;
req->buf_index = kbuf->bid;
......@@ -158,12 +145,16 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
unsigned int issue_flags)
{
struct io_uring_buf_ring *br = bl->buf_ring;
__u16 tail, head = bl->head;
struct io_uring_buf *buf;
__u16 head = bl->head;
if (unlikely(smp_load_acquire(&br->tail) == head))
tail = smp_load_acquire(&br->tail);
if (unlikely(tail == head))
return NULL;
if (head + 1 == tail)
req->flags |= REQ_F_BL_EMPTY;
head &= bl->mask;
/* mmaped buffers are always contig */
if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) {
......@@ -180,7 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->buf_list = bl;
req->buf_index = buf->bid;
if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
/*
* If we came in unlocked, we have no choice but to consume the
* buffer here, otherwise nothing ensures that the buffer won't
......
......@@ -57,7 +57,7 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg);
void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx);
unsigned int __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags);
bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags);
......@@ -73,21 +73,9 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req)
* to monopolize the buffer.
*/
if (req->buf_list) {
if (req->flags & REQ_F_PARTIAL_IO) {
/*
* If we end up here, then the io_uring_lock has
* been kept held since we retrieved the buffer.
* For the io-wq case, we already cleared
* req->buf_list when the buffer was retrieved,
* hence it cannot be set here for that case.
*/
req->buf_list->head++;
req->buf_list = NULL;
} else {
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
return true;
}
req->buf_index = req->buf_list->bgid;
req->flags &= ~REQ_F_BUFFER_RING;
return true;
}
return false;
}
......@@ -101,6 +89,8 @@ static inline bool io_do_buffer_select(struct io_kiocb *req)
static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
{
if (req->flags & REQ_F_BL_NO_RECYCLE)
return false;
if (req->flags & REQ_F_BUFFER_SELECTED)
return io_kbuf_recycle_legacy(req, issue_flags);
if (req->flags & REQ_F_BUFFER_RING)
......@@ -108,41 +98,54 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags)
return false;
}
static inline unsigned int __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
static inline void __io_put_kbuf_ring(struct io_kiocb *req)
{
unsigned int ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->buf_list->head++;
}
req->flags &= ~REQ_F_BUFFER_RING;
}
static inline void __io_put_kbuf_list(struct io_kiocb *req,
struct list_head *list)
{
if (req->flags & REQ_F_BUFFER_RING) {
if (req->buf_list) {
req->buf_index = req->buf_list->bgid;
req->buf_list->head++;
}
req->flags &= ~REQ_F_BUFFER_RING;
__io_put_kbuf_ring(req);
} else {
req->buf_index = req->kbuf->bgid;
list_add(&req->kbuf->list, list);
req->flags &= ~REQ_F_BUFFER_SELECTED;
}
return ret;
}
static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req)
{
unsigned int ret;
lockdep_assert_held(&req->ctx->completion_lock);
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
return 0;
return __io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
__io_put_kbuf_list(req, &req->ctx->io_buffers_comp);
return ret;
}
static inline unsigned int io_put_kbuf(struct io_kiocb *req,
unsigned issue_flags)
{
unsigned int ret;
if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)))
if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED)))
return 0;
return __io_put_kbuf(req, issue_flags);
ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT);
if (req->flags & REQ_F_BUFFER_RING)
__io_put_kbuf_ring(req);
else
__io_put_kbuf(req, issue_flags);
return ret;
}
#endif
// SPDX-License-Identifier: GPL-2.0
#include "io_uring.h"
#include "napi.h"
#ifdef CONFIG_NET_RX_BUSY_POLL
/* Timeout for cleanout of stale entries. */
#define NAPI_TIMEOUT (60 * SEC_CONVERSION)
struct io_napi_entry {
unsigned int napi_id;
struct list_head list;
unsigned long timeout;
struct hlist_node node;
struct rcu_head rcu;
};
static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list,
unsigned int napi_id)
{
struct io_napi_entry *e;
hlist_for_each_entry_rcu(e, hash_list, node) {
if (e->napi_id != napi_id)
continue;
e->timeout = jiffies + NAPI_TIMEOUT;
return e;
}
return NULL;
}
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock)
{
struct hlist_head *hash_list;
unsigned int napi_id;
struct sock *sk;
struct io_napi_entry *e;
sk = sock->sk;
if (!sk)
return;
napi_id = READ_ONCE(sk->sk_napi_id);
/* Non-NAPI IDs can be rejected. */
if (napi_id < MIN_NAPI_ID)
return;
hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
rcu_read_lock();
e = io_napi_hash_find(hash_list, napi_id);
if (e) {
e->timeout = jiffies + NAPI_TIMEOUT;
rcu_read_unlock();
return;
}
rcu_read_unlock();
e = kmalloc(sizeof(*e), GFP_NOWAIT);
if (!e)
return;
e->napi_id = napi_id;
e->timeout = jiffies + NAPI_TIMEOUT;
spin_lock(&ctx->napi_lock);
if (unlikely(io_napi_hash_find(hash_list, napi_id))) {
spin_unlock(&ctx->napi_lock);
kfree(e);
return;
}
hlist_add_tail_rcu(&e->node, hash_list);
list_add_tail(&e->list, &ctx->napi_list);
spin_unlock(&ctx->napi_lock);
}
static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
unsigned int i;
spin_lock(&ctx->napi_lock);
hash_for_each(ctx->napi_ht, i, e, node) {
if (time_after(jiffies, e->timeout)) {
list_del(&e->list);
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
}
spin_unlock(&ctx->napi_lock);
}
static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale)
{
if (is_stale)
__io_napi_remove_stale(ctx);
}
static inline bool io_napi_busy_loop_timeout(unsigned long start_time,
unsigned long bp_usec)
{
if (bp_usec) {
unsigned long end_time = start_time + bp_usec;
unsigned long now = busy_loop_current_time();
return time_after(now, end_time);
}
return true;
}
static bool io_napi_busy_loop_should_end(void *data,
unsigned long start_time)
{
struct io_wait_queue *iowq = data;
if (signal_pending(current))
return true;
if (io_should_wake(iowq) || io_has_work(iowq->ctx))
return true;
if (io_napi_busy_loop_timeout(start_time, iowq->napi_busy_poll_to))
return true;
return false;
}
static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
void *loop_end_arg)
{
struct io_napi_entry *e;
bool (*loop_end)(void *, unsigned long) = NULL;
bool is_stale = false;
if (loop_end_arg)
loop_end = io_napi_busy_loop_should_end;
list_for_each_entry_rcu(e, &ctx->napi_list, list) {
napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
if (time_after(jiffies, e->timeout))
is_stale = true;
}
return is_stale;
}
static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
unsigned long start_time = busy_loop_current_time();
void *loop_end_arg = NULL;
bool is_stale = false;
/* Singular lists use a different napi loop end check function and are
* only executed once.
*/
if (list_is_singular(&ctx->napi_list))
loop_end_arg = iowq;
rcu_read_lock();
do {
is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg);
} while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg);
rcu_read_unlock();
io_napi_remove_stale(ctx, is_stale);
}
/*
* io_napi_init() - Init napi settings
* @ctx: pointer to io-uring context structure
*
* Init napi settings in the io-uring context.
*/
void io_napi_init(struct io_ring_ctx *ctx)
{
INIT_LIST_HEAD(&ctx->napi_list);
spin_lock_init(&ctx->napi_lock);
ctx->napi_prefer_busy_poll = false;
ctx->napi_busy_poll_to = READ_ONCE(sysctl_net_busy_poll);
}
/*
* io_napi_free() - Deallocate napi
* @ctx: pointer to io-uring context structure
*
* Free the napi list and the hash table in the io-uring context.
*/
void io_napi_free(struct io_ring_ctx *ctx)
{
struct io_napi_entry *e;
LIST_HEAD(napi_list);
unsigned int i;
spin_lock(&ctx->napi_lock);
hash_for_each(ctx->napi_ht, i, e, node) {
hash_del_rcu(&e->node);
kfree_rcu(e, rcu);
}
spin_unlock(&ctx->napi_lock);
}
/*
* io_napi_register() - Register napi with io-uring
* @ctx: pointer to io-uring context structure
* @arg: pointer to io_uring_napi structure
*
* Register napi in the io-uring context.
*/
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
const struct io_uring_napi curr = {
.busy_poll_to = ctx->napi_busy_poll_to,
.prefer_busy_poll = ctx->napi_prefer_busy_poll
};
struct io_uring_napi napi;
if (copy_from_user(&napi, arg, sizeof(napi)))
return -EFAULT;
if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
return -EINVAL;
if (copy_to_user(arg, &curr, sizeof(curr)))
return -EFAULT;
WRITE_ONCE(ctx->napi_busy_poll_to, napi.busy_poll_to);
WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
WRITE_ONCE(ctx->napi_enabled, true);
return 0;
}
/*
* io_napi_unregister() - Unregister napi with io-uring
* @ctx: pointer to io-uring context structure
* @arg: pointer to io_uring_napi structure
*
* Unregister napi. If arg has been specified copy the busy poll timeout and
* prefer busy poll setting to the passed in structure.
*/
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
const struct io_uring_napi curr = {
.busy_poll_to = ctx->napi_busy_poll_to,
.prefer_busy_poll = ctx->napi_prefer_busy_poll
};
if (arg && copy_to_user(arg, &curr, sizeof(curr)))
return -EFAULT;
WRITE_ONCE(ctx->napi_busy_poll_to, 0);
WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
WRITE_ONCE(ctx->napi_enabled, false);
return 0;
}
/*
* __io_napi_adjust_timeout() - Add napi id to the busy poll list
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
* @ts: pointer to timespec or NULL
*
* Adjust the busy loop timeout according to timespec and busy poll timeout.
*/
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq,
struct timespec64 *ts)
{
unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to);
if (ts) {
struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to);
if (timespec64_compare(ts, &poll_to_ts) > 0) {
*ts = timespec64_sub(*ts, poll_to_ts);
} else {
u64 to = timespec64_to_ns(ts);
do_div(to, 1000);
ts->tv_sec = 0;
ts->tv_nsec = 0;
}
}
iowq->napi_busy_poll_to = poll_to;
}
/*
* __io_napi_busy_loop() - execute busy poll loop
* @ctx: pointer to io-uring context structure
* @iowq: pointer to io wait queue
*
* Execute the busy poll loop and merge the spliced off list.
*/
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq)
{
iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll);
if (!(ctx->flags & IORING_SETUP_SQPOLL) && ctx->napi_enabled)
io_napi_blocking_busy_loop(ctx, iowq);
}
/*
* io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll
* @ctx: pointer to io-uring context structure
*
* Splice of the napi list and execute the napi busy poll loop.
*/
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
LIST_HEAD(napi_list);
bool is_stale = false;
if (!READ_ONCE(ctx->napi_busy_poll_to))
return 0;
if (list_empty_careful(&ctx->napi_list))
return 0;
rcu_read_lock();
is_stale = __io_napi_do_busy_loop(ctx, NULL);
rcu_read_unlock();
io_napi_remove_stale(ctx, is_stale);
return 1;
}
#endif
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOU_NAPI_H
#define IOU_NAPI_H
#include <linux/kernel.h>
#include <linux/io_uring.h>
#include <net/busy_poll.h>
#ifdef CONFIG_NET_RX_BUSY_POLL
void io_napi_init(struct io_ring_ctx *ctx);
void io_napi_free(struct io_ring_ctx *ctx);
int io_register_napi(struct io_ring_ctx *ctx, void __user *arg);
int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg);
void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock);
void __io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq, struct timespec64 *ts);
void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq);
int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx);
static inline bool io_napi(struct io_ring_ctx *ctx)
{
return !list_empty(&ctx->napi_list);
}
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
struct timespec64 *ts)
{
if (!io_napi(ctx))
return;
__io_napi_adjust_timeout(ctx, iowq, ts);
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
if (!io_napi(ctx))
return;
__io_napi_busy_loop(ctx, iowq);
}
/*
* io_napi_add() - Add napi id to the busy poll list
* @req: pointer to io_kiocb request
*
* Add the napi id of the socket to the napi busy poll list and hash table.
*/
static inline void io_napi_add(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
struct socket *sock;
if (!READ_ONCE(ctx->napi_busy_poll_to))
return;
sock = sock_from_file(req->file);
if (sock)
__io_napi_add(ctx, sock);
}
#else
static inline void io_napi_init(struct io_ring_ctx *ctx)
{
}
static inline void io_napi_free(struct io_ring_ctx *ctx)
{
}
static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
{
return -EOPNOTSUPP;
}
static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
{
return -EOPNOTSUPP;
}
static inline bool io_napi(struct io_ring_ctx *ctx)
{
return false;
}
static inline void io_napi_add(struct io_kiocb *req)
{
}
static inline void io_napi_adjust_timeout(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
struct timespec64 *ts)
{
}
static inline void io_napi_busy_loop(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq)
{
}
static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx)
{
return 0;
}
#endif /* CONFIG_NET_RX_BUSY_POLL */
#endif
This diff is collapsed.
......@@ -35,6 +35,7 @@
#include "rw.h"
#include "waitid.h"
#include "futex.h"
#include "truncate.h"
static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags)
{
......@@ -474,6 +475,12 @@ const struct io_issue_def io_issue_defs[] = {
.prep = io_install_fixed_fd_prep,
.issue = io_install_fixed_fd,
},
[IORING_OP_FTRUNCATE] = {
.needs_file = 1,
.hash_reg_file = 1,
.prep = io_ftruncate_prep,
.issue = io_ftruncate,
},
};
const struct io_cold_def io_cold_defs[] = {
......@@ -712,6 +719,9 @@ const struct io_cold_def io_cold_defs[] = {
[IORING_OP_FIXED_FD_INSTALL] = {
.name = "FIXED_FD_INSTALL",
},
[IORING_OP_FTRUNCATE] = {
.name = "FTRUNCATE",
},
};
const char *io_uring_get_opcode(u8 opcode)
......
......@@ -15,6 +15,7 @@
#include "io_uring.h"
#include "refs.h"
#include "napi.h"
#include "opdef.h"
#include "kbuf.h"
#include "poll.h"
......@@ -343,8 +344,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
IO_POLL_REF_MASK);
v &= IO_POLL_REF_MASK;
} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);
return IOU_POLL_NO_ACTION;
}
......@@ -539,14 +540,6 @@ static void __io_queue_proc(struct io_poll *poll, struct io_poll_table *pt,
poll->wait.private = (void *) wqe_private;
if (poll->events & EPOLLEXCLUSIVE) {
/*
* Exclusive waits may only wake a limited amount of entries
* rather than all of them, this may interfere with lazy
* wake if someone does wait(events > 1). Ensure we don't do
* lazy wake for those, as we need to process each one as they
* come in.
*/
req->flags |= REQ_F_POLL_NO_LAZY;
add_wait_queue_exclusive(head, &poll->wait);
} else {
add_wait_queue(head, &poll->wait);
......@@ -588,10 +581,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
struct io_poll_table *ipt, __poll_t mask,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
io_init_poll_iocb(poll, mask);
poll->file = req->file;
req->apoll_events = poll->events;
......@@ -618,6 +608,17 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (issue_flags & IO_URING_F_UNLOCKED)
req->flags &= ~REQ_F_HASH_LOCKED;
/*
* Exclusive waits may only wake a limited amount of entries
* rather than all of them, this may interfere with lazy
* wake if someone does wait(events > 1). Ensure we don't do
* lazy wake for those, as we need to process each one as they
* come in.
*/
if (poll->events & EPOLLEXCLUSIVE)
req->flags |= REQ_F_POLL_NO_LAZY;
mask = vfs_poll(req->file, &ipt->pt) & poll->events;
if (unlikely(ipt->error || !ipt->nr_entries)) {
......@@ -652,6 +653,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
__io_poll_execute(req, mask);
return 0;
}
io_napi_add(req);
if (ipt->owning) {
/*
......@@ -727,7 +729,7 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags)
if (!def->pollin && !def->pollout)
return IO_APOLL_ABORTED;
if (!file_can_poll(req->file))
if (!io_file_can_poll(req))
return IO_APOLL_ABORTED;
if (!(req->flags & REQ_F_APOLL_MULTISHOT))
mask |= EPOLLONESHOT;
......@@ -818,9 +820,8 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
if (poll_only && req->opcode != IORING_OP_POLL_ADD)
continue;
if (cd->flags & IORING_ASYNC_CANCEL_ALL) {
if (cd->seq == req->work.cancel_seq)
if (io_cancel_match_sequence(req, cd->seq))
continue;
req->work.cancel_seq = cd->seq;
}
*out_bucket = hb;
return req;
......
......@@ -26,6 +26,7 @@
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \
IORING_REGISTER_LAST + IORING_OP_LAST)
......@@ -550,6 +551,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_register_pbuf_status(ctx, arg);
break;
case IORING_REGISTER_NAPI:
ret = -EINVAL;
if (!arg || nr_args != 1)
break;
ret = io_register_napi(ctx, arg);
break;
case IORING_UNREGISTER_NAPI:
ret = -EINVAL;
if (nr_args != 1)
break;
ret = io_unregister_napi(ctx, arg);
break;
default:
ret = -EINVAL;
break;
......
......@@ -2,8 +2,6 @@
#ifndef IOU_RSRC_H
#define IOU_RSRC_H
#include <net/af_unix.h>
#include "alloc_cache.h"
#define IO_NODE_ALLOC_CACHE_MAX 32
......
......@@ -11,6 +11,7 @@
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring/cmd.h>
#include <linux/indirect_call_wrapper.h>
#include <uapi/linux/io_uring.h>
......@@ -274,7 +275,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res)
* current cycle.
*/
io_req_io_end(req);
req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
return true;
}
req_set_fail(req);
......@@ -341,7 +342,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
io_req_end_write(req);
if (unlikely(res != req->cqe.res)) {
if (res == -EAGAIN && io_rw_should_reissue(req)) {
req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE;
return;
}
req->cqe.res = res;
......@@ -682,7 +683,7 @@ static bool io_rw_should_retry(struct io_kiocb *req)
* just use poll if we can, and don't attempt if the fs doesn't
* support callback based unlocks
*/
if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
if (io_file_can_poll(req) || !(req->file->f_mode & FMODE_BUF_RASYNC))
return false;
wait->wait.func = io_async_buf_func;
......@@ -721,7 +722,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
struct file *file = req->file;
int ret;
if (unlikely(!file || !(file->f_mode & mode)))
if (unlikely(!(file->f_mode & mode)))
return -EBADF;
if (!(req->flags & REQ_F_FIXED_FILE))
......@@ -831,7 +832,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* If we can poll, just do that. For a vectored read, we'll
* need to copy state first.
*/
if (file_can_poll(req->file) && !io_issue_defs[req->opcode].vectored)
if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored)
return -EAGAIN;
/* IOPOLL retry should happen for io-wq threads */
if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
......@@ -930,7 +931,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags)
/*
* Multishot MUST be used on a pollable file
*/
if (!file_can_poll(req->file))
if (!io_file_can_poll(req))
return -EBADFD;
ret = __io_read(req, issue_flags);
......
......@@ -15,9 +15,11 @@
#include <uapi/linux/io_uring.h>
#include "io_uring.h"
#include "napi.h"
#include "sqpoll.h"
#define IORING_SQPOLL_CAP_ENTRIES_VALUE 8
#define IORING_TW_CAP_ENTRIES_VALUE 8
enum {
IO_SQ_THREAD_SHOULD_STOP = 0,
......@@ -193,6 +195,9 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
ret = io_submit_sqes(ctx, to_submit);
mutex_unlock(&ctx->uring_lock);
if (io_napi(ctx))
ret += io_napi_sqpoll_busy_poll(ctx);
if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))
wake_up(&ctx->sqo_sq_wait);
if (creds)
......@@ -219,10 +224,52 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd)
return did_sig || test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
}
/*
* Run task_work, processing the retry_list first. The retry_list holds
* entries that we passed on in the previous run, if we had more task_work
* than we were asked to process. Newly queued task_work isn't run until the
* retry list has been fully processed.
*/
static unsigned int io_sq_tw(struct llist_node **retry_list, int max_entries)
{
struct io_uring_task *tctx = current->io_uring;
unsigned int count = 0;
if (*retry_list) {
*retry_list = io_handle_tw_list(*retry_list, &count, max_entries);
if (count >= max_entries)
return count;
max_entries -= count;
}
*retry_list = tctx_task_work_run(tctx, max_entries, &count);
return count;
}
static bool io_sq_tw_pending(struct llist_node *retry_list)
{
struct io_uring_task *tctx = current->io_uring;
return retry_list || !llist_empty(&tctx->task_list);
}
static void io_sq_update_worktime(struct io_sq_data *sqd, struct rusage *start)
{
struct rusage end;
getrusage(current, RUSAGE_SELF, &end);
end.ru_stime.tv_sec -= start->ru_stime.tv_sec;
end.ru_stime.tv_usec -= start->ru_stime.tv_usec;
sqd->work_time += end.ru_stime.tv_usec + end.ru_stime.tv_sec * 1000000;
}
static int io_sq_thread(void *data)
{
struct llist_node *retry_list = NULL;
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
struct rusage start;
unsigned long timeout = 0;
char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
......@@ -251,18 +298,21 @@ static int io_sq_thread(void *data)
}
cap_entries = !list_is_singular(&sqd->ctx_list);
getrusage(current, RUSAGE_SELF, &start);
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
int ret = __io_sq_thread(ctx, cap_entries);
if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list)))
sqt_spin = true;
}
if (io_run_task_work())
if (io_sq_tw(&retry_list, IORING_TW_CAP_ENTRIES_VALUE))
sqt_spin = true;
if (sqt_spin || !time_after(jiffies, timeout)) {
if (sqt_spin)
if (sqt_spin) {
io_sq_update_worktime(sqd, &start);
timeout = jiffies + sqd->sq_thread_idle;
}
if (unlikely(need_resched())) {
mutex_unlock(&sqd->lock);
cond_resched();
......@@ -273,7 +323,7 @@ static int io_sq_thread(void *data)
}
prepare_to_wait(&sqd->wait, &wait, TASK_INTERRUPTIBLE);
if (!io_sqd_events_pending(sqd) && !task_work_pending(current)) {
if (!io_sqd_events_pending(sqd) && !io_sq_tw_pending(retry_list)) {
bool needs_sched = true;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) {
......@@ -312,6 +362,9 @@ static int io_sq_thread(void *data)
timeout = jiffies + sqd->sq_thread_idle;
}
if (retry_list)
io_sq_tw(&retry_list, UINT_MAX);
io_uring_cancel_generic(true, sqd);
sqd->thread = NULL;
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
......
......@@ -16,6 +16,7 @@ struct io_sq_data {
pid_t task_pid;
pid_t task_tgid;
u64 work_time;
unsigned long state;
struct completion exited;
};
......
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "../fs/internal.h"
#include "io_uring.h"
#include "truncate.h"
struct io_ftrunc {
struct file *file;
loff_t len;
};
int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc);
if (sqe->rw_flags || sqe->addr || sqe->len || sqe->buf_index ||
sqe->splice_fd_in || sqe->addr3)
return -EINVAL;
ft->len = READ_ONCE(sqe->off);
req->flags |= REQ_F_FORCE_ASYNC;
return 0;
}
int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags)
{
struct io_ftrunc *ft = io_kiocb_to_cmd(req, struct io_ftrunc);
int ret;
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_ftruncate(req->file, ft->len, 1);
io_req_set_res(req, ret, 0);
return IOU_OK;
}
// SPDX-License-Identifier: GPL-2.0
int io_ftruncate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags);
......@@ -5,6 +5,7 @@
#include <linux/io_uring/cmd.h>
#include <linux/security.h>
#include <linux/nospec.h>
#include <net/sock.h>
#include <uapi/linux/io_uring.h>
#include <asm/ioctls.h>
......
......@@ -112,7 +112,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK);
ret = do_getxattr(mnt_idmap(req->file->f_path.mnt),
ret = do_getxattr(file_mnt_idmap(req->file),
req->file->f_path.dentry,
&ix->ctx);
......
......@@ -6177,8 +6177,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
u16 budget)
enum {
NAPI_F_PREFER_BUSY_POLL = 1,
NAPI_F_END_ON_RESCHED = 2,
};
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
unsigned flags, u16 budget)
{
bool skip_schedule = false;
unsigned long timeout;
......@@ -6198,7 +6203,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
local_bh_disable();
if (prefer_busy_poll) {
if (flags & NAPI_F_PREFER_BUSY_POLL) {
napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
timeout = READ_ONCE(napi->dev->gro_flush_timeout);
if (napi->defer_hard_irqs_count && timeout) {
......@@ -6222,23 +6227,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
static void __napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, unsigned flags, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
void *have_poll_lock = NULL;
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
restart:
napi_poll = NULL;
rcu_read_lock();
napi = napi_by_id(napi_id);
if (!napi)
goto out;
return;
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_disable();
......@@ -6254,14 +6259,14 @@ void napi_busy_loop(unsigned int napi_id,
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
NAPIF_STATE_IN_BUSY_POLL)) {
if (prefer_busy_poll)
if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
NAPIF_STATE_SCHED) != val) {
if (prefer_busy_poll)
if (flags & NAPI_F_PREFER_BUSY_POLL)
set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
}
......@@ -6281,12 +6286,15 @@ void napi_busy_loop(unsigned int napi_id,
break;
if (unlikely(need_resched())) {
if (flags & NAPI_F_END_ON_RESCHED)
break;
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
rcu_read_unlock();
cond_resched();
rcu_read_lock();
if (loop_end(loop_end_arg, start_time))
return;
goto restart;
......@@ -6294,10 +6302,31 @@ void napi_busy_loop(unsigned int napi_id,
cpu_relax();
}
if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
busy_poll_stop(napi, have_poll_lock, flags, budget);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
preempt_enable();
out:
}
void napi_busy_loop_rcu(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned flags = NAPI_F_END_ON_RESCHED;
if (prefer_busy_poll)
flags |= NAPI_F_PREFER_BUSY_POLL;
__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
rcu_read_lock();
__napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment