Commit edafccee authored by Jens Axboe

io_uring: add support for pre-mapped user IO buffers

If we have fixed user buffers, we can map them into the kernel when we
setup the io_uring. That avoids the need to do get_user_pages() for
each and every IO.

To utilize this feature, the application must call io_uring_register()
after having setup an io_uring instance, passing in
IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to
an iovec array, and the nr_args should contain how many iovecs the
application wishes to map.

If successful, these buffers are now mapped into the kernel, eligible
for IO. To use these fixed buffers, the application must use the
IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then
set sqe->index to the desired buffer index. sqe->addr..sqe->addr+sqe->len
must point to somewhere inside the indexed buffer.

The application may register buffers throughout the lifetime of the
io_uring instance. It can call io_uring_register() with
IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of
buffers, and then register a new set. The application need not
unregister buffers explicitly before shutting down the io_uring
instance.

It's perfectly valid to setup a larger buffer, and then sometimes only
use parts of it for an IO. As long as the range is within the originally
mapped region, it will work just fine.

For now, buffers must not be file backed. If file backed buffers are
passed in, the registration will fail with -1/EOPNOTSUPP. This
restriction may be relaxed in the future.

RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat
arbitrary 1G per buffer size is also imposed.
Reviewed-by: Hannes Reinecke <hare@suse.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 6d0c48ae
...@@ -400,3 +400,4 @@ ...@@ -400,3 +400,4 @@
386 i386 rseq sys_rseq __ia32_sys_rseq 386 i386 rseq sys_rseq __ia32_sys_rseq
425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register
...@@ -345,6 +345,7 @@ ...@@ -345,6 +345,7 @@
334 common rseq __x64_sys_rseq 334 common rseq __x64_sys_rseq
425 common io_uring_setup __x64_sys_io_uring_setup 425 common io_uring_setup __x64_sys_io_uring_setup
426 common io_uring_enter __x64_sys_io_uring_enter 426 common io_uring_enter __x64_sys_io_uring_enter
427 common io_uring_register __x64_sys_io_uring_register
# #
# x32-specific system call numbers start at 512 to avoid cache impact # x32-specific system call numbers start at 512 to avoid cache impact
......
This diff is collapsed.
...@@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries, ...@@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries,
asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit,
u32 min_complete, u32 flags, u32 min_complete, u32 flags,
const sigset_t __user *sig, size_t sigsz); const sigset_t __user *sig, size_t sigsz);
asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op,
void __user *arg, unsigned int nr_args);
/* fs/xattr.c */ /* fs/xattr.c */
asmlinkage long sys_setxattr(const char __user *path, const char __user *name, asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
......
...@@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) ...@@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load)
__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
#define __NR_io_uring_enter 426 #define __NR_io_uring_enter 426
__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter)
#define __NR_io_uring_register 427
__SYSCALL(__NR_io_uring_register, sys_io_uring_register)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 427 #define __NR_syscalls 428
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different
......
...@@ -27,7 +27,10 @@ struct io_uring_sqe { ...@@ -27,7 +27,10 @@ struct io_uring_sqe {
__u32 fsync_flags; __u32 fsync_flags;
}; };
__u64 user_data; /* data to be passed back at completion time */ __u64 user_data; /* data to be passed back at completion time */
__u64 __pad2[3]; union {
__u16 buf_index; /* index into fixed buffers, if used */
__u64 __pad2[3];
};
}; };
/* /*
...@@ -39,6 +42,8 @@ struct io_uring_sqe { ...@@ -39,6 +42,8 @@ struct io_uring_sqe {
#define IORING_OP_READV 1 #define IORING_OP_READV 1
#define IORING_OP_WRITEV 2 #define IORING_OP_WRITEV 2
#define IORING_OP_FSYNC 3 #define IORING_OP_FSYNC 3
#define IORING_OP_READ_FIXED 4
#define IORING_OP_WRITE_FIXED 5
/* /*
* sqe->fsync_flags * sqe->fsync_flags
...@@ -103,4 +108,10 @@ struct io_uring_params { ...@@ -103,4 +108,10 @@ struct io_uring_params {
struct io_cqring_offsets cq_off; struct io_cqring_offsets cq_off;
}; };
/*
* io_uring_register(2) opcodes and arguments
*/
#define IORING_REGISTER_BUFFERS 0
#define IORING_UNREGISTER_BUFFERS 1
#endif #endif
...@@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents); ...@@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents);
COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL_COMPAT(io_pgetevents);
COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_setup);
COND_SYSCALL(io_uring_enter); COND_SYSCALL(io_uring_enter);
COND_SYSCALL(io_uring_register);
/* fs/xattr.c */ /* fs/xattr.c */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment