Commit a2d79c71 authored by Linus Torvalds

Merge tag 'for-5.3/io_uring-20190711' of git://git.kernel.dk/linux-block

Pull io_uring updates from Jens Axboe:
 "This contains:

   - Support for recvmsg/sendmsg as first-class opcodes (see the recvmsg
     sketch after the shortlog below).

     I don't envision going much further down this path, as there are
     plans in progress to support potentially any system call in an
     async fashion through io_uring. But I think it does make sense to
     have certain core ops available directly, especially those that can
     support a "try this non-blocking" flag/mode. (me)

   - Handle generic short reads automatically.

     This can happen fairly easily if part of a buffered read is already
     cached. Since the application would otherwise have to issue another
     request for the remainder, just do this internally and save a
     kernel/user roundtrip, while providing a nicer, more robust API. (me)

   - Support for linked SQEs.

     This allows SQEs to depend on each other, enabling an application
     to e.g. queue a read-from-this-file, write-to-that-file pair (a
     liburing sketch follows this message). (me)

   - Fix race in stopping SQ thread (Jackie)"
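
For illustration, here is a minimal userspace sketch of the linked-SQE
feature described above: a read followed by a write of the same buffer,
chained with IOSQE_IO_LINK so the write is only issued after the read
completes. It assumes a liburing build that already carries the
IOSQE_IO_LINK definition added by this series; the descriptor names,
block size and helper function are illustrative, not part of the kernel
changes.

/*
 * Sketch only: copy the first BLK bytes of infd to outfd using two
 * linked SQEs. Assumes liburing headers that expose IOSQE_IO_LINK.
 */
#include <liburing.h>
#include <sys/uio.h>
#include <stdio.h>

#define BLK 4096

static int copy_block(int infd, int outfd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	static char buf[BLK];
	struct iovec iov = { .iov_base = buf, .iov_len = BLK };
	int i, ret;

	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return ret;

	/* first SQE: read from infd ... */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_readv(sqe, infd, &iov, 1, 0);
	sqe->flags |= IOSQE_IO_LINK;	/* ... and only then ... */

	/* second SQE: write the same buffer to outfd */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_writev(sqe, outfd, &iov, 1, 0);

	ret = io_uring_submit(&ring);
	if (ret < 0)
		goto out;

	/* reap both completions */
	for (i = 0; i < 2; i++) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret < 0)
			break;
		if (cqe->res < 0)
			fprintf(stderr, "sqe %d failed: %d\n", i, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
out:
	io_uring_queue_exit(&ring);
	return ret;
}

If the read errors out or comes up short, the kernel fails the rest of
the chain, so the linked write completes with -ECANCELED instead of
writing stale data (see io_fail_links() in the diff below).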

* tag 'for-5.3/io_uring-20190711' of git://git.kernel.dk/linux-block:
  io_uring: fix io_sq_thread_stop running in front of io_sq_thread
  io_uring: add support for recvmsg()
  io_uring: add support for sendmsg()
  io_uring: add support for sqe links
  io_uring: punt short reads to async context
  uio: make import_iovec()/compat_import_iovec() return bytes on success
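
The new SENDMSG/RECVMSG opcodes reuse the sendmsg(2)/recvmsg(2) msghdr
layout: the kernel reads the msghdr pointer from sqe->addr and the MSG_*
flags from the new sqe->msg_flags field (see the io_send_recvmsg() hunk
below). The following is a hedged sketch of issuing a single
IORING_OP_RECVMSG by filling the sqe fields by hand on top of liburing
ring setup; the function name, buffer size and socket fd are
illustrative.

/*
 * Sketch: receive one message via IORING_OP_RECVMSG, filling the sqe
 * fields the new kernel code reads (opcode, fd, addr -> msghdr,
 * msg_flags). Assumes headers that define the new opcode and field.
 */
#include <liburing.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>
#include <errno.h>

static int recv_one(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;
	static char buf[512];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg;
	int ret;

	if (!sqe)
		return -EAGAIN;		/* submission queue full */

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_RECVMSG;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) &msg;	/* cast to user_msghdr in-kernel */
	sqe->msg_flags = 0;			/* MSG_* flags, e.g. MSG_DONTWAIT */
	sqe->user_data = 1;

	ret = io_uring_submit(ring);
	if (ret < 0)
		return ret;

	ret = io_uring_wait_cqe(ring, &cqe);
	if (ret < 0)
		return ret;
	ret = cqe->res;				/* bytes received, or -errno */
	io_uring_cqe_seen(ring, cqe);
	return ret;
}

A liburing release that wraps these opcodes can replace the manual field
setup with io_uring_prep_recvmsg()/io_uring_prep_sendmsg().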
parents 964a4eac a4c0b3de

--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1479,8 +1479,9 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
 	return 0;
 }
 
-static int aio_setup_rw(int rw, const struct iocb *iocb, struct iovec **iovec,
-		bool vectored, bool compat, struct iov_iter *iter)
+static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
+		struct iovec **iovec, bool vectored, bool compat,
+		struct iov_iter *iter)
 {
 	void __user *buf = (void __user *)(uintptr_t)iocb->aio_buf;
 	size_t len = iocb->aio_nbytes;
@@ -1537,7 +1538,7 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(READ, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(READ, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret)
@@ -1565,7 +1566,7 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return -EINVAL;
 
 	ret = aio_setup_rw(WRITE, iocb, &iovec, vectored, compat, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {

--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -231,6 +231,7 @@ struct io_ring_ctx {
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
+	struct completion	sqo_thread_started;
 
 	struct {
 		/* CQ ring */
@@ -322,6 +323,7 @@ struct io_kiocb {
 	struct io_ring_ctx	*ctx;
 	struct list_head	list;
+	struct list_head	link_list;
 	unsigned int		flags;
 	refcount_t		refs;
 #define REQ_F_NOWAIT		1	/* must not punt to workers */
@@ -330,8 +332,10 @@ struct io_kiocb {
 #define REQ_F_SEQ_PREV		8	/* sequential with previous */
 #define REQ_F_IO_DRAIN		16	/* drain existing IO first */
 #define REQ_F_IO_DRAINED	32	/* drain done */
+#define REQ_F_LINK		64	/* linked sqes */
+#define REQ_F_FAIL_LINK		128	/* fail rest of links */
 	u64			user_data;
-	u32			error;	/* iopoll result from callback */
+	u32			result;
 	u32			sequence;
 
 	struct work_struct	work;
@@ -403,6 +407,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	ctx->flags = p->flags;
 	init_waitqueue_head(&ctx->cq_wait);
 	init_completion(&ctx->ctx_done);
+	init_completion(&ctx->sqo_thread_started);
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	for (i = 0; i < ARRAY_SIZE(ctx->pending_async); i++) {
@@ -584,6 +589,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
 	req->flags = 0;
 	/* one is dropped after submission, the other at completion */
 	refcount_set(&req->refs, 2);
+	req->result = 0;
 	return req;
 out:
 	io_ring_drop_ctx_refs(ctx, 1);
@@ -599,7 +605,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr)
 	}
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void __io_free_req(struct io_kiocb *req)
 {
 	if (req->file && !(req->flags & REQ_F_FIXED_FILE))
 		fput(req->file);
@@ -607,6 +613,63 @@ static void io_free_req(struct io_kiocb *req)
 	kmem_cache_free(req_cachep, req);
 }
 
+static void io_req_link_next(struct io_kiocb *req)
+{
+	struct io_kiocb *nxt;
+
+	/*
+	 * The list should never be empty when we are called here. But could
+	 * potentially happen if the chain is messed up, check to be on the
+	 * safe side.
+	 */
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	if (nxt) {
+		list_del(&nxt->list);
+		if (!list_empty(&req->link_list)) {
+			INIT_LIST_HEAD(&nxt->link_list);
+			list_splice(&req->link_list, &nxt->link_list);
+			nxt->flags |= REQ_F_LINK;
+		}
+
+		INIT_WORK(&nxt->work, io_sq_wq_submit_work);
+		queue_work(req->ctx->sqo_wq, &nxt->work);
+	}
+}
+
+/*
+ * Called if REQ_F_LINK is set, and we fail the head request
+ */
+static void io_fail_links(struct io_kiocb *req)
+{
+	struct io_kiocb *link;
+
+	while (!list_empty(&req->link_list)) {
+		link = list_first_entry(&req->link_list, struct io_kiocb, list);
+		list_del(&link->list);
+
+		io_cqring_add_event(req->ctx, link->user_data, -ECANCELED);
+		__io_free_req(link);
+	}
+}
+
+static void io_free_req(struct io_kiocb *req)
+{
+	/*
+	 * If LINK is set, we have dependent requests in this chain. If we
+	 * didn't fail this request, queue the first one up, moving any other
+	 * dependencies to the next request. In case of failure, fail the rest
+	 * of the chain.
+	 */
+	if (req->flags & REQ_F_LINK) {
+		if (req->flags & REQ_F_FAIL_LINK)
+			io_fail_links(req);
+		else
+			io_req_link_next(req);
+	}
+
+	__io_free_req(req);
+}
+
 static void io_put_req(struct io_kiocb *req)
 {
 	if (refcount_dec_and_test(&req->refs))
@@ -628,16 +691,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		req = list_first_entry(done, struct io_kiocb, list);
 		list_del(&req->list);
 
-		io_cqring_fill_event(ctx, req->user_data, req->error);
+		io_cqring_fill_event(ctx, req->user_data, req->result);
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs)) {
 			/* If we're not using fixed files, we have to pair the
 			 * completion part with the file put. Use regular
 			 * completions for those, only batch free for fixed
-			 * file.
+			 * file and non-linked commands.
 			 */
-			if (req->flags & REQ_F_FIXED_FILE) {
+			if ((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
+			    REQ_F_FIXED_FILE) {
 				reqs[to_free++] = req;
 				if (to_free == ARRAY_SIZE(reqs))
 					io_free_req_many(ctx, reqs, &to_free);
@@ -776,6 +840,8 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, req->user_data, res);
 	io_put_req(req);
 }
@@ -786,7 +852,9 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
 
 	kiocb_end_write(kiocb);
 
-	req->error = res;
+	if ((req->flags & REQ_F_LINK) && res != req->result)
+		req->flags |= REQ_F_FAIL_LINK;
+	req->result = res;
 	if (res != -EAGAIN)
 		req->flags |= REQ_F_IOPOLL_COMPLETED;
 }
@@ -929,7 +997,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s,
 		    !kiocb->ki_filp->f_op->iopoll)
 			return -EOPNOTSUPP;
 
-		req->error = 0;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 	} else {
@@ -1001,9 +1068,9 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
 	return 0;
 }
 
-static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
-			   const struct sqe_submit *s, struct iovec **iovec,
-			   struct iov_iter *iter)
+static ssize_t io_import_iovec(struct io_ring_ctx *ctx, int rw,
+			       const struct sqe_submit *s, struct iovec **iovec,
+			       struct iov_iter *iter)
 {
 	const struct io_uring_sqe *sqe = s->sqe;
 	void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -1021,7 +1088,7 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw,
 	opcode = READ_ONCE(sqe->opcode);
 	if (opcode == IORING_OP_READ_FIXED ||
 	    opcode == IORING_OP_WRITE_FIXED) {
-		int ret = io_import_fixed(ctx, rw, sqe, iter);
+		ssize_t ret = io_import_fixed(ctx, rw, sqe, iter);
 		*iovec = NULL;
 		return ret;
 	}
@@ -1087,7 +1154,7 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t read_size, ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1100,16 +1167,30 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
+	read_size = ret;
+	if (req->flags & REQ_F_LINK)
+		req->result = read_size;
 
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
 		ssize_t ret2;
 
-		/* Catch -EAGAIN return for forced non-blocking submission */
 		ret2 = call_read_iter(file, kiocb, &iter);
+		/*
+		 * In case of a short read, punt to async. This can happen
+		 * if we have data partially cached. Alternatively we can
+		 * return the short read, in which case the application will
+		 * need to issue another SQE and wait for it. That SQE will
+		 * need async punt anyway, so it's more efficient to do it
+		 * here.
+		 */
+		if (force_nonblock && ret2 > 0 && ret2 < read_size)
+			ret2 = -EAGAIN;
+
+		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || ret2 != -EAGAIN) {
 			io_rw_done(kiocb, ret2);
 		} else {
@@ -1134,7 +1215,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	struct iov_iter iter;
 	struct file *file;
 	size_t iov_count;
-	int ret;
+	ssize_t ret;
 
 	ret = io_prep_rw(req, s, force_nonblock);
 	if (ret)
@@ -1147,9 +1228,12 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s,
 		return -EINVAL;
 
 	ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter);
-	if (ret)
+	if (ret < 0)
 		return ret;
 
+	if (req->flags & REQ_F_LINK)
+		req->result = ret;
+
 	iov_count = iov_iter_count(&iter);
 
 	ret = -EAGAIN;
@@ -1253,6 +1337,8 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
@@ -1297,11 +1383,70 @@ static int io_sync_file_range(struct io_kiocb *req,
 
 	ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags);
 
+	if (ret < 0 && (req->flags & REQ_F_LINK))
+		req->flags |= REQ_F_FAIL_LINK;
 	io_cqring_add_event(req->ctx, sqe->user_data, ret);
 	io_put_req(req);
 	return 0;
 }
 
+#if defined(CONFIG_NET)
+static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   bool force_nonblock,
+			   long (*fn)(struct socket *, struct user_msghdr __user *,
+				      unsigned int))
+{
+	struct socket *sock;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+
+	sock = sock_from_file(req->file, &ret);
+	if (sock) {
+		struct user_msghdr __user *msg;
+		unsigned flags;
+
+		flags = READ_ONCE(sqe->msg_flags);
+		if (flags & MSG_DONTWAIT)
+			req->flags |= REQ_F_NOWAIT;
+		else if (force_nonblock)
+			flags |= MSG_DONTWAIT;
+
+		msg = (struct user_msghdr __user *) (unsigned long)
+			READ_ONCE(sqe->addr);
+
+		ret = fn(sock, msg, flags);
+		if (force_nonblock && ret == -EAGAIN)
+			return ret;
+	}
+
+	io_cqring_add_event(req->ctx, sqe->user_data, ret);
+	io_put_req(req);
+	return 0;
+}
+#endif
+
+static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_sendmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
+static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+		      bool force_nonblock)
+{
+#if defined(CONFIG_NET)
+	return io_send_recvmsg(req, sqe, force_nonblock, __sys_recvmsg_sock);
+#else
+	return -EOPNOTSUPP;
+#endif
+}
+
 static void io_poll_remove_one(struct io_kiocb *req)
 {
 	struct io_poll_iocb *poll = &req->poll;
@@ -1549,9 +1694,10 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 {
 	int ret, opcode;
 
+	req->user_data = READ_ONCE(s->sqe->user_data);
+
 	if (unlikely(s->index >= ctx->sq_entries))
 		return -EINVAL;
-	req->user_data = READ_ONCE(s->sqe->user_data);
 
 	opcode = READ_ONCE(s->sqe->opcode);
 	switch (opcode) {
@@ -1586,6 +1732,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	case IORING_OP_SYNC_FILE_RANGE:
 		ret = io_sync_file_range(req, s->sqe, force_nonblock);
 		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg(req, s->sqe, force_nonblock);
+		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg(req, s->sqe, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1595,7 +1747,7 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
 		return ret;
 
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
-		if (req->error == -EAGAIN)
+		if (req->result == -EAGAIN)
 			return -EAGAIN;
 
 		/* workqueue context doesn't hold uring_lock, grab it now */
@@ -1819,31 +1971,11 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	return 0;
 }
 
-static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
-			 struct io_submit_state *state)
+static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
+			struct sqe_submit *s)
 {
-	struct io_kiocb *req;
 	int ret;
 
-	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags & ~(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN)))
-		return -EINVAL;
-
-	req = io_get_req(ctx, state);
-	if (unlikely(!req))
-		return -EAGAIN;
-
-	ret = io_req_set_file(ctx, s, state, req);
-	if (unlikely(ret))
-		goto out;
-
-	ret = io_req_defer(ctx, req, s->sqe);
-	if (ret) {
-		if (ret == -EIOCBQUEUED)
-			ret = 0;
-		return ret;
-	}
-
 	ret = __io_submit_sqe(ctx, req, s, true);
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		struct io_uring_sqe *sqe_copy;
@@ -1866,24 +1998,93 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
 			/*
 			 * Queued up for async execution, worker will release
-			 * submit reference when the iocb is actually
-			 * submitted.
+			 * submit reference when the iocb is actually submitted.
 			 */
 			return 0;
 		}
 	}
 
-out:
 	/* drop submission reference */
 	io_put_req(req);
 
 	/* and drop final reference, if we failed */
-	if (ret)
+	if (ret) {
+		io_cqring_add_event(ctx, req->user_data, ret);
+		if (req->flags & REQ_F_LINK)
+			req->flags |= REQ_F_FAIL_LINK;
 		io_put_req(req);
+	}
 
 	return ret;
 }
 
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
+
+static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
+			  struct io_submit_state *state, struct io_kiocb **link)
+{
+	struct io_uring_sqe *sqe_copy;
+	struct io_kiocb *req;
+	int ret;
+
+	/* enforce forwards compatibility on users */
+	if (unlikely(s->sqe->flags & ~SQE_VALID_FLAGS)) {
+		ret = -EINVAL;
+		goto err;
+	}
+
+	req = io_get_req(ctx, state);
+	if (unlikely(!req)) {
+		ret = -EAGAIN;
+		goto err;
+	}
+
+	ret = io_req_set_file(ctx, s, state, req);
+	if (unlikely(ret)) {
+err_req:
+		io_free_req(req);
+err:
+		io_cqring_add_event(ctx, s->sqe->user_data, ret);
+		return;
+	}
+
+	ret = io_req_defer(ctx, req, s->sqe);
+	if (ret) {
+		if (ret != -EIOCBQUEUED)
+			goto err_req;
+		return;
+	}
+
+	/*
+	 * If we already have a head request, queue this one for async
+	 * submittal once the head completes. If we don't have a head but
+	 * IOSQE_IO_LINK is set in the sqe, start a new head. This one will be
+	 * submitted sync once the chain is complete. If none of those
+	 * conditions are true (normal request), then just queue it.
+	 */
+	if (*link) {
+		struct io_kiocb *prev = *link;
+
+		sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);
+		if (!sqe_copy) {
+			ret = -EAGAIN;
+			goto err_req;
+		}
+
+		s->sqe = sqe_copy;
+		memcpy(&req->submit, s, sizeof(*s));
+		list_add_tail(&req->list, &prev->link_list);
+	} else if (s->sqe->flags & IOSQE_IO_LINK) {
+		req->flags |= REQ_F_LINK;
+
+		memcpy(&req->submit, s, sizeof(*s));
+		INIT_LIST_HEAD(&req->link_list);
+		*link = req;
+	} else {
+		io_queue_sqe(ctx, req, s);
+	}
+}
+
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
@@ -1966,7 +2167,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 			  unsigned int nr, bool has_user, bool mm_fault)
 {
 	struct io_submit_state state, *statep = NULL;
-	int ret, i, submitted = 0;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
+	int i, submitted = 0;
 
 	if (nr > IO_PLUG_THRESHOLD) {
 		io_submit_state_start(&state, ctx, nr);
@@ -1974,22 +2177,30 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes,
 	}
 
 	for (i = 0; i < nr; i++) {
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0;
+
 		if (unlikely(mm_fault)) {
-			ret = -EFAULT;
+			io_cqring_add_event(ctx, sqes[i].sqe->user_data,
+						-EFAULT);
 		} else {
 			sqes[i].has_user = has_user;
 			sqes[i].needs_lock = true;
 			sqes[i].needs_fixed_file = true;
-			ret = io_submit_sqe(ctx, &sqes[i], statep);
-		}
-		if (!ret) {
+			io_submit_sqe(ctx, &sqes[i], statep, &link);
 			submitted++;
-			continue;
 		}
-
-		io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret);
 	}
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
+
 	if (statep)
 		io_submit_state_end(&state);
@@ -2006,6 +2217,8 @@ static int io_sq_thread(void *data)
 	unsigned inflight;
 	unsigned long timeout;
 
+	complete(&ctx->sqo_thread_started);
+
 	old_fs = get_fs();
 	set_fs(USER_DS);
@@ -2130,6 +2343,8 @@ static int io_sq_thread(void *data)
 static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 {
 	struct io_submit_state state, *statep = NULL;
+	struct io_kiocb *link = NULL;
+	bool prev_was_link = false;
 	int i, submit = 0;
 
 	if (to_submit > IO_PLUG_THRESHOLD) {
@@ -2139,22 +2354,30 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit)
 	for (i = 0; i < to_submit; i++) {
 		struct sqe_submit s;
-		int ret;
 
 		if (!io_get_sqring(ctx, &s))
 			break;
 
+		/*
+		 * If previous wasn't linked and we have a linked command,
+		 * that's the end of the chain. Submit the previous link.
+		 */
+		if (!prev_was_link && link) {
+			io_queue_sqe(ctx, link, &link->submit);
+			link = NULL;
+		}
+		prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0;
+
 		s.has_user = true;
 		s.needs_lock = false;
 		s.needs_fixed_file = false;
 		submit++;
-
-		ret = io_submit_sqe(ctx, &s, statep);
-		if (ret)
-			io_cqring_add_event(ctx, s.sqe->user_data, ret);
+		io_submit_sqe(ctx, &s, statep, &link);
 	}
+
 	io_commit_sqring(ctx);
 
+	if (link)
+		io_queue_sqe(ctx, link, &link->submit);
+
 	if (statep)
 		io_submit_state_end(statep);
@@ -2240,6 +2463,7 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 static void io_sq_thread_stop(struct io_ring_ctx *ctx)
 {
 	if (ctx->sqo_thread) {
+		wait_for_completion(&ctx->sqo_thread_started);
 		/*
 		 * The park is a bit of a work-around, without it we get
 		 * warning spews on shutdown with SQPOLL set and affinity

--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1356,7 +1356,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1367,7 +1367,7 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
 
 	error = import_iovec(type, uiov, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}
@@ -1382,7 +1382,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	long error;
+	ssize_t error;
 	struct fd f;
 	int type;
 
@@ -1393,7 +1393,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io
 
 	error = compat_import_iovec(type, iov32, nr_segs,
 			     ARRAY_SIZE(iovstack), &iov, &iter);
-	if (!error) {
+	if (error >= 0) {
 		error = do_vmsplice(f.file, &iter, flags);
 		kfree(iov);
 	}

--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -12,6 +12,7 @@
 
 struct pid;
 struct cred;
+struct socket;
 
 #define __sockaddr_check_size(size)	\
 	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))
@@ -374,6 +375,12 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
+extern long __sys_sendmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
+extern long __sys_recvmsg_sock(struct socket *sock,
+			       struct user_msghdr __user *msg,
+			       unsigned int flags);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,

--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -267,13 +267,13 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct
 size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
 		struct iov_iter *i);
 
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 
 #ifdef CONFIG_COMPAT
 struct compat_iovec;
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
+ssize_t compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i);
 #endif

--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -27,6 +27,7 @@ struct io_uring_sqe {
 		__u32		fsync_flags;
 		__u16		poll_events;
 		__u32		sync_range_flags;
+		__u32		msg_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -40,6 +41,7 @@ struct io_uring_sqe {
  */
#define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
#define IOSQE_IO_DRAIN		(1U << 1)	/* issue after inflight IO */
+#define IOSQE_IO_LINK		(1U << 2)	/* links next sqe */
 
 /*
  * io_uring_setup() flags
@@ -57,6 +59,8 @@ struct io_uring_sqe {
#define IORING_OP_POLL_ADD	6
#define IORING_OP_POLL_REMOVE	7
#define IORING_OP_SYNC_FILE_RANGE	8
+#define IORING_OP_SENDMSG	9
+#define IORING_OP_RECVMSG	10
 
 /*
  * sqe->fsync_flags

--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1634,9 +1634,9 @@ EXPORT_SYMBOL(dup_iter);
  * on-stack array was used or not (and regardless of whether this function
  * returns an error or not).
  *
- * Return: 0 on success or negative error code on error.
+ * Return: Negative error code on error, bytes imported on success
  */
-int import_iovec(int type, const struct iovec __user * uvector,
+ssize_t import_iovec(int type, const struct iovec __user * uvector,
 		 unsigned nr_segs, unsigned fast_segs,
 		 struct iovec **iov, struct iov_iter *i)
 {
@@ -1652,16 +1652,17 @@ int import_iovec(int type, const struct iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 EXPORT_SYMBOL(import_iovec);
 
 #ifdef CONFIG_COMPAT
 #include <linux/compat.h>
 
-int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
-		unsigned nr_segs, unsigned fast_segs,
-		struct iovec **iov, struct iov_iter *i)
+ssize_t compat_import_iovec(int type,
+		const struct compat_iovec __user * uvector,
+		unsigned nr_segs, unsigned fast_segs,
+		struct iovec **iov, struct iov_iter *i)
 {
 	ssize_t n;
 	struct iovec *p;
@@ -1675,7 +1676,7 @@ int compat_import_iovec(int type, const struct compat_iovec __user * uvector,
 	}
 	iov_iter_init(i, type, p, nr_segs, n);
 	*iov = p == *iov ? NULL : p;
-	return 0;
+	return n;
 }
 #endif

--- a/net/compat.c
+++ b/net/compat.c
@@ -80,9 +80,10 @@ int get_compat_msghdr(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return compat_import_iovec(save_addr ? READ : WRITE,
+	err = compat_import_iovec(save_addr ? READ : WRITE,
 				   compat_ptr(msg.msg_iov), msg.msg_iovlen,
 				   UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 /* Bleech... */

--- a/net/socket.c
+++ b/net/socket.c
@@ -2222,9 +2222,10 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
 
 	kmsg->msg_iocb = NULL;
 
-	return import_iovec(save_addr ? READ : WRITE,
+	err = import_iovec(save_addr ? READ : WRITE,
 			    msg.msg_iov, msg.msg_iovlen,
 			    UIO_FASTIOV, iov, &kmsg->msg_iter);
+	return err < 0 ? err : 0;
 }
 
 static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
@@ -2326,6 +2327,13 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 /*
  *	BSD sendmsg interface
  */
+long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
+}
 
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
@@ -2500,6 +2508,14 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
  *	BSD recvmsg interface
  */
 
+long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *msg,
+			unsigned int flags)
+{
+	struct msghdr msg_sys;
+
+	return ___sys_recvmsg(sock, msg, &msg_sys, flags, 0);
+}
+
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
 		   bool forbid_cmsg_compat)
 {