Commit 1bdd3dbf authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block

Pull io_uring fixes and improvements from Jens Axboe:
 "The first five in this series are heavily inspired by the work Al did
  on the aio side to fix the races there.

  The last two re-introduce a feature that was in io_uring before it got
  merged, but which I pulled since we didn't have a good way to have
  BVEC iters that already have a stable reference. These aren't
  necessarily related to block, it's just how io_uring pins fixed
  buffers"

* tag 'io_uring-20190323' of git://git.kernel.dk/linux-block:
  block: add BIO_NO_PAGE_REF flag
  iov_iter: add ITER_BVEC_FLAG_NO_REF flag
  io_uring: mark me as the maintainer
  io_uring: retry bulk slab allocs as single allocs
  io_uring: fix poll races
  io_uring: fix fget/fput handling
  io_uring: add prepped flag
  io_uring: make io_read/write return an integer
  io_uring: use regular request ref counts
parents 2335cbe6 399254aa
...@@ -8096,6 +8096,16 @@ F: include/linux/iommu.h ...@@ -8096,6 +8096,16 @@ F: include/linux/iommu.h
F: include/linux/of_iommu.h F: include/linux/of_iommu.h
F: include/linux/iova.h F: include/linux/iova.h
IO_URING
M: Jens Axboe <axboe@kernel.dk>
L: linux-block@vger.kernel.org
L: linux-fsdevel@vger.kernel.org
T: git git://git.kernel.dk/linux-block
T: git git://git.kernel.dk/liburing
S: Maintained
F: fs/io_uring.c
F: include/uapi/linux/io_uring.h
IP MASQUERADING IP MASQUERADING
M: Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar> M: Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar>
S: Maintained S: Maintained
......
...@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) ...@@ -849,20 +849,14 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
size = bio_add_page(bio, bv->bv_page, len, size = bio_add_page(bio, bv->bv_page, len,
bv->bv_offset + iter->iov_offset); bv->bv_offset + iter->iov_offset);
if (size == len) { if (size == len) {
if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
struct page *page; struct page *page;
int i; int i;
/*
* For the normal O_DIRECT case, we could skip grabbing this
* reference and then not have to put them again when IO
* completes. But this breaks some in-kernel users, like
* splicing to/from a loop device, where we release the pipe
* pages unconditionally. If we can fix that case, we can
* get rid of the get here and the need to call
* bio_release_pages() at IO completion time.
*/
mp_bvec_for_each_page(page, bv, i) mp_bvec_for_each_page(page, bv, i)
get_page(page); get_page(page);
}
iov_iter_advance(iter, size); iov_iter_advance(iter, size);
return 0; return 0;
} }
...@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ...@@ -925,10 +919,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* This takes either an iterator pointing to user memory, or one pointing to * This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and * kernel pages (BVEC iterator). If we're adding user pages, we pin them and
* map them into the kernel. On IO completion, the caller should put those * map them into the kernel. On IO completion, the caller should put those
* pages. For now, when adding kernel pages, we still grab a reference to the * pages. If we're adding kernel pages, and the caller told us it's safe to
* page. This isn't strictly needed for the common case, but some call paths * do so, we just have to add the pages to the bio directly. We don't grab an
* end up releasing pages from eg a pipe and we can't easily control these. * extra reference to those pages (the user should already have that), and we
* See comment in __bio_iov_bvec_add_pages(). * don't put the page on IO completion. The caller needs to check if the bio is
* flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
* released.
* *
* The function tries, but does not guarantee, to pin as many pages as * The function tries, but does not guarantee, to pin as many pages as
* fit into the bio, or are requested in *iter, whatever is smaller. If * fit into the bio, or are requested in *iter, whatever is smaller. If
...@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) ...@@ -940,6 +936,13 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
const bool is_bvec = iov_iter_is_bvec(iter); const bool is_bvec = iov_iter_is_bvec(iter);
unsigned short orig_vcnt = bio->bi_vcnt; unsigned short orig_vcnt = bio->bi_vcnt;
/*
* If this is a BVEC iter, then the pages are kernel pages. Don't
* release them on IO completion, if the caller asked us to.
*/
if (is_bvec && iov_iter_bvec_no_ref(iter))
bio_set_flag(bio, BIO_NO_PAGE_REF);
do { do {
int ret; int ret;
...@@ -1696,6 +1699,7 @@ static void bio_dirty_fn(struct work_struct *work) ...@@ -1696,6 +1699,7 @@ static void bio_dirty_fn(struct work_struct *work)
next = bio->bi_private; next = bio->bi_private;
bio_set_pages_dirty(bio); bio_set_pages_dirty(bio);
if (!bio_flagged(bio, BIO_NO_PAGE_REF))
bio_release_pages(bio); bio_release_pages(bio);
bio_put(bio); bio_put(bio);
} }
...@@ -1713,6 +1717,7 @@ void bio_check_pages_dirty(struct bio *bio) ...@@ -1713,6 +1717,7 @@ void bio_check_pages_dirty(struct bio *bio)
goto defer; goto defer;
} }
if (!bio_flagged(bio, BIO_NO_PAGE_REF))
bio_release_pages(bio); bio_release_pages(bio);
bio_put(bio); bio_put(bio);
return; return;
......
...@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio) ...@@ -336,12 +336,14 @@ static void blkdev_bio_end_io(struct bio *bio)
if (should_dirty) { if (should_dirty) {
bio_check_pages_dirty(bio); bio_check_pages_dirty(bio);
} else { } else {
if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
struct bvec_iter_all iter_all;
struct bio_vec *bvec; struct bio_vec *bvec;
int i; int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i, iter_all) bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page); put_page(bvec->bv_page);
}
bio_put(bio); bio_put(bio);
} }
} }
......
This diff is collapsed.
...@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio) ...@@ -1589,12 +1589,14 @@ static void iomap_dio_bio_end_io(struct bio *bio)
if (should_dirty) { if (should_dirty) {
bio_check_pages_dirty(bio); bio_check_pages_dirty(bio);
} else { } else {
if (!bio_flagged(bio, BIO_NO_PAGE_REF)) {
struct bvec_iter_all iter_all;
struct bio_vec *bvec; struct bio_vec *bvec;
int i; int i;
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, i, iter_all) bio_for_each_segment_all(bvec, bio, i, iter_all)
put_page(bvec->bv_page); put_page(bvec->bv_page);
}
bio_put(bio); bio_put(bio);
} }
} }
......
...@@ -215,6 +215,7 @@ struct bio { ...@@ -215,6 +215,7 @@ struct bio {
/* /*
* bio flags * bio flags
*/ */
#define BIO_NO_PAGE_REF 0 /* don't put/release bvec pages */
#define BIO_SEG_VALID 1 /* bi_phys_segments valid */ #define BIO_SEG_VALID 1 /* bi_phys_segments valid */
#define BIO_CLONED 2 /* doesn't own data */ #define BIO_CLONED 2 /* doesn't own data */
#define BIO_BOUNCED 3 /* bio is a bounce bio */ #define BIO_BOUNCED 3 /* bio is a bounce bio */
......
...@@ -23,14 +23,23 @@ struct kvec { ...@@ -23,14 +23,23 @@ struct kvec {
}; };
enum iter_type { enum iter_type {
ITER_IOVEC = 0, /* set if ITER_BVEC doesn't hold a bv_page ref */
ITER_KVEC = 2, ITER_BVEC_FLAG_NO_REF = 2,
ITER_BVEC = 4,
ITER_PIPE = 8, /* iter types */
ITER_DISCARD = 16, ITER_IOVEC = 4,
ITER_KVEC = 8,
ITER_BVEC = 16,
ITER_PIPE = 32,
ITER_DISCARD = 64,
}; };
struct iov_iter { struct iov_iter {
/*
* Bit 0 is the read/write bit, set if we're writing.
* Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and
* the caller isn't expecting to drop a page reference when done.
*/
unsigned int type; unsigned int type;
size_t iov_offset; size_t iov_offset;
size_t count; size_t count;
...@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i) ...@@ -84,6 +93,11 @@ static inline unsigned char iov_iter_rw(const struct iov_iter *i)
return i->type & (READ | WRITE); return i->type & (READ | WRITE);
} }
static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i)
{
return (i->type & ITER_BVEC_FLAG_NO_REF) != 0;
}
/* /*
* Total number of bytes covered by an iovec. * Total number of bytes covered by an iovec.
* *
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment