Commit e15c2ffa authored by Jens Axboe's avatar Jens Axboe

block: fix O_DIRECT error handling for bio fragments

0eb6ddfb tried to fix this up, but introduced a use-after-free
of dio. Additionally, we still had an issue with error handling,
as reported by Darrick:

"I noticed a regression in xfs/747 (an unreleased xfstest for the
xfs_scrub media scanning feature) on 5.3-rc3.  I'll condense that down
to a simpler reproducer:

error-test: 0 209 linear 8:48 0
error-test: 209 1 error
error-test: 210 6446894 linear 8:48 210

Basically we have a ~3G /dev/sdd and we set up device mapper to fail IO
for sector 209 and to pass the io to the scsi device everywhere else.

On 5.3-rc3, performing a directio pread of this range with a < 1M buffer
(in other words, a request for fewer than MAX_BIO_PAGES bytes) yields
EIO like you'd expect:

pread64(3, 0x7f880e1c7000, 1048576, 0)  = -1 EIO (Input/output error)
pread: Input/output error
+++ exited with 0 +++

But doing it with a larger buffer succeeds(!):

pread64(3, "XFSB\0\0\20\0\0\0\0\0\0\fL\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1146880, 0) = 1146880
read 1146880/1146880 bytes at offset 0
1 MiB, 1 ops; 0.0009 sec (1.124 GiB/sec and 1052.6316 ops/sec)
+++ exited with 0 +++

(Note that the part of the buffer corresponding to the dm-error area is
uninitialized)

On 5.3-rc2, both commands would fail with EIO like you'd expect.  The
only change between rc2 and rc3 is commit 0eb6ddfb ("block: Fix
__blkdev_direct_IO() for bio fragments").

AFAICT we end up in __blkdev_direct_IO with a 1120K buffer, which gets
split into two bios: one for the first BIO_MAX_PAGES worth of data (1MB)
and a second one for the 96k after that."

Fix this by noting that it's always safe to dereference dio if we get
BLK_QC_T_EAGAIN returned, as end_io hasn't been run for that case. So
we can safely increment the dio size before calling submit_bio(), and
then decrement it on failure (not that it really matters, as the bio
and dio are going away).

For error handling, return to the original method of just using 'ret'
for tracking the error, and the size tracking in dio->size.

Fixes: 0eb6ddfb ("block: Fix __blkdev_direct_IO() for bio fragments")
Fixes: 6a43074e ("block: properly handle IOCB_NOWAIT for async O_DIRECT IO")
Reported-by: default avatarDarrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent db341a04
...@@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -349,7 +349,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE; blk_qc_t qc = BLK_QC_T_NONE;
gfp_t gfp; gfp_t gfp;
ssize_t ret; int ret;
if ((pos | iov_iter_alignment(iter)) & if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1)) (bdev_logical_block_size(bdev) - 1))
...@@ -386,8 +386,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -386,8 +386,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
ret = 0; ret = 0;
for (;;) { for (;;) {
int err;
bio_set_dev(bio, bdev); bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9; bio->bi_iter.bi_sector = pos >> 9;
bio->bi_write_hint = iocb->ki_hint; bio->bi_write_hint = iocb->ki_hint;
...@@ -395,10 +393,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -395,10 +393,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
bio->bi_end_io = blkdev_bio_end_io; bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio; bio->bi_ioprio = iocb->ki_ioprio;
err = bio_iov_iter_get_pages(bio, iter); ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(err)) { if (unlikely(ret)) {
if (!ret)
ret = err;
bio->bi_status = BLK_STS_IOERR; bio->bi_status = BLK_STS_IOERR;
bio_endio(bio); bio_endio(bio);
break; break;
...@@ -421,7 +417,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -421,7 +417,6 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
if (nowait) if (nowait)
bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE); bio->bi_opf |= (REQ_NOWAIT | REQ_NOWAIT_INLINE);
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size;
nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES);
...@@ -433,13 +428,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -433,13 +428,13 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
polled = true; polled = true;
} }
dio->size += bio->bi_iter.bi_size;
qc = submit_bio(bio); qc = submit_bio(bio);
if (qc == BLK_QC_T_EAGAIN) { if (qc == BLK_QC_T_EAGAIN) {
if (!ret) dio->size -= bio->bi_iter.bi_size;
ret = -EAGAIN; ret = -EAGAIN;
goto error; goto error;
} }
ret = dio->size;
if (polled) if (polled)
WRITE_ONCE(iocb->ki_cookie, qc); WRITE_ONCE(iocb->ki_cookie, qc);
...@@ -460,17 +455,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -460,17 +455,16 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
atomic_inc(&dio->ref); atomic_inc(&dio->ref);
} }
dio->size += bio->bi_iter.bi_size;
qc = submit_bio(bio); qc = submit_bio(bio);
if (qc == BLK_QC_T_EAGAIN) { if (qc == BLK_QC_T_EAGAIN) {
if (!ret) dio->size -= bio->bi_iter.bi_size;
ret = -EAGAIN; ret = -EAGAIN;
goto error; goto error;
} }
ret = dio->size;
bio = bio_alloc(gfp, nr_pages); bio = bio_alloc(gfp, nr_pages);
if (!bio) { if (!bio) {
if (!ret)
ret = -EAGAIN; ret = -EAGAIN;
goto error; goto error;
} }
...@@ -496,6 +490,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) ...@@ -496,6 +490,8 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages)
out: out:
if (!ret) if (!ret)
ret = blk_status_to_errno(dio->bio.bi_status); ret = blk_status_to_errno(dio->bio.bi_status);
if (likely(!ret))
ret = dio->size;
bio_put(&dio->bio); bio_put(&dio->bio);
return ret; return ret;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment