Commit 07d399cb authored by Josef Bacik's avatar Josef Bacik Committed by David Sterba

btrfs: take the dio extent lock during O_DIRECT operations

Currently we hold the extent lock for the entire duration of a read.
This isn't really necessary in the buffered case, we're protected by the
page lock, however it's necessary for O_DIRECT.

For O_DIRECT reads, if we only locked the extent for the part where we
get the extent, we could potentially race with an O_DIRECT write in the
same region.  This isn't really a problem, unless the read is delayed so
much that the write does the COW, unpins the old extent, and some other
application re-allocates the extent before the read is actually able to
be submitted.  At that point at best we'd have a checksum mismatch, but
at worst we could read data that doesn't belong to us.

To address this potential race we need to make sure we don't have
overlapping, concurrent direct io reads and writes.

To accomplish this use the new EXTENT_DIO_LOCKED bit in the direct IO
case in the same spot as the current extent lock.  The writes will take
this while they're creating the ordered extent, which is also used to
make sure concurrent buffered reads or concurrent direct reads are not
allowed to occur, and drop it after the ordered extent is taken.  For
reads it will act as the current read behavior for the EXTENT_LOCKED
bit, we set it when we're starting the read, we clear it in the end_io
to allow other direct writes to continue.

This still has the drawback of disallowing concurrent overlapping direct
reads from occurring, but that exists with the current extent locking.
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 7e2a5950
...@@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, ...@@ -40,11 +40,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
int ret = 0; int ret = 0;
/* Direct lock must be taken before the extent lock. */
if (nowait) {
if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state))
return -EAGAIN;
} else {
lock_dio_extent(io_tree, lockstart, lockend, cached_state);
}
while (1) { while (1) {
if (nowait) { if (nowait) {
if (!try_lock_extent(io_tree, lockstart, lockend, if (!try_lock_extent(io_tree, lockstart, lockend,
cached_state)) cached_state)) {
return -EAGAIN; ret = -EAGAIN;
break;
}
} else { } else {
lock_extent(io_tree, lockstart, lockend, cached_state); lock_extent(io_tree, lockstart, lockend, cached_state);
} }
...@@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, ...@@ -120,6 +130,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
cond_resched(); cond_resched();
} }
if (ret)
unlock_dio_extent(io_tree, lockstart, lockend, cached_state);
return ret; return ret;
} }
...@@ -546,7 +558,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, ...@@ -546,7 +558,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
} }
if (unlock_extents) if (unlock_extents)
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
EXTENT_LOCKED | EXTENT_DIO_LOCKED,
&cached_state); &cached_state);
else else
free_extent_state(cached_state); free_extent_state(cached_state);
...@@ -572,8 +585,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, ...@@ -572,8 +585,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
return 0; return 0;
unlock_err: unlock_err:
unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, /*
&cached_state); * Don't use EXTENT_LOCK_BITS here in case we extend it later and forget
* to update this, be explicit that we expect EXTENT_LOCKED and
* EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing.
*/
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state);
err: err:
if (dio_data->data_space_reserved) { if (dio_data->data_space_reserved) {
btrfs_free_reserved_data_space(BTRFS_I(inode), btrfs_free_reserved_data_space(BTRFS_I(inode),
...@@ -596,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ...@@ -596,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (!write && (iomap->type == IOMAP_HOLE)) { if (!write && (iomap->type == IOMAP_HOLE)) {
/* If reading from a hole, unlock and return */ /* If reading from a hole, unlock and return */
unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, clear_extent_bit(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
NULL); EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
return 0; return 0;
} }
...@@ -608,8 +626,9 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ...@@ -608,8 +626,9 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
btrfs_finish_ordered_extent(dio_data->ordered, NULL, btrfs_finish_ordered_extent(dio_data->ordered, NULL,
pos, length, false); pos, length, false);
else else
unlock_extent(&BTRFS_I(inode)->io_tree, pos, clear_extent_bit(&BTRFS_I(inode)->io_tree, pos,
pos + length - 1, NULL); pos + length - 1,
EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
ret = -ENOTBLK; ret = -ENOTBLK;
} }
if (write) { if (write) {
...@@ -641,8 +660,9 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) ...@@ -641,8 +660,9 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio)
dip->file_offset, dip->bytes, dip->file_offset, dip->bytes,
!bio->bi_status); !bio->bi_status);
} else { } else {
unlock_extent(&inode->io_tree, dip->file_offset, clear_extent_bit(&inode->io_tree, dip->file_offset,
dip->file_offset + dip->bytes - 1, NULL); dip->file_offset + dip->bytes - 1,
EXTENT_LOCKED | EXTENT_DIO_LOCKED, NULL);
} }
bbio->bio.bi_private = bbio->private; bbio->bio.bi_private = bbio->private;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment