Commit 91dd8c11, authored by Lukas Czerner, committed by Theodore Ts'o

ext4: prevent race while walking extent tree for fiemap

Currently ext4_ext_walk_space() only takes i_data_sem for read while
searching for the extent at a given block with ext4_ext_find_extent().
It then drops the lock, so the extent tree can be changed at will.
However, later on we search for the 'next' extent, and by then the
extent tree might already have changed, so that information might not
be accurate.

In fact we can hit BUG_ON(end <= start) if an extent gets inserted into
the tree after the one we found and before the block we were searching
for. This has been reproduced by running xfstests 225 in a loop on the
s390x architecture; theoretically we could hit it on any other
architecture as well, though probably not as often.

Moreover, an extent currently in delayed allocation might be allocated
after we search the extent tree and before we search the extent status
tree for delayed buffers, resulting in those delayed buffers being
completely missed, even though they are completely written and allocated.

We fix all those problems in several steps:

 1. remove unnecessary callback indirection
 2. rename functions
        ext4_ext_walk_space -> ext4_fill_fiemap_extents
        ext4_ext_fiemap_cb -> ext4_find_delayed_extent
 3. move fiemap_fill_next_extent() into ext4_fill_fiemap_extents()
 4. hold the i_data_sem for:
        ext4_ext_find_extent()
        ext4_ext_next_allocated_block()
        ext4_find_delayed_extent()
 5. call fiemap_fill_next_extent after releasing the i_data_sem
 6. move path reinitialization into the critical section.
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
parent f3b59291
...@@ -143,20 +143,6 @@ struct ext4_ext_path { ...@@ -143,20 +143,6 @@ struct ext4_ext_path {
* structure for external API * structure for external API
*/ */
/*
* to be called by ext4_ext_walk_space()
* negative retcode - error
* positive retcode - signal for ext4_ext_walk_space(), see below
* callback must return valid extent (passed or newly created)
*/
typedef int (*ext_prepare_callback)(struct inode *, ext4_lblk_t,
struct ext4_ext_cache *,
struct ext4_extent *, void *);
#define EXT_CONTINUE 0
#define EXT_BREAK 1
#define EXT_REPEAT 2
/* /*
* Maximum number of logical blocks in a file; ext4_extent's ee_block is * Maximum number of logical blocks in a file; ext4_extent's ee_block is
* __le32. * __le32.
......
...@@ -109,6 +109,9 @@ static int ext4_split_extent_at(handle_t *handle, ...@@ -109,6 +109,9 @@ static int ext4_split_extent_at(handle_t *handle,
int split_flag, int split_flag,
int flags); int flags);
static int ext4_find_delayed_extent(struct inode *inode,
struct ext4_ext_cache *newex);
static int ext4_ext_truncate_extend_restart(handle_t *handle, static int ext4_ext_truncate_extend_restart(handle_t *handle,
struct inode *inode, struct inode *inode,
int needed) int needed)
...@@ -1959,27 +1962,33 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ...@@ -1959,27 +1962,33 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
return err; return err;
} }
static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, static int ext4_fill_fiemap_extents(struct inode *inode,
ext4_lblk_t num, ext_prepare_callback func, ext4_lblk_t block, ext4_lblk_t num,
void *cbdata) struct fiemap_extent_info *fieinfo)
{ {
struct ext4_ext_path *path = NULL; struct ext4_ext_path *path = NULL;
struct ext4_ext_cache cbex; struct ext4_ext_cache cbex;
struct ext4_extent *ex; struct ext4_extent *ex;
ext4_lblk_t next, start = 0, end = 0; ext4_lblk_t next, next_del, start = 0, end = 0;
ext4_lblk_t last = block + num; ext4_lblk_t last = block + num;
int depth, exists, err = 0; int exists, depth = 0, err = 0;
unsigned int flags = 0;
BUG_ON(func == NULL); unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
BUG_ON(inode == NULL);
while (block < last && block != EXT_MAX_BLOCKS) { while (block < last && block != EXT_MAX_BLOCKS) {
num = last - block; num = last - block;
/* find extent for this block */ /* find extent for this block */
down_read(&EXT4_I(inode)->i_data_sem); down_read(&EXT4_I(inode)->i_data_sem);
if (path && ext_depth(inode) != depth) {
/* depth was changed. we have to realloc path */
kfree(path);
path = NULL;
}
path = ext4_ext_find_extent(inode, block, path); path = ext4_ext_find_extent(inode, block, path);
up_read(&EXT4_I(inode)->i_data_sem);
if (IS_ERR(path)) { if (IS_ERR(path)) {
up_read(&EXT4_I(inode)->i_data_sem);
err = PTR_ERR(path); err = PTR_ERR(path);
path = NULL; path = NULL;
break; break;
...@@ -1987,13 +1996,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ...@@ -1987,13 +1996,16 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
depth = ext_depth(inode); depth = ext_depth(inode);
if (unlikely(path[depth].p_hdr == NULL)) { if (unlikely(path[depth].p_hdr == NULL)) {
up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth); EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
err = -EIO; err = -EIO;
break; break;
} }
ex = path[depth].p_ext; ex = path[depth].p_ext;
next = ext4_ext_next_allocated_block(path); next = ext4_ext_next_allocated_block(path);
ext4_ext_drop_refs(path);
flags = 0;
exists = 0; exists = 0;
if (!ex) { if (!ex) {
/* there is no extent yet, so try to allocate /* there is no extent yet, so try to allocate
...@@ -2037,30 +2049,54 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, ...@@ -2037,30 +2049,54 @@ static int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
cbex.ec_block = le32_to_cpu(ex->ee_block); cbex.ec_block = le32_to_cpu(ex->ee_block);
cbex.ec_len = ext4_ext_get_actual_len(ex); cbex.ec_len = ext4_ext_get_actual_len(ex);
cbex.ec_start = ext4_ext_pblock(ex); cbex.ec_start = ext4_ext_pblock(ex);
if (ext4_ext_is_uninitialized(ex))
flags |= FIEMAP_EXTENT_UNWRITTEN;
}
/*
* Find delayed extent and update cbex accordingly. We call
* it even in !exists case to find out whether cbex is the
* last existing extent or not.
*/
next_del = ext4_find_delayed_extent(inode, &cbex);
if (!exists && next_del) {
exists = 1;
flags |= FIEMAP_EXTENT_DELALLOC;
} }
up_read(&EXT4_I(inode)->i_data_sem);
if (unlikely(cbex.ec_len == 0)) { if (unlikely(cbex.ec_len == 0)) {
EXT4_ERROR_INODE(inode, "cbex.ec_len == 0"); EXT4_ERROR_INODE(inode, "cbex.ec_len == 0");
err = -EIO; err = -EIO;
break; break;
} }
err = func(inode, next, &cbex, ex, cbdata);
ext4_ext_drop_refs(path);
if (err < 0) /* This is possible iff next == next_del == EXT_MAX_BLOCKS */
if (next == next_del) {
flags |= FIEMAP_EXTENT_LAST;
if (unlikely(next_del != EXT_MAX_BLOCKS ||
next != EXT_MAX_BLOCKS)) {
EXT4_ERROR_INODE(inode,
"next extent == %u, next "
"delalloc extent = %u",
next, next_del);
err = -EIO;
break; break;
}
}
if (err == EXT_REPEAT) if (exists) {
continue; err = fiemap_fill_next_extent(fieinfo,
else if (err == EXT_BREAK) { (__u64)cbex.ec_block << blksize_bits,
(__u64)cbex.ec_start << blksize_bits,
(__u64)cbex.ec_len << blksize_bits,
flags);
if (err < 0)
break;
if (err == 1) {
err = 0; err = 0;
break; break;
} }
if (ext_depth(inode) != depth) {
/* depth was changed. we have to realloc path */
kfree(path);
path = NULL;
} }
block = cbex.ec_block + cbex.ec_len; block = cbex.ec_block + cbex.ec_len;
...@@ -4493,26 +4529,23 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ...@@ -4493,26 +4529,23 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
} }
/* /*
* Callback function called for each extent to gather FIEMAP information. * If newex is not existing extent (newex->ec_start equals zero) find
* delayed extent at start of newex and update newex accordingly and
* return start of the next delayed extent.
*
* If newex is existing extent (newex->ec_start is not equal zero)
* return start of next delayed extent or EXT_MAX_BLOCKS if no delayed
* extent found. Leave newex unmodified.
*/ */
static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, static int ext4_find_delayed_extent(struct inode *inode,
struct ext4_ext_cache *newex, struct ext4_extent *ex, struct ext4_ext_cache *newex)
void *data)
{ {
struct extent_status es; struct extent_status es;
__u64 logical;
__u64 physical;
__u64 length;
__u32 flags = 0;
ext4_lblk_t next_del; ext4_lblk_t next_del;
int ret = 0;
struct fiemap_extent_info *fieinfo = data;
unsigned char blksize_bits;
es.start = newex->ec_block; es.start = newex->ec_block;
next_del = ext4_es_find_extent(inode, &es); next_del = ext4_es_find_extent(inode, &es);
next = min(next_del, next);
if (newex->ec_start == 0) { if (newex->ec_start == 0) {
/* /*
* No extent in extent-tree contains block @newex->ec_start, * No extent in extent-tree contains block @newex->ec_start,
...@@ -4520,37 +4553,19 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next, ...@@ -4520,37 +4553,19 @@ static int ext4_ext_fiemap_cb(struct inode *inode, ext4_lblk_t next,
*/ */
if (es.len == 0) if (es.len == 0)
/* A hole found. */ /* A hole found. */
return EXT_CONTINUE; return 0;
if (es.start > newex->ec_block) { if (es.start > newex->ec_block) {
/* A hole found. */ /* A hole found. */
newex->ec_len = min(es.start - newex->ec_block, newex->ec_len = min(es.start - newex->ec_block,
newex->ec_len); newex->ec_len);
return EXT_CONTINUE; return 0;
} }
flags |= FIEMAP_EXTENT_DELALLOC;
newex->ec_len = es.start + es.len - newex->ec_block; newex->ec_len = es.start + es.len - newex->ec_block;
} }
if (ex && ext4_ext_is_uninitialized(ex)) return next_del;
flags |= FIEMAP_EXTENT_UNWRITTEN;
if (next == EXT_MAX_BLOCKS)
flags |= FIEMAP_EXTENT_LAST;
blksize_bits = inode->i_sb->s_blocksize_bits;
logical = (__u64)newex->ec_block << blksize_bits;
physical = (__u64)newex->ec_start << blksize_bits;
length = (__u64)newex->ec_len << blksize_bits;
ret = fiemap_fill_next_extent(fieinfo, logical, physical,
length, flags);
if (ret < 0)
return ret;
if (ret == 1)
return EXT_BREAK;
return EXT_CONTINUE;
} }
/* fiemap flags we can handle specified here */ /* fiemap flags we can handle specified here */
#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
...@@ -4772,6 +4787,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) ...@@ -4772,6 +4787,7 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
return err; return err;
} }
int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len) __u64 start, __u64 len)
{ {
...@@ -4799,11 +4815,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ...@@ -4799,11 +4815,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1;
/* /*
* Walk the extent tree gathering extent information. * Walk the extent tree gathering extent information
* ext4_ext_fiemap_cb will push extents back to user. * and pushing extents back to the user.
*/ */
error = ext4_ext_walk_space(inode, start_blk, len_blks, error = ext4_fill_fiemap_extents(inode, start_blk,
ext4_ext_fiemap_cb, fieinfo); len_blks, fieinfo);
} }
return error; return error;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment