Commit 7869a4a6 authored by Theodore Ts'o's avatar Theodore Ts'o

ext4: add support for extent pre-caching

Add a new fiemap flag which forces the all of the extents in an inode
to be cached in the extent_status tree.  This is critically important
when using AIO to a preallocated file, since if we need to read in
blocks from the extent tree, the io_submit(2) system call becomes
synchronous, and the AIO is no longer "A", which is bad.

In addition, for most files which have an external leaf tree block,
the cost of caching the information in the extent status tree will be
less than caching the entire 4k block in the buffer cache.  So it is
generally a win to keep the extent information cached.
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 107a7bd3
...@@ -561,15 +561,16 @@ enum { ...@@ -561,15 +561,16 @@ enum {
#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200
/* /*
* The bit position of this flag must not overlap with any of the * The bit position of these flags must not overlap with any of the
* EXT4_GET_BLOCKS_*. It is used by ext4_ext_find_extent(), * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(),
* read_extent_tree_block(), ext4_split_extent_at(), * read_extent_tree_block(), ext4_split_extent_at(),
* ext4_ext_insert_extent(), and ext4_ext_create_new_leaf() to * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
* indicate that the we shouldn't be caching the extents when reading * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
* from the extent tree while a truncate or punch hole operation * caching the extents when reading from the extent tree while a
* is in progress. * truncate or punch hole operation is in progress.
*/ */
#define EXT4_EX_NOCACHE 0x0400 #define EXT4_EX_NOCACHE 0x0400
#define EXT4_EX_FORCE_CACHE 0x0800
/* /*
* Flags used by ext4_free_blocks * Flags used by ext4_free_blocks
...@@ -601,6 +602,7 @@ enum { ...@@ -601,6 +602,7 @@ enum {
#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) #define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64)
#define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_SWAP_BOOT _IO('f', 17)
#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT) #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/* /*
...@@ -1386,6 +1388,7 @@ enum { ...@@ -1386,6 +1388,7 @@ enum {
nolocking */ nolocking */
EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */
EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ EXT4_STATE_ORDERED_MODE, /* data=ordered mode */
EXT4_STATE_EXT_PRECACHED, /* extents have been precached */
}; };
#define EXT4_INODE_BIT_FNS(name, field, offset) \ #define EXT4_INODE_BIT_FNS(name, field, offset) \
...@@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode, ...@@ -2705,7 +2708,7 @@ extern int ext4_find_delalloc_range(struct inode *inode,
extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len); __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
/* move_extent.c */ /* move_extent.c */
extern void ext4_double_down_write_data_sem(struct inode *first, extern void ext4_double_down_write_data_sem(struct inode *first,
......
...@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line, ...@@ -482,7 +482,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (err < 0) if (err < 0)
goto errout; goto errout;
} }
if (buffer_verified(bh)) if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE))
return bh; return bh;
err = __ext4_ext_check(function, line, inode, err = __ext4_ext_check(function, line, inode,
ext_block_hdr(bh), depth, pblk); ext_block_hdr(bh), depth, pblk);
...@@ -526,6 +526,71 @@ __read_extent_tree_block(const char *function, unsigned int line, ...@@ -526,6 +526,71 @@ __read_extent_tree_block(const char *function, unsigned int line,
__read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \ __read_extent_tree_block(__func__, __LINE__, (inode), (pblk), \
(depth), (flags)) (depth), (flags))
/*
* This function is called to cache a file's extent information in the
* extent status tree
*/
int ext4_ext_precache(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_ext_path *path = NULL;
struct buffer_head *bh;
int i = 0, depth, ret = 0;
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1),
GFP_NOFS);
if (path == NULL) {
up_read(&ei->i_data_sem);
return -ENOMEM;
}
/* Don't cache anything if there are no external extent blocks */
if (depth == 0)
goto out;
path[0].p_hdr = ext_inode_hdr(inode);
ret = ext4_ext_check(inode, path[0].p_hdr, depth, 0);
if (ret)
goto out;
path[0].p_idx = EXT_FIRST_INDEX(path[0].p_hdr);
while (i >= 0) {
/*
* If this is a leaf block or we've reached the end of
* the index block, go up
*/
if ((i == depth) ||
path[i].p_idx > EXT_LAST_INDEX(path[i].p_hdr)) {
brelse(path[i].p_bh);
path[i].p_bh = NULL;
i--;
continue;
}
bh = read_extent_tree_block(inode,
ext4_idx_pblock(path[i].p_idx++),
depth - i - 1,
EXT4_EX_FORCE_CACHE);
if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
break;
}
i++;
path[i].p_bh = bh;
path[i].p_hdr = ext_block_hdr(bh);
path[i].p_idx = EXT_FIRST_INDEX(path[i].p_hdr);
}
ext4_set_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
out:
up_read(&ei->i_data_sem);
ext4_ext_drop_refs(path);
kfree(path);
return ret;
}
#ifdef EXT_DEBUG #ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
{ {
...@@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ...@@ -4766,6 +4831,12 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return error; return error;
} }
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
return error;
}
/* fallback to generic here if not in extents fmt */ /* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return generic_block_fiemap(inode, fieinfo, start, len, return generic_block_fiemap(inode, fieinfo, start, len,
......
...@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ...@@ -710,11 +710,8 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
write_lock(&EXT4_I(inode)->i_es_lock); write_lock(&EXT4_I(inode)->i_es_lock);
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk); es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (es && ((es->es_lblk <= lblk) || (es->es_lblk <= end))) if (!es || es->es_lblk > end)
goto out;
__es_insert_extent(inode, &newes); __es_insert_extent(inode, &newes);
out:
write_unlock(&EXT4_I(inode)->i_es_lock); write_unlock(&EXT4_I(inode)->i_es_lock);
} }
...@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, ...@@ -930,6 +927,12 @@ static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
eia = list_entry(a, struct ext4_inode_info, i_es_lru); eia = list_entry(a, struct ext4_inode_info, i_es_lru);
eib = list_entry(b, struct ext4_inode_info, i_es_lru); eib = list_entry(b, struct ext4_inode_info, i_es_lru);
if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
!ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
return 1;
if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) &&
ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED))
return -1;
if (eia->i_touch_when == eib->i_touch_when) if (eia->i_touch_when == eib->i_touch_when)
return 0; return 0;
if (time_after(eia->i_touch_when, eib->i_touch_when)) if (time_after(eia->i_touch_when, eib->i_touch_when))
...@@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ...@@ -943,21 +946,13 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
{ {
struct ext4_inode_info *ei; struct ext4_inode_info *ei;
struct list_head *cur, *tmp; struct list_head *cur, *tmp;
LIST_HEAD(skiped); LIST_HEAD(skipped);
int ret, nr_shrunk = 0; int ret, nr_shrunk = 0;
int retried = 0, skip_precached = 1, nr_skipped = 0;
spin_lock(&sbi->s_es_lru_lock); spin_lock(&sbi->s_es_lru_lock);
/* retry:
* If the inode that is at the head of LRU list is newer than
* last_sorted time, that means that we need to sort this list.
*/
ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
if (sbi->s_es_last_sorted < ei->i_touch_when) {
list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
sbi->s_es_last_sorted = jiffies;
}
list_for_each_safe(cur, tmp, &sbi->s_es_lru) { list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
/* /*
* If we have already reclaimed all extents from extent * If we have already reclaimed all extents from extent
...@@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ...@@ -968,9 +963,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
ei = list_entry(cur, struct ext4_inode_info, i_es_lru); ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
/* Skip the inode that is newer than the last_sorted time */ /*
if (sbi->s_es_last_sorted < ei->i_touch_when) { * Skip the inode that is newer than the last_sorted
list_move_tail(cur, &skiped); * time. Normally we try hard to avoid shrinking
* precached inodes, but we will as a last resort.
*/
if ((sbi->s_es_last_sorted < ei->i_touch_when) ||
(skip_precached && ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED))) {
nr_skipped++;
list_move_tail(cur, &skipped);
continue; continue;
} }
...@@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, ...@@ -990,11 +992,33 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
} }
/* Move the newer inodes into the tail of the LRU list. */ /* Move the newer inodes into the tail of the LRU list. */
list_splice_tail(&skiped, &sbi->s_es_lru); list_splice_tail(&skipped, &sbi->s_es_lru);
INIT_LIST_HEAD(&skipped);
/*
* If we skipped any inodes, and we weren't able to make any
* forward progress, sort the list and try again.
*/
if ((nr_shrunk == 0) && nr_skipped && !retried) {
retried++;
list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
sbi->s_es_last_sorted = jiffies;
ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info,
i_es_lru);
/*
* If there are no non-precached inodes left on the
* list, start releasing precached extents.
*/
if (ext4_test_inode_state(&ei->vfs_inode,
EXT4_STATE_EXT_PRECACHED))
skip_precached = 0;
goto retry;
}
spin_unlock(&sbi->s_es_lru_lock); spin_unlock(&sbi->s_es_lru_lock);
if (locked_ei && nr_shrunk == 0) if (locked_ei && nr_shrunk == 0)
nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan);
return nr_shrunk; return nr_shrunk;
} }
...@@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, ...@@ -1069,10 +1093,16 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
struct rb_node *node; struct rb_node *node;
struct extent_status *es; struct extent_status *es;
int nr_shrunk = 0; int nr_shrunk = 0;
static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
if (ei->i_es_lru_nr == 0) if (ei->i_es_lru_nr == 0)
return 0; return 0;
if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
__ratelimit(&_rs))
ext4_warning(inode->i_sb, "forced shrink of precached extents");
node = rb_first(&tree->root); node = rb_first(&tree->root);
while (node != NULL) { while (node != NULL) {
es = rb_entry(node, struct extent_status, rb_node); es = rb_entry(node, struct extent_status, rb_node);
......
...@@ -624,6 +624,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ...@@ -624,6 +624,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return 0; return 0;
} }
case EXT4_IOC_PRECACHE_EXTENTS:
return ext4_ext_precache(inode);
default: default:
return -ENOTTY; return -ENOTTY;
...@@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ...@@ -688,6 +690,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case EXT4_IOC_MOVE_EXT: case EXT4_IOC_MOVE_EXT:
case FITRIM: case FITRIM:
case EXT4_IOC_RESIZE_FS: case EXT4_IOC_RESIZE_FS:
case EXT4_IOC_PRECACHE_EXTENTS:
break; break;
default: default:
return -ENOIOCTLCMD; return -ENOIOCTLCMD;
......
...@@ -40,6 +40,7 @@ struct fiemap { ...@@ -40,6 +40,7 @@ struct fiemap {
#define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ #define FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */
#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ #define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */
#define FIEMAP_FLAG_CACHE 0x00000004 /* request caching of the extents */
#define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR) #define FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment