Commit 727011e0 authored by Chris Mason

Btrfs: allow metadata blocks larger than the page size

A few years ago the btrfs code to support blocks larger than
the page size was disabled to fix a few corner cases in the
page cache handling.  This fixes the code to properly support
large metadata blocks again.

Since current kernels will crash early and often with larger
metadata blocks, this adds an incompat bit so that older kernels
can't mount it.

This also does away with different blocksizes for nodes and leaves.
You get a single block size for all tree blocks.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 81c9ad23
...@@ -137,6 +137,12 @@ struct btrfs_ordered_sum; ...@@ -137,6 +137,12 @@ struct btrfs_ordered_sum;
#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
/*
* the max metadata block size. This limit is somewhat artificial,
* but the memmove costs go through the roof for larger blocks.
*/
#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
/* /*
* we can actually store much bigger names, but lets not confuse the rest * we can actually store much bigger names, but lets not confuse the rest
* of linux * of linux
...@@ -461,6 +467,19 @@ struct btrfs_super_block { ...@@ -461,6 +467,19 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3) #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
/*
* some patches floated around with a second compression method
* lets save that incompat here for when they do get in
* Note we don't actually support it, we're just reserving the
* number
*/
#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
/*
* older kernels tried to do bigger metadata blocks, but the
* code was pretty buggy. Lets not let them try anymore.
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
...@@ -468,6 +487,7 @@ struct btrfs_super_block { ...@@ -468,6 +487,7 @@ struct btrfs_super_block {
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO) BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
/* /*
...@@ -1555,14 +1575,14 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); ...@@ -1555,14 +1575,14 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(struct extent_buffer *eb) \ static inline u##bits btrfs_##name(struct extent_buffer *eb) \
{ \ { \
type *p = page_address(eb->first_page); \ type *p = page_address(eb->pages[0]); \
u##bits res = le##bits##_to_cpu(p->member); \ u##bits res = le##bits##_to_cpu(p->member); \
return res; \ return res; \
} \ } \
static inline void btrfs_set_##name(struct extent_buffer *eb, \ static inline void btrfs_set_##name(struct extent_buffer *eb, \
u##bits val) \ u##bits val) \
{ \ { \
type *p = page_address(eb->first_page); \ type *p = page_address(eb->pages[0]); \
p->member = cpu_to_le##bits(val); \ p->member = cpu_to_le##bits(val); \
} }
......
...@@ -370,8 +370,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, ...@@ -370,8 +370,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
ret = read_extent_buffer_pages(io_tree, eb, start, ret = read_extent_buffer_pages(io_tree, eb, start,
WAIT_COMPLETE, WAIT_COMPLETE,
btree_get_extent, mirror_num); btree_get_extent, mirror_num);
if (!ret && if (!ret && !verify_parent_transid(io_tree, eb, parent_transid))
!verify_parent_transid(io_tree, eb, parent_transid))
return ret; return ret;
/* /*
...@@ -406,14 +405,11 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ...@@ -406,14 +405,11 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
u64 found_start; u64 found_start;
unsigned long len; unsigned long len;
struct extent_buffer *eb; struct extent_buffer *eb;
int ret;
tree = &BTRFS_I(page->mapping->host)->io_tree; tree = &BTRFS_I(page->mapping->host)->io_tree;
if (page->private == EXTENT_PAGE_PRIVATE) { if (page->private == EXTENT_PAGE_PRIVATE)
WARN_ON(1);
goto out; goto out;
}
if (!page->private) { if (!page->private) {
WARN_ON(1); WARN_ON(1);
goto out; goto out;
...@@ -421,22 +417,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) ...@@ -421,22 +417,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
len = page->private >> 2; len = page->private >> 2;
WARN_ON(len == 0); WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page); eb = find_extent_buffer(tree, start, len);
if (eb == NULL) {
WARN_ON(1);
goto out;
}
ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
btrfs_header_generation(eb));
BUG_ON(ret);
WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
found_start = btrfs_header_bytenr(eb); found_start = btrfs_header_bytenr(eb);
if (found_start != start) { if (found_start != start) {
WARN_ON(1); WARN_ON(1);
goto err; goto err;
} }
if (eb->first_page != page) { if (eb->pages[0] != page) {
WARN_ON(1); WARN_ON(1);
goto err; goto err;
} }
...@@ -537,6 +525,41 @@ static noinline int check_leaf(struct btrfs_root *root, ...@@ -537,6 +525,41 @@ static noinline int check_leaf(struct btrfs_root *root,
return 0; return 0;
} }
/*
 * Locate the extent buffer that covers @page.
 *
 * Starting at the page's own byte offset, probe the buffer tree one page
 * at a time going backwards, never looking further back than @max_walk
 * bytes (clamped at offset 0).  The first extent buffer found ends the
 * search:
 *
 *  - if its [start, start + len) range contains the page's offset it is
 *    returned, still holding the reference find_extent_buffer() took,
 *    so the caller must drop it with free_extent_buffer();
 *  - otherwise that reference is dropped here and NULL is returned.
 *
 * NULL is also returned when no extent buffer is found inside the window.
 */
struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
				       struct page *page, int max_walk)
{
	const u64 target = page_offset(page);
	/* lowest offset we are willing to probe, clamped so it cannot wrap */
	u64 lowest = (target < max_walk) ? 0 : target - max_walk;
	u64 cur;

	for (cur = target; cur >= lowest; cur -= PAGE_CACHE_SIZE) {
		struct extent_buffer *eb = find_extent_buffer(tree, cur, 0);

		if (!eb) {
			/* stop at offset 0 so the decrement cannot underflow */
			if (cur == 0)
				break;
			continue;
		}

		/* the buffer spans our page: hand it (and its ref) back */
		if (target >= eb->start && target < eb->start + eb->len)
			return eb;

		/* nearest buffer belongs to some other range; give up */
		free_extent_buffer(eb);
		return NULL;
	}

	return NULL;
}
static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_state *state) struct extent_state *state)
{ {
...@@ -547,24 +570,25 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ...@@ -547,24 +570,25 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
struct extent_buffer *eb; struct extent_buffer *eb;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
int ret = 0; int ret = 0;
int reads_done;
tree = &BTRFS_I(page->mapping->host)->io_tree;
if (page->private == EXTENT_PAGE_PRIVATE)
goto out;
if (!page->private) if (!page->private)
goto out; goto out;
tree = &BTRFS_I(page->mapping->host)->io_tree;
len = page->private >> 2; len = page->private >> 2;
WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page); eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
if (eb == NULL) { if (!eb) {
ret = -EIO; ret = -EIO;
goto out; goto out;
} }
reads_done = atomic_dec_and_test(&eb->pages_reading);
if (!reads_done)
goto err;
found_start = btrfs_header_bytenr(eb); found_start = btrfs_header_bytenr(eb);
if (found_start != start) { if (found_start != eb->start) {
printk_ratelimited(KERN_INFO "btrfs bad tree block start " printk_ratelimited(KERN_INFO "btrfs bad tree block start "
"%llu %llu\n", "%llu %llu\n",
(unsigned long long)found_start, (unsigned long long)found_start,
...@@ -572,13 +596,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ...@@ -572,13 +596,6 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO; ret = -EIO;
goto err; goto err;
} }
if (eb->first_page != page) {
printk(KERN_INFO "btrfs bad first page %lu %lu\n",
eb->first_page->index, page->index);
WARN_ON(1);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) { if (check_tree_block_fsid(root, eb)) {
printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n", printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
(unsigned long long)eb->start); (unsigned long long)eb->start);
...@@ -606,14 +623,14 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, ...@@ -606,14 +623,14 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
ret = -EIO; ret = -EIO;
} }
end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
end = eb->start + end - 1;
err: err:
if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) { if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags); clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
btree_readahead_hook(root, eb, eb->start, ret); btree_readahead_hook(root, eb, eb->start, ret);
} }
if (ret && eb)
clear_extent_buffer_uptodate(tree, eb, NULL);
free_extent_buffer(eb); free_extent_buffer(eb);
out: out:
return ret; return ret;
...@@ -637,7 +654,7 @@ static int btree_io_failed_hook(struct bio *failed_bio, ...@@ -637,7 +654,7 @@ static int btree_io_failed_hook(struct bio *failed_bio,
len = page->private >> 2; len = page->private >> 2;
WARN_ON(len == 0); WARN_ON(len == 0);
eb = alloc_extent_buffer(tree, start, len, page); eb = alloc_extent_buffer(tree, start, len);
if (eb == NULL) if (eb == NULL)
goto out; goto out;
...@@ -896,28 +913,14 @@ static int btree_migratepage(struct address_space *mapping, ...@@ -896,28 +913,14 @@ static int btree_migratepage(struct address_space *mapping,
static int btree_writepage(struct page *page, struct writeback_control *wbc) static int btree_writepage(struct page *page, struct writeback_control *wbc)
{ {
struct extent_io_tree *tree; struct extent_io_tree *tree;
struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
struct extent_buffer *eb;
int was_dirty;
tree = &BTRFS_I(page->mapping->host)->io_tree; tree = &BTRFS_I(page->mapping->host)->io_tree;
if (!(current->flags & PF_MEMALLOC)) { if (!(current->flags & PF_MEMALLOC)) {
return extent_write_full_page(tree, page, return extent_write_full_page(tree, page,
btree_get_extent, wbc); btree_get_extent, wbc);
} }
redirty_page_for_writepage(wbc, page); redirty_page_for_writepage(wbc, page);
eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
WARN_ON(!eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
if (!was_dirty) {
spin_lock(&root->fs_info->delalloc_lock);
root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
spin_unlock(&root->fs_info->delalloc_lock);
}
free_extent_buffer(eb);
unlock_page(page); unlock_page(page);
return 0; return 0;
} }
...@@ -954,6 +957,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) ...@@ -954,6 +957,8 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{ {
struct extent_io_tree *tree; struct extent_io_tree *tree;
struct extent_map_tree *map; struct extent_map_tree *map;
struct extent_buffer *eb;
struct btrfs_root *root;
int ret; int ret;
if (PageWriteback(page) || PageDirty(page)) if (PageWriteback(page) || PageDirty(page))
...@@ -962,6 +967,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags) ...@@ -962,6 +967,13 @@ static int btree_releasepage(struct page *page, gfp_t gfp_flags)
tree = &BTRFS_I(page->mapping->host)->io_tree; tree = &BTRFS_I(page->mapping->host)->io_tree;
map = &BTRFS_I(page->mapping->host)->extent_tree; map = &BTRFS_I(page->mapping->host)->extent_tree;
root = BTRFS_I(page->mapping->host)->root;
if (page->private == EXTENT_PAGE_PRIVATE) {
eb = find_eb_for_page(tree, page, max(root->leafsize, root->nodesize));
free_extent_buffer(eb);
if (eb)
return 0;
}
/* /*
* We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing * We need to mask out eg. __GFP_HIGHMEM and __GFP_DMA32 as we're doing
* slab allocation from alloc_extent_state down the callchain where * slab allocation from alloc_extent_state down the callchain where
...@@ -1074,20 +1086,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, ...@@ -1074,20 +1086,20 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
struct extent_buffer *eb; struct extent_buffer *eb;
eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
bytenr, blocksize, NULL); bytenr, blocksize);
return eb; return eb;
} }
int btrfs_write_tree_block(struct extent_buffer *buf) int btrfs_write_tree_block(struct extent_buffer *buf)
{ {
return filemap_fdatawrite_range(buf->first_page->mapping, buf->start, return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
buf->start + buf->len - 1); buf->start + buf->len - 1);
} }
int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
{ {
return filemap_fdatawait_range(buf->first_page->mapping, return filemap_fdatawait_range(buf->pages[0]->mapping,
buf->start, buf->start + buf->len - 1); buf->start, buf->start + buf->len - 1);
} }
...@@ -1513,41 +1525,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) ...@@ -1513,41 +1525,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
return 0; return 0;
} }
static int bio_ready_for_csum(struct bio *bio)
{
u64 length = 0;
u64 buf_len = 0;
u64 start = 0;
struct page *page;
struct extent_io_tree *io_tree = NULL;
struct bio_vec *bvec;
int i;
int ret;
bio_for_each_segment(bvec, bio, i) {
page = bvec->bv_page;
if (page->private == EXTENT_PAGE_PRIVATE) {
length += bvec->bv_len;
continue;
}
if (!page->private) {
length += bvec->bv_len;
continue;
}
length = bvec->bv_len;
buf_len = page->private >> 2;
start = page_offset(page) + bvec->bv_offset;
io_tree = &BTRFS_I(page->mapping->host)->io_tree;
}
/* are we fully contained in this bio? */
if (buf_len <= length)
return 1;
ret = extent_range_uptodate(io_tree, start + length,
start + buf_len - 1);
return ret;
}
/* /*
* called by the kthread helper functions to finally call the bio end_io * called by the kthread helper functions to finally call the bio end_io
* functions. This is where read checksum verification actually happens * functions. This is where read checksum verification actually happens
...@@ -1563,17 +1540,6 @@ static void end_workqueue_fn(struct btrfs_work *work) ...@@ -1563,17 +1540,6 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio = end_io_wq->bio; bio = end_io_wq->bio;
fs_info = end_io_wq->info; fs_info = end_io_wq->info;
/* metadata bio reads are special because the whole tree block must
* be checksummed at once. This makes sure the entire block is in
* ram and up to date before trying to verify things. For
* blocksize <= pagesize, it is basically a noop
*/
if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
!bio_ready_for_csum(bio)) {
btrfs_queue_worker(&fs_info->endio_meta_workers,
&end_io_wq->work);
return;
}
error = end_io_wq->error; error = end_io_wq->error;
bio->bi_private = end_io_wq->private; bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io; bio->bi_end_io = end_io_wq->end_io;
...@@ -2135,10 +2101,38 @@ int open_ctree(struct super_block *sb, ...@@ -2135,10 +2101,38 @@ int open_ctree(struct super_block *sb,
goto fail_alloc; goto fail_alloc;
} }
if (btrfs_super_leafsize(disk_super) !=
btrfs_super_nodesize(disk_super)) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksizes don't match. node %d leaf %d\n",
btrfs_super_nodesize(disk_super),
btrfs_super_leafsize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
printk(KERN_ERR "BTRFS: couldn't mount because metadata "
"blocksize (%d) was too large\n",
btrfs_super_leafsize(disk_super));
err = -EINVAL;
goto fail_alloc;
}
features = btrfs_super_incompat_flags(disk_super); features = btrfs_super_incompat_flags(disk_super);
features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO) if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
/*
* flag our filesystem as having big metadata blocks if
* they are bigger than the page size
*/
if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
}
btrfs_set_super_incompat_flags(disk_super, features); btrfs_set_super_incompat_flags(disk_super, features);
features = btrfs_super_compat_ro_flags(disk_super) & features = btrfs_super_compat_ro_flags(disk_super) &
...@@ -3122,7 +3116,7 @@ int close_ctree(struct btrfs_root *root) ...@@ -3122,7 +3116,7 @@ int close_ctree(struct btrfs_root *root)
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
{ {
int ret; int ret;
struct inode *btree_inode = buf->first_page->mapping->host; struct inode *btree_inode = buf->pages[0]->mapping->host;
ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf, ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
NULL); NULL);
...@@ -3136,14 +3130,14 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) ...@@ -3136,14 +3130,14 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
int btrfs_set_buffer_uptodate(struct extent_buffer *buf) int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
{ {
struct inode *btree_inode = buf->first_page->mapping->host; struct inode *btree_inode = buf->pages[0]->mapping->host;
return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
buf); buf);
} }
void btrfs_mark_buffer_dirty(struct extent_buffer *buf) void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{ {
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
u64 transid = btrfs_header_generation(buf); u64 transid = btrfs_header_generation(buf);
struct inode *btree_inode = root->fs_info->btree_inode; struct inode *btree_inode = root->fs_info->btree_inode;
int was_dirty; int was_dirty;
...@@ -3212,7 +3206,7 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) ...@@ -3212,7 +3206,7 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{ {
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
int ret; int ret;
ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
if (ret == 0) if (ret == 0)
......
...@@ -3548,26 +3548,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ...@@ -3548,26 +3548,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
inline struct page *extent_buffer_page(struct extent_buffer *eb, inline struct page *extent_buffer_page(struct extent_buffer *eb,
unsigned long i) unsigned long i)
{ {
struct page *p; return eb->pages[i];
struct address_space *mapping;
if (i == 0)
return eb->first_page;
i += eb->start >> PAGE_CACHE_SHIFT;
mapping = eb->first_page->mapping;
if (!mapping)
return NULL;
/*
* extent_buffer_page is only called after pinning the page
* by increasing the reference count. So we know the page must
* be in the radix tree.
*/
rcu_read_lock();
p = radix_tree_lookup(&mapping->page_tree, i);
rcu_read_unlock();
return p;
} }
inline unsigned long num_extent_pages(u64 start, u64 len) inline unsigned long num_extent_pages(u64 start, u64 len)
...@@ -3576,6 +3557,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len) ...@@ -3576,6 +3557,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
(start >> PAGE_CACHE_SHIFT); (start >> PAGE_CACHE_SHIFT);
} }
/*
 * Release the memory backing an extent_buffer structure.
 *
 * With LEAK_DEBUG enabled the buffer is first unlinked from the leak
 * tracking list under leak_lock.  The page-pointer array is freed only
 * when it is not the inline_pages array embedded in the structure, and
 * finally the structure itself goes back to extent_buffer_cache.
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
	unsigned long irq_flags;

	spin_lock_irqsave(&leak_lock, irq_flags);
	list_del(&eb->leak_list);
	spin_unlock_irqrestore(&leak_lock, irq_flags);
#endif
	/* kfree(NULL) is a no-op, so only the inline array needs excluding */
	if (eb->pages != eb->inline_pages)
		kfree(eb->pages);

	kmem_cache_free(extent_buffer_cache, eb);
}
static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, u64 start,
unsigned long len, unsigned long len,
...@@ -3608,21 +3602,25 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, ...@@ -3608,21 +3602,25 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
spin_unlock_irqrestore(&leak_lock, flags); spin_unlock_irqrestore(&leak_lock, flags);
#endif #endif
atomic_set(&eb->refs, 1); atomic_set(&eb->refs, 1);
atomic_set(&eb->pages_reading, 0);
if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
struct page **pages;
int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT;
pages = kzalloc(num_pages, mask);
if (!pages) {
__free_extent_buffer(eb);
return NULL;
}
eb->pages = pages;
} else {
eb->pages = eb->inline_pages;
}
return eb; return eb;
} }
static void __free_extent_buffer(struct extent_buffer *eb)
{
#if LEAK_DEBUG
unsigned long flags;
spin_lock_irqsave(&leak_lock, flags);
list_del(&eb->leak_list);
spin_unlock_irqrestore(&leak_lock, flags);
#endif
kmem_cache_free(extent_buffer_cache, eb);
}
/* /*
* Helper for releasing extent buffer page. * Helper for releasing extent buffer page.
*/ */
...@@ -3632,9 +3630,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, ...@@ -3632,9 +3630,6 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
unsigned long index; unsigned long index;
struct page *page; struct page *page;
if (!eb->first_page)
return;
index = num_extent_pages(eb->start, eb->len); index = num_extent_pages(eb->start, eb->len);
if (start_idx >= index) if (start_idx >= index)
return; return;
...@@ -3657,8 +3652,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) ...@@ -3657,8 +3652,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
} }
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len, u64 start, unsigned long len)
struct page *page0)
{ {
unsigned long num_pages = num_extent_pages(start, len); unsigned long num_pages = num_extent_pages(start, len);
unsigned long i; unsigned long i;
...@@ -3674,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, ...@@ -3674,7 +3668,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) { if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock(); rcu_read_unlock();
mark_page_accessed(eb->first_page); mark_page_accessed(eb->pages[0]);
return eb; return eb;
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -3683,32 +3677,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, ...@@ -3683,32 +3677,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
if (!eb) if (!eb)
return NULL; return NULL;
if (page0) { for (i = 0; i < num_pages; i++, index++) {
eb->first_page = page0;
i = 1;
index++;
page_cache_get(page0);
mark_page_accessed(page0);
set_page_extent_mapped(page0);
set_page_extent_head(page0, len);
uptodate = PageUptodate(page0);
} else {
i = 0;
}
for (; i < num_pages; i++, index++) {
p = find_or_create_page(mapping, index, GFP_NOFS); p = find_or_create_page(mapping, index, GFP_NOFS);
if (!p) { if (!p) {
WARN_ON(1); WARN_ON(1);
goto free_eb; goto free_eb;
} }
set_page_extent_mapped(p);
mark_page_accessed(p); mark_page_accessed(p);
if (i == 0) { eb->pages[i] = p;
eb->first_page = p;
set_page_extent_head(p, len);
} else {
set_page_private(p, EXTENT_PAGE_PRIVATE);
}
if (!PageUptodate(p)) if (!PageUptodate(p))
uptodate = 0; uptodate = 0;
...@@ -3716,8 +3692,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, ...@@ -3716,8 +3692,6 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* see below about how we avoid a nasty race with release page * see below about how we avoid a nasty race with release page
* and why we unlock later * and why we unlock later
*/ */
if (i != 0)
unlock_page(p);
} }
if (uptodate) if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
...@@ -3751,15 +3725,23 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, ...@@ -3751,15 +3725,23 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
* after the extent buffer is in the radix tree so * after the extent buffer is in the radix tree so
* it doesn't get lost * it doesn't get lost
*/ */
set_page_extent_mapped(eb->first_page); set_page_extent_mapped(eb->pages[0]);
set_page_extent_head(eb->first_page, eb->len); set_page_extent_head(eb->pages[0], eb->len);
if (!page0) SetPageChecked(eb->pages[0]);
unlock_page(eb->first_page); for (i = 1; i < num_pages; i++) {
p = extent_buffer_page(eb, i);
set_page_extent_mapped(p);
ClearPageChecked(p);
unlock_page(p);
}
unlock_page(eb->pages[0]);
return eb; return eb;
free_eb: free_eb:
if (eb->first_page && !page0) for (i = 0; i < num_pages; i++) {
unlock_page(eb->first_page); if (eb->pages[i])
unlock_page(eb->pages[i]);
}
if (!atomic_dec_and_test(&eb->refs)) if (!atomic_dec_and_test(&eb->refs))
return exists; return exists;
...@@ -3776,7 +3758,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, ...@@ -3776,7 +3758,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT); eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
if (eb && atomic_inc_not_zero(&eb->refs)) { if (eb && atomic_inc_not_zero(&eb->refs)) {
rcu_read_unlock(); rcu_read_unlock();
mark_page_accessed(eb->first_page); mark_page_accessed(eb->pages[0]);
return eb; return eb;
} }
rcu_read_unlock(); rcu_read_unlock();
...@@ -3981,8 +3963,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ...@@ -3981,8 +3963,8 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
int ret = 0; int ret = 0;
int locked_pages = 0; int locked_pages = 0;
int all_uptodate = 1; int all_uptodate = 1;
int inc_all_pages = 0;
unsigned long num_pages; unsigned long num_pages;
unsigned long num_reads = 0;
struct bio *bio = NULL; struct bio *bio = NULL;
unsigned long bio_flags = 0; unsigned long bio_flags = 0;
...@@ -4014,8 +3996,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ...@@ -4014,8 +3996,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
lock_page(page); lock_page(page);
} }
locked_pages++; locked_pages++;
if (!PageUptodate(page)) if (!PageUptodate(page)) {
num_reads++;
all_uptodate = 0; all_uptodate = 0;
}
} }
if (all_uptodate) { if (all_uptodate) {
if (start_i == 0) if (start_i == 0)
...@@ -4023,20 +4007,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, ...@@ -4023,20 +4007,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
goto unlock_exit; goto unlock_exit;
} }
atomic_set(&eb->pages_reading, num_reads);
for (i = start_i; i < num_pages; i++) { for (i = start_i; i < num_pages; i++) {
page = extent_buffer_page(eb, i); page = extent_buffer_page(eb, i);
WARN_ON(!PagePrivate(page));
set_page_extent_mapped(page); set_page_extent_mapped(page);
if (i == 0) if (i == 0)
set_page_extent_head(page, eb->len); set_page_extent_head(page, eb->len);
if (inc_all_pages)
page_cache_get(page);
if (!PageUptodate(page)) { if (!PageUptodate(page)) {
if (start_i == 0)
inc_all_pages = 1;
ClearPageError(page); ClearPageError(page);
err = __extent_read_full_page(tree, page, err = __extent_read_full_page(tree, page,
get_extent, &bio, get_extent, &bio,
...@@ -4304,15 +4281,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page, ...@@ -4304,15 +4281,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
{ {
char *dst_kaddr = page_address(dst_page); char *dst_kaddr = page_address(dst_page);
char *src_kaddr; char *src_kaddr;
int must_memmove = 0;
if (dst_page != src_page) { if (dst_page != src_page) {
src_kaddr = page_address(src_page); src_kaddr = page_address(src_page);
} else { } else {
src_kaddr = dst_kaddr; src_kaddr = dst_kaddr;
BUG_ON(areas_overlap(src_off, dst_off, len)); if (areas_overlap(src_off, dst_off, len))
must_memmove = 1;
} }
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); if (must_memmove)
memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
else
memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
} }
void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
...@@ -4382,7 +4364,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, ...@@ -4382,7 +4364,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
"len %lu len %lu\n", dst_offset, len, dst->len); "len %lu len %lu\n", dst_offset, len, dst->len);
BUG_ON(1); BUG_ON(1);
} }
if (!areas_overlap(src_offset, dst_offset, len)) { if (dst_offset < src_offset) {
memcpy_extent_buffer(dst, dst_offset, src_offset, len); memcpy_extent_buffer(dst, dst_offset, src_offset, len);
return; return;
} }
...@@ -4429,7 +4411,8 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ...@@ -4429,7 +4411,8 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
return ret; return ret;
} }
if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { if (atomic_read(&eb->refs) > 1 ||
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
ret = 0; ret = 0;
goto out; goto out;
} }
...@@ -4442,7 +4425,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) ...@@ -4442,7 +4425,6 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
ret = 0; ret = 0;
goto out; goto out;
} }
radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT); radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
out: out:
spin_unlock(&tree->buffer_lock); spin_unlock(&tree->buffer_lock);
......
...@@ -119,16 +119,18 @@ struct extent_state { ...@@ -119,16 +119,18 @@ struct extent_state {
struct list_head leak_list; struct list_head leak_list;
}; };
#define INLINE_EXTENT_BUFFER_PAGES 16
#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
struct extent_buffer { struct extent_buffer {
u64 start; u64 start;
unsigned long len; unsigned long len;
unsigned long map_start; unsigned long map_start;
unsigned long map_len; unsigned long map_len;
struct page *first_page;
unsigned long bflags; unsigned long bflags;
atomic_t refs;
atomic_t pages_reading;
struct list_head leak_list; struct list_head leak_list;
struct rcu_head rcu_head; struct rcu_head rcu_head;
atomic_t refs;
pid_t lock_owner; pid_t lock_owner;
/* count of read lock holders on the extent buffer */ /* count of read lock holders on the extent buffer */
...@@ -152,6 +154,9 @@ struct extent_buffer { ...@@ -152,6 +154,9 @@ struct extent_buffer {
* to unlock * to unlock
*/ */
wait_queue_head_t read_lock_wq; wait_queue_head_t read_lock_wq;
wait_queue_head_t lock_wq;
struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES];
struct page **pages;
}; };
static inline void extent_set_compress_type(unsigned long *bio_flags, static inline void extent_set_compress_type(unsigned long *bio_flags,
...@@ -251,8 +256,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); ...@@ -251,8 +256,7 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page); void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len, u64 start, unsigned long len);
struct page *page0);
struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
u64 start, unsigned long len); u64 start, unsigned long len);
void free_extent_buffer(struct extent_buffer *eb); void free_extent_buffer(struct extent_buffer *eb);
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "ctree.h" #include "ctree.h"
#include "disk-io.h" #include "disk-io.h"
#include "transaction.h" #include "transaction.h"
#include "print-tree.h"
static int find_name_in_backref(struct btrfs_path *path, const char *name, static int find_name_in_backref(struct btrfs_path *path, const char *name,
int name_len, struct btrfs_inode_ref **ref_ret) int name_len, struct btrfs_inode_ref **ref_ret)
......
...@@ -4384,7 +4384,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) ...@@ -4384,7 +4384,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
* to silence the warning eg. on PowerPC 64. * to silence the warning eg. on PowerPC 64.
*/ */
if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE) if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->first_page); SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy); array_size = btrfs_super_sys_array_size(super_copy);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment