Commit a86c6181, authored by Alex Tomas and committed by Linus Torvalds

[PATCH] ext3: add extent map support

On disk extents format:
/*
* this is extent on-disk structure
* it's used at the bottom of the tree
*/
struct ext3_extent {
__le32  ee_block;       /* first logical block extent covers */
__le16  ee_len;         /* number of blocks covered by extent */
__le16  ee_start_hi;    /* high 16 bits of physical block */
__le32  ee_start;       /* low 32 bits of physical block */
};
Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent c3fcc813
......@@ -5,7 +5,7 @@
obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o
ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o
ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o
......
......@@ -134,8 +134,7 @@ static int ext4_readdir(struct file * filp,
struct buffer_head *bh = NULL;
map_bh.b_state = 0;
err = ext4_get_blocks_handle(NULL, inode, blk, 1,
&map_bh, 0, 0);
err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
if (err > 0) {
page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
&filp->f_ra,
......
This diff is collapsed.
......@@ -615,6 +615,17 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
ext4_std_error(sb, err);
goto fail_free_drop;
}
if (test_opt(sb, EXTENTS)) {
EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
ext4_ext_tree_init(handle, inode);
if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
if (err) goto fail;
EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS);
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "call ext4_journal_dirty_metadata");
err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh);
}
}
ext4_debug("allocating inode %lu\n", inode->i_ino);
goto really_out;
......
......@@ -40,8 +40,6 @@
#include "xattr.h"
#include "acl.h"
static int ext4_writepage_trans_blocks(struct inode *inode);
/*
* Test whether an inode is a fast symlink.
*/
......@@ -804,6 +802,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
ext4_fsblk_t first_block = 0;
J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
J_ASSERT(handle != NULL || create == 0);
depth = ext4_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
......@@ -984,7 +983,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
get_block:
if (ret == 0) {
ret = ext4_get_blocks_handle(handle, inode, iblock,
ret = ext4_get_blocks_wrap(handle, inode, iblock,
max_blocks, bh_result, create, 0);
if (ret > 0) {
bh_result->b_size = (ret << inode->i_blkbits);
......@@ -1008,7 +1007,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
dummy.b_state = 0;
dummy.b_blocknr = -1000;
buffer_trace_init(&dummy.b_history);
err = ext4_get_blocks_handle(handle, inode, block, 1,
err = ext4_get_blocks_wrap(handle, inode, block, 1,
&dummy, create, 1);
/*
* ext4_get_blocks_handle() returns number of blocks
......@@ -1759,7 +1758,7 @@ void ext4_set_aops(struct inode *inode)
* This required during truncate. We need to physically zero the tail end
* of that block so it doesn't yield old data if the file is later grown.
*/
static int ext4_block_truncate_page(handle_t *handle, struct page *page,
int ext4_block_truncate_page(handle_t *handle, struct page *page,
struct address_space *mapping, loff_t from)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
......@@ -2263,6 +2262,9 @@ void ext4_truncate(struct inode *inode)
return;
}
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
return ext4_ext_truncate(inode, page);
handle = start_transaction(inode);
if (IS_ERR(handle)) {
if (page) {
......@@ -3003,12 +3005,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* block and work out the exact number of indirects which are touched. Pah.
*/
static int ext4_writepage_trans_blocks(struct inode *inode)
int ext4_writepage_trans_blocks(struct inode *inode)
{
int bpp = ext4_journal_blocks_per_page(inode);
int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
int ret;
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
return ext4_ext_writepage_trans_blocks(inode, bpp);
if (ext4_should_journal_data(inode))
ret = 3 * (bpp + indirects) + 2;
else
......
......@@ -248,7 +248,6 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
return err;
}
default:
return -ENOTTY;
}
......
......@@ -390,6 +390,7 @@ static void ext4_put_super (struct super_block * sb)
struct ext4_super_block *es = sbi->s_es;
int i;
ext4_ext_release(sb);
ext4_xattr_put_super(sb);
jbd2_journal_destroy(sbi->s_journal);
if (!(sb->s_flags & MS_RDONLY)) {
......@@ -454,6 +455,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
#endif
ei->i_block_alloc_info = NULL;
ei->vfs_inode.i_version = 1;
memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
return &ei->vfs_inode;
}
......@@ -677,7 +679,7 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
Opt_grpquota
Opt_grpquota, Opt_extents,
};
static match_table_t tokens = {
......@@ -727,6 +729,7 @@ static match_table_t tokens = {
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
{Opt_extents, "extents"},
{Opt_err, NULL},
{Opt_resize, "resize"},
};
......@@ -1059,6 +1062,9 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_bh:
clear_opt(sbi->s_mount_opt, NOBH);
break;
case Opt_extents:
set_opt (sbi->s_mount_opt, EXTENTS);
break;
default:
printk (KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
......@@ -1787,6 +1793,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
ext4_ext_init(sb);
lock_kernel();
return 0;
......
......@@ -178,8 +178,9 @@ struct ext4_group_desc
#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */
#define EXT4_FL_USER_VISIBLE 0x0003DFFF /* User visible flags */
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */
/*
......@@ -384,6 +385,7 @@ struct ext4_inode {
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */
#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */
#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */
/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
#ifndef _LINUX_EXT2_FS_H
......@@ -582,11 +584,13 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */
#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG)
EXT4_FEATURE_INCOMPAT_META_BG| \
EXT4_FEATURE_INCOMPAT_EXTENTS)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
......@@ -825,6 +829,9 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern void ext4_truncate (struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
struct address_space *mapping, loff_t from);
/* ioctl.c */
extern int ext4_ioctl (struct inode *, struct file *, unsigned int,
......@@ -879,6 +886,28 @@ extern struct inode_operations ext4_special_inode_operations;
extern struct inode_operations ext4_symlink_inode_operations;
extern struct inode_operations ext4_fast_symlink_inode_operations;
/* extents.c */
extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t iblock,
unsigned long max_blocks, struct buffer_head *bh_result,
int create, int extend_disksize);
extern void ext4_ext_truncate(struct inode *, struct page *);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
/*
 * Map logical blocks of @inode, dispatching on the inode's mapping format.
 *
 * Inodes carrying EXT4_EXTENTS_FL in their i_flags are handled by
 * ext4_ext_get_blocks(); every other inode goes through
 * ext4_get_blocks_handle().  All arguments are forwarded unchanged and
 * the selected helper's return value is handed back as-is.
 */
static inline int
ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
			unsigned long max_blocks, struct buffer_head *bh,
			int create, int extend_disksize)
{
	int ret;

	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
		ret = ext4_get_blocks_handle(handle, inode, block, max_blocks,
						bh, create, extend_disksize);
	else
		ret = ext4_ext_get_blocks(handle, inode, block, max_blocks,
						bh, create, extend_disksize);

	return ret;
}
#endif /* __KERNEL__ */
......
/*
* Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
* Written by Alex Tomas <alex@clusterfs.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _LINUX_EXT4_EXTENTS
#define _LINUX_EXT4_EXTENTS
#include <linux/ext4_fs.h>
/*
 * Compile-time debugging switches.
 *
 * Every switch below is defined with one or two trailing underscores
 * appended to the name its comment describes, so the documented name
 * itself is NOT defined by this header.
 */
/*
 * With AGRESSIVE_TEST defined, the capacity of index/leaf blocks
 * becomes very small, so index splits, in-depth growing and
 * other hard changes happen much more often.
 * This is for debug purposes only.
 */
#define AGRESSIVE_TEST_
/*
 * With EXTENTS_STATS defined, the numbers of blocks and extents
 * are collected in the truncate path. They'll be shown at
 * umount time.
 */
#define EXTENTS_STATS__
/*
 * If CHECK_BINSEARCH is defined, then the results of binary search
 * will be checked by linear search.
 */
#define CHECK_BINSEARCH__
/*
 * If EXT_DEBUG is defined you can use the 'extdebug' mount option
 * to get lots of info about what's going on.
 */
#define EXT_DEBUG__
/*
 * ext_debug() forwards its arguments to printk() when EXT_DEBUG is
 * defined and expands to nothing otherwise.
 */
#ifdef EXT_DEBUG
#define ext_debug(a...) printk(a)
#else
#define ext_debug(a...)
#endif
/*
 * If EXT_STATS is defined then stats numbers are collected.
 * These numbers will be displayed at umount time.
 */
#define EXT_STATS_
/*
 * ext4_inode has an i_block array (60 bytes total).
 * The first 12 bytes store the ext4_extent_header;
 * the remainder stores an array of ext4_extent.
 */
/*
 * This is the on-disk extent structure.
 * It is used at the bottom (leaf level) of the tree.
 * All fields are stored little-endian; the physical start block is split
 * into a 32-bit low part (ee_start) and a 16-bit high part (ee_start_hi).
 */
struct ext4_extent {
__le32 ee_block; /* first logical block extent covers */
__le16 ee_len; /* number of blocks covered by extent */
__le16 ee_start_hi; /* high 16 bits of physical block */
__le32 ee_start; /* low 32 bits of physical block */
};
/*
 * This is the on-disk index structure.
 * It is used at all levels of the tree except the bottom.
 */
struct ext4_extent_idx {
__le32 ei_block; /* index covers logical blocks from 'block' */
__le32 ei_leaf; /* pointer to the physical block of the next *
* level. a leaf or another index could be there */
__le16 ei_leaf_hi; /* high 16 bits of physical block */
__u16 ei_unused;
};
/*
 * Each block (leaves and indexes), even the one stored in the inode,
 * starts with this header.
 */
struct ext4_extent_header {
__le16 eh_magic; /* probably will support different formats */
__le16 eh_entries; /* number of valid entries */
__le16 eh_max; /* capacity of store in entries */
__le16 eh_depth; /* has tree real underlying blocks? */
__le32 eh_generation; /* generation of the tree */
};
/*
 * Magic number 0xf30a, already converted to little-endian 16-bit form.
 * NOTE(review): presumably the value expected in eh_magic - confirm
 * against the users in extents.c.
 */
#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a)
/*
 * An array of ext4_ext_path contains the path to some extent.
 * Creation/lookup routines use it for traversal/splitting/etc.
 * Truncate uses it to simulate recursive walking.
 *
 * NOTE(review): each array element presumably describes one level of
 * the tree - confirm against ext4_ext_find_extent().
 */
struct ext4_ext_path {
__u32 p_block;
__u16 p_depth;
struct ext4_extent *p_ext;
struct ext4_extent_idx *p_idx;
struct ext4_extent_header *p_hdr; /* header read by EXT_HAS_FREE_INDEX() */
struct buffer_head *p_bh;
};
/*
 * structure for external API
 *
 * NOTE(review): no structure follows this comment. The constants below
 * are cache-type codes: ext4_ext_invalidate_cache() stores
 * EXT4_EXT_CACHE_NO into ext4_ext_cache.ec_type; GAP and EXTENT are
 * presumably the other values of that field - confirm.
 */
#define EXT4_EXT_CACHE_NO 0
#define EXT4_EXT_CACHE_GAP 1
#define EXT4_EXT_CACHE_EXTENT 2
/*
 * Callback type, to be called by ext4_ext_walk_space():
 *   negative retcode - error
 *   positive retcode - signal for ext4_ext_walk_space(), see below
 * The callback must return a valid extent (passed or newly created).
 */
typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
struct ext4_ext_cache *,
void *);
/*
 * NOTE(review): presumably the non-error return codes a callback hands
 * back to ext4_ext_walk_space() - confirm their exact meaning there.
 */
#define EXT_CONTINUE 0
#define EXT_BREAK 1
#define EXT_REPEAT 2
/*
 * Largest value that fits in 32 bits.
 * NOTE(review): presumably the highest logical block number - confirm.
 */
#define EXT_MAX_BLOCK 0xffffffff
/*
 * Entry accessors. Each takes a pointer to an ext4_extent_header (or, for
 * EXT_HAS_FREE_INDEX, a pointer to an ext4_ext_path) and computes entry
 * addresses relative to the first byte after the header.
 */
/* first entry after the header, viewed as an extent */
#define EXT_FIRST_EXTENT(__hdr__) \
((struct ext4_extent *) (((char *) (__hdr__)) + \
sizeof(struct ext4_extent_header)))
/* first entry after the header, viewed as an index */
#define EXT_FIRST_INDEX(__hdr__) \
((struct ext4_extent_idx *) (((char *) (__hdr__)) + \
sizeof(struct ext4_extent_header)))
/* true while the path's header holds fewer entries (eh_entries) than eh_max */
#define EXT_HAS_FREE_INDEX(__path__) \
(le16_to_cpu((__path__)->p_hdr->eh_entries) \
< le16_to_cpu((__path__)->p_hdr->eh_max))
/* last valid entry: first + eh_entries - 1 (one before first if eh_entries is 0) */
#define EXT_LAST_EXTENT(__hdr__) \
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_LAST_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
/* last slot the block can hold: first + eh_max - 1 */
#define EXT_MAX_EXTENT(__hdr__) \
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
#define EXT_MAX_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
/*
 * View the inode's in-memory i_data area as an extent header.
 */
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
	struct ext4_extent_header *hdr;

	hdr = (struct ext4_extent_header *) EXT4_I(inode)->i_data;
	return hdr;
}
/*
 * View the start of a buffer's data (bh->b_data) as an extent header.
 */
static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
{
	struct ext4_extent_header *hdr;

	hdr = (struct ext4_extent_header *) bh->b_data;
	return hdr;
}
/*
 * Return eh_depth from the header stored in the inode, converted from
 * its little-endian on-disk form to CPU byte order.
 */
static inline unsigned short ext_depth(struct inode *inode)
{
	unsigned short depth = le16_to_cpu(ext_inode_hdr(inode)->eh_depth);

	return depth;
}
/*
 * Bump the inode's in-memory extent generation counter by one.
 */
static inline void ext4_ext_tree_changed(struct inode *inode)
{
	EXT4_I(inode)->i_ext_generation += 1;
}
/*
 * Mark the inode's cached extent as holding nothing by resetting its
 * type to EXT4_EXT_CACHE_NO; the other cache fields are left untouched.
 */
static inline void ext4_ext_invalidate_cache(struct inode *inode)
{
	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
}
/*
 * Routines exported by the extents code.
 *
 * NOTE(review): ext4_extent_tree_init() is declared only here; the
 * ext4_fs.h part of this patch declares ext4_ext_tree_init() with the same
 * parameter types and ext4_new_inode() calls that name. This prototype
 * looks like a stale spelling - confirm against extents.c before renaming
 * or removing it.
 */
extern int ext4_extent_tree_init(handle_t *, struct inode *);
extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
#endif /* _LINUX_EXT4_EXTENTS */
......@@ -64,6 +64,16 @@ struct ext4_block_alloc_info {
#define rsv_start rsv_window._rsv_start
#define rsv_end rsv_window._rsv_end
/*
 * Storage for a cached extent (embedded in ext4_inode_info as
 * i_cached_extent).
 *
 * NOTE(review): ec_start is 32 bits wide, while the on-disk extent keeps
 * its physical block as ee_start (32 bits) plus ee_start_hi (16 bits).
 * If ec_start holds a physical block number, the high bits cannot be
 * cached - confirm this is intended.
 */
struct ext4_ext_cache {
__u32 ec_start;
__u32 ec_block;
__u32 ec_len; /* must be 32bit to return holes */
__u32 ec_type; /* set to EXT4_EXT_CACHE_NO by ext4_ext_invalidate_cache() */
};
/*
* third extended file system inode data in memory
*/
......@@ -142,6 +152,9 @@ struct ext4_inode_info {
*/
struct mutex truncate_mutex;
struct inode vfs_inode;
unsigned long i_ext_generation;
struct ext4_ext_cache i_cached_extent;
};
#endif /* _LINUX_EXT4_FS_I */
......@@ -78,6 +78,16 @@ struct ext4_sb_info {
char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */
int s_jquota_fmt; /* Format of quota to use */
#endif
#ifdef EXTENTS_STATS
/* ext4 extents stats */
unsigned long s_ext_min;
unsigned long s_ext_max;
unsigned long s_depth_max;
spinlock_t s_ext_stats_lock;
unsigned long s_ext_blocks;
unsigned long s_ext_extents;
#endif
};
#endif /* _LINUX_EXT4_FS_SB */
......@@ -26,9 +26,14 @@
*
* We may have to touch one inode, one bitmap buffer, up to three
* indirection blocks, the group and superblock summaries, and the data
* block to complete the transaction. */
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
* 5 levels of tree + root which is stored in inode. */
#define EXT4_SINGLEDATA_TRANS_BLOCKS 8U
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
|| test_opt(sb, EXTENTS) ? 27U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
......@@ -42,7 +47,7 @@
* superblock only gets updated once, of course, so don't bother
* counting that again for the quota updates. */
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS + \
#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
EXT4_XATTR_TRANS_BLOCKS - 2 + \
2*EXT4_QUOTA_TRANS_BLOCKS(sb))
......@@ -78,9 +83,9 @@
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
#else
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment