Commit 734db689 authored by Chris Mason, committed by Linus Torvalds

[PATCH] reiserfs: block allocator optimizations

From: <mason@suse.com>
From: <jeffm@suse.com>

The current reiserfs allocator allocates blocks pretty much sequentially
from the start of the disk.  That works very nicely for desktop loads, but
once you've got more than one process doing I/O, data files can fragment badly.

One obvious solution is something like ext2's bitmap groups, which put
file data into different areas of the disk based on which subdirectory
the files are in.  The problem with bitmap groups is that if you've got a
group of subdirectories, their contents will be spread out all over the
disk, leading to lots of seeks during a sequential read.

This allocator patch uses the packing locality to determine which bitmap
group to allocate from, but when you create a file it looks in the bitmaps
to see how 'full' that packing locality already is.  If it hasn't been
heavily used yet, the packing locality is inherited from the parent
directory, putting files in new subdirs close to the parent subdir;
otherwise it is the inode number of the parent directory, putting new
files far away from the parent subdir.
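
The choice is implemented by reiserfs_choose_packing() (declared later in
the patch).  A minimal sketch of the logic described above, where
block_group_used() is a stand-in name for the 'how full is this locality
already' bitmap check, not necessarily the patch's own helper:

    /*
     * sketch: reuse the parent's packing locality while its bitmap
     * group is still lightly used, otherwise start a new locality
     * keyed off the parent directory itself.
     */
    u32 reiserfs_choose_packing(struct inode *dir)
    {
	u32 packing;
	if (TEST_OPTION(packing_groups, dir->i_sb)) {
	    u32 parent_dir = le32_to_cpu(INODE_PKEY(dir)->k_dir_id);
	    /* locality 1 (the root) is special, always hash away from it */
	    if (parent_dir == 1 || block_group_used(dir->i_sb, parent_dir))
		packing = INODE_PKEY(dir)->k_objectid;
	    else
		packing = INODE_PKEY(dir)->k_dir_id;
	} else
	    packing = INODE_PKEY(dir)->k_objectid;
	return packing;
    }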

The end result is fewer bitmap groups for the same working set.  For
example, one test data set created by 20 procs running in parallel has
6822 subdirs.  With vanilla reiserfs that would mean 6822 packing
localities; this patch turns that into 26 packing localities.

This makes sequential reads of big directory trees more efficient, and it
also makes the btree more efficient in general.  Things end up sorted
better because groups of subdirs get similar keys in the btree, instead
of being spread out all over.

The bitmap grouping code tries to use the start of each bitmap group
for metadata, and offsets the data slightly.  The data and metadata
are still close together, but not completely intermixed like they are
in the default allocator.  The end result is that leaf nodes tend to be
close to each other, making metadata readahead more effective.
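
Roughly, the grouping computes a search start like this (a sketch only:
bmap_hash_id() is an assumed helper that maps an id to a bitmap group, and
the half-block offset for data is illustrative, not the patch's exact
number):

    /*
     * sketch: point the allocator at the bitmap group this directory
     * id hashes to, and push data a little past the group's start so
     * the first blocks stay available for metadata (leaf nodes).
     */
    static void dirid_groups(reiserfs_blocknr_hint_t *hint)
    {
	struct super_block *sb = hint->th->t_super;
	__u32 dirid = le32_to_cpu(INODE_PKEY(hint->inode)->k_dir_id);
	int bm = bmap_hash_id(sb, dirid);	/* assumed helper */
	/* one bitmap block maps blocksize * 8 disk blocks */
	b_blocknr_t start = bm * (sb->s_blocksize << 3);

	if (!hint->formatted_node)	/* data, not metadata */
	    start += sb->s_blocksize / 2;
	hint->search_start = start;
    }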

The old block allocator had the ability to enforce a minimum allocation
size but did not use it.  It now does a first pass looking for larger
allocation chunks before falling back to the old behaviour of taking any
blocks it can find.
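
A sketch of that fallback, assuming a helper with the shape
scan_bitmap(hint, results, min, amount) that fails unless it can find at
least 'min' contiguous blocks:

    /* first insist on one contiguous chunk, then take anything free */
    static int alloc_with_fallback(reiserfs_blocknr_hint_t *hint,
				   b_blocknr_t *new_blocknrs, int amount)
    {
	int nr = scan_bitmap(hint, new_blocknrs, amount, amount);
	if (nr == 0)
	    nr = scan_bitmap(hint, new_blocknrs, 1, amount);
	return nr;
    }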

The patch changes the defaults to:

mount -o alloc=skip_busy:dirid_groups:packing_groups

You can get back the old behaviour with mount -o alloc=skip_busy

mount -o alloc=dirid_groups turns on the bitmap groups
mount -o alloc=packing_groups turns on the packing locality reduction code
mount -o alloc=skip_busy:dirid_groups turns on both skip_busy and
dirid_groups
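
In the code, those defaults come from the new reiserfs_init_alloc_options()
(called from reiserfs_fill_super() in the hunks below).  A plausible
sketch, assuming the option bits are simply named after the mount options:

    /* sketch: default to skip_busy:dirid_groups:packing_groups */
    void reiserfs_init_alloc_options(struct super_block *s)
    {
	set_bit(_ALLOC_skip_busy, &SB_ALLOC_OPTS(s));
	set_bit(_ALLOC_dirid_groups, &SB_ALLOC_OPTS(s));
	set_bit(_ALLOC_packing_groups, &SB_ALLOC_OPTS(s));
    }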

Finally, the patch adds mount -o alloc=oid_groups, which puts files into
bitmap groups based on a hash of their objectid.  This is meant for
databases or other situations where you have a limited number of very
large files.
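
That presumably amounts to hashing the file's own objectid instead of its
directory id; a sketch using the same assumed bmap_hash_id() helper as
above:

    /* sketch: spread a few very large files evenly across bitmap groups */
    static void oid_groups(reiserfs_blocknr_hint_t *hint)
    {
	__u32 oid = le32_to_cpu(INODE_PKEY(hint->inode)->k_objectid);
	hint->search_start = bmap_hash_id(hint->th->t_super, oid) *
			     (hint->th->t_super->s_blocksize << 3);
    }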

This command will tell you how many packing localities are actually in
use:

debugreiserfs -d /dev/xxx | grep '^|.*SD' | sed 's/^.....//' | awk '{print $1}' | sort -u | wc -l

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent fab177a4
@@ -176,12 +176,13 @@ int reiserfs_allocate_blocks_for_region(
     hint.formatted_node = 0; // We are allocating blocks for unformatted node.
     /* only preallocate if this is a small write */
-    if (blocks_to_allocate <
-	REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize)
+    if (REISERFS_I(inode)->i_prealloc_count ||
+       (!(write_bytes & (inode->i_sb->s_blocksize -1)) &&
+	blocks_to_allocate <
+	REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize))
 	hint.preallocate = 1;
     else
 	hint.preallocate = 0;
     /* Call block allocator to allocate blocks */
     res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate);
     if ( res != CARRY_ON ) {
@@ -467,6 +468,12 @@ int reiserfs_allocate_blocks_for_region(
     // the inode.
     //
     pathrelse(&path);
+    /*
+     * cleanup preallocation from previous writes
+     * if this is a partial block write
+     */
+    if (write_bytes & (inode->i_sb->s_blocksize -1))
+	reiserfs_discard_prealloc(th, inode);
     reiserfs_write_unlock(inode->i_sb);
     // go through all the pages/buffers and map the buffers to newly allocated
@@ -1254,6 +1261,7 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
	journal_end(&th, th.t_super, th.t_blocks_allocated);
	reiserfs_write_unlock(inode->i_sb);
     }
     if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
@@ -1660,7 +1660,7 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
     sb = dir->i_sb;
     /* item head of new item */
-    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
+    ih.ih_key.k_dir_id = reiserfs_choose_packing(dir);
     ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
     if (!ih.ih_key.k_objectid) {
	err = -ENOMEM;
@@ -1729,7 +1729,6 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
	err = -EEXIST;
	goto out_bad_inode;
     }
-
     if (old_format_only (sb)) {
	if (inode->i_uid & ~0xffff || inode->i_gid & ~0xffff) {
	    pathrelse (&path_to_key);
@@ -492,7 +492,6 @@ static void reiserfs_clear_inode (struct inode *inode)
     REISERFS_I(inode)->i_acl_default = NULL;
 }
-
 struct super_operations reiserfs_sops =
 {
     .alloc_inode = reiserfs_alloc_inode,
@@ -651,7 +650,7 @@ static int reiserfs_getopt ( struct super_block * s, char ** cur, opt_desc_t * o
     reiserfs_warning (s, "head of option \"%s\" is only correct", opt->option_name);
     return -1;
 }
 /* move to the argument, or to next option if argument is not required */
 p ++;
@@ -1345,15 +1344,17 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
     memset (sbi, 0, sizeof (struct reiserfs_sb_info));
     /* Set default values for options: non-aggressive tails */
     REISERFS_SB(s)->s_mount_opt = ( 1 << REISERFS_SMALLTAIL );
-    /* default block allocator option: skip_busy */
-    REISERFS_SB(s)->s_alloc_options.bits = ( 1 << 5);
-    /* If file grew past 4 blocks, start preallocation blocks for it. */
-    REISERFS_SB(s)->s_alloc_options.preallocmin = 4;
+    /* no preallocation minimum, be smart in
+       reiserfs_file_write instead */
+    REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
     /* Preallocate by 16 blocks (17-1) at once */
     REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
     /* Initialize the rwsem for xattr dir */
     init_rwsem(&REISERFS_SB(s)->xattr_dir_sem);
+    /* setup default block allocator options */
+    reiserfs_init_alloc_options(s);
     jdev_name = NULL;
     if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) {
	goto error;
@@ -1247,7 +1247,7 @@ struct path {
 #define pos_in_item(path) ((path)->pos_in_item)
 #define INITIALIZE_PATH(var) \
-struct path var = {ILLEGAL_PATH_ELEMENT_OFFSET, }
+struct path var = {.path_length = ILLEGAL_PATH_ELEMENT_OFFSET,}
 /* Get path element by path and path position. */
 #define PATH_OFFSET_PELEMENT(p_s_path,n_offset) ((p_s_path)->path_elements +(n_offset))
@@ -2149,6 +2149,15 @@ struct buffer_head * get_FEB (struct tree_balance *);
 typedef struct __reiserfs_blocknr_hint reiserfs_blocknr_hint_t;
 int reiserfs_parse_alloc_options (struct super_block *, char *);
+void reiserfs_init_alloc_options (struct super_block *s);
+/*
+ * given a directory, this will tell you what packing locality
+ * to use for a new object underneath it. The locality is returned
+ * in disk byte order (le).
+ */
+u32 reiserfs_choose_packing(struct inode *dir);
 int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
 void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);