Commit 17aff938 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] ext3: scalable counters and locks

From: Alex Tomas <bzzz@tmi.comex.ru>

This is a port from ext2 of the fuzzy counters (for Orlov allocator
heuristics) and the hashed spinlocking (for the inode and block allocators).
parent c12b9866
......@@ -110,6 +110,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
struct super_block * sb;
struct ext3_group_desc * gdp;
struct ext3_super_block * es;
struct ext3_sb_info *sbi;
int err = 0, ret;
int dquot_freed_blocks = 0;
......@@ -118,6 +119,7 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
printk ("ext3_free_blocks: nonexistent device");
return;
}
sbi = EXT3_SB(sb);
es = EXT3_SB(sb)->s_es;
if (block < le32_to_cpu(es->s_first_data_block) ||
block + count < block ||
......@@ -242,11 +244,12 @@ void ext3_free_blocks (handle_t *handle, struct inode * inode,
}
}
spin_lock(bg_lock(sb, block_group));
spin_lock(sb_bgl_lock(sbi, block_group));
gdp->bg_free_blocks_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) +
dquot_freed_blocks);
spin_unlock(bg_lock(sb, block_group));
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_mod(&sbi->s_freeblocks_counter, count);
/* We dirtied the bitmap block */
BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
......@@ -429,7 +432,7 @@ ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
have_access = 1;
}
if (!claim_block(bg_lock(sb, group), goal, bitmap_bh)) {
if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group), goal, bitmap_bh)) {
/*
* The block was allocated by another thread, or it was
* allocated and then freed by another thread
......@@ -477,11 +480,11 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
int target_block; /* tmp */
int fatal = 0, err;
int performed_allocation = 0;
int free;
int use_reserve = 0;
int free_blocks, root_blocks;
struct super_block *sb;
struct ext3_group_desc *gdp;
struct ext3_super_block *es;
struct ext3_sb_info *sbi;
#ifdef EXT3FS_DEBUG
static int goal_hits = 0, goal_attempts = 0;
#endif
......@@ -500,9 +503,19 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
return 0;
}
sbi = EXT3_SB(sb);
es = EXT3_SB(sb)->s_es;
ext3_debug("goal=%lu.\n", goal);
free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
root_blocks = le32_to_cpu(es->s_r_blocks_count);
if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
sbi->s_resuid != current->fsuid &&
(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
*errp = -ENOSPC;
return 0;
}
/*
* First, test whether the goal block is free.
*/
......@@ -515,9 +528,8 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
if (!gdp)
goto io_error;
free = le16_to_cpu(gdp->bg_free_blocks_count);
free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
if (free > 0) {
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
if (free_blocks > 0) {
ret_block = ((goal - le32_to_cpu(es->s_first_data_block)) %
EXT3_BLOCKS_PER_GROUP(sb));
bitmap_bh = read_block_bitmap(sb, group_no);
......@@ -535,7 +547,6 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
* Now search the rest of the groups. We assume that
* i and gdp correctly point to the last group visited.
*/
repeat:
for (bgi = 0; bgi < EXT3_SB(sb)->s_groups_count; bgi++) {
group_no++;
if (group_no >= EXT3_SB(sb)->s_groups_count)
......@@ -545,10 +556,8 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
*errp = -EIO;
goto out;
}
free = le16_to_cpu(gdp->bg_free_blocks_count);
if (!use_reserve)
free -= EXT3_SB(sb)->s_bgi[group_no].bg_reserved;
if (free <= 0)
free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
if (free_blocks <= 0)
continue;
brelse(bitmap_bh);
......@@ -563,15 +572,6 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
goto allocated;
}
if (!use_reserve &&
(EXT3_SB(sb)->s_resuid == current->fsuid ||
(EXT3_SB(sb)->s_resgid != 0 && in_group_p(EXT3_SB(sb)->s_resgid)) ||
capable(CAP_SYS_RESOURCE))) {
use_reserve = 1;
group_no = 0;
goto repeat;
}
/* No space left on the device */
*errp = -ENOSPC;
goto out;
......@@ -612,13 +612,13 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
}
}
#endif
spin_lock(bg_lock(sb, group_no));
spin_lock(sb_bgl_lock(sbi, group_no));
if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data)
J_ASSERT_BH(bitmap_bh,
!ext3_test_bit(ret_block,
bh2jh(bitmap_bh)->b_committed_data));
ext3_debug("found bit %d\n", ret_block);
spin_unlock(bg_lock(sb, group_no));
spin_unlock(sb_bgl_lock(sbi, group_no));
/* ret_block was blockgroup-relative. Now it becomes fs-relative */
ret_block = target_block;
......@@ -639,10 +639,11 @@ ext3_new_block(handle_t *handle, struct inode *inode, unsigned long goal,
ext3_debug("allocating block %d. Goal hits %d of %d.\n",
ret_block, goal_hits, goal_attempts);
spin_lock(bg_lock(sb, group_no));
spin_lock(sb_bgl_lock(sbi, group_no));
gdp->bg_free_blocks_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1);
spin_unlock(bg_lock(sb, group_no));
spin_unlock(sb_bgl_lock(sbi, group_no));
percpu_counter_mod(&sbi->s_freeblocks_counter, -1);
BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
err = ext3_journal_dirty_metadata(handle, gdp_bh);
......
......@@ -97,6 +97,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
unsigned long bit;
struct ext3_group_desc * gdp;
struct ext3_super_block * es;
struct ext3_sb_info *sbi = EXT3_SB(sb);
int fatal = 0, err;
if (atomic_read(&inode->i_count) > 1) {
......@@ -161,13 +162,17 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
if (fatal) goto error_return;
if (gdp) {
spin_lock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
spin_lock(sb_bgl_lock(sbi, block_group));
gdp->bg_free_inodes_count = cpu_to_le16(
le16_to_cpu(gdp->bg_free_inodes_count) + 1);
if (is_directory)
gdp->bg_used_dirs_count = cpu_to_le16(
le16_to_cpu(gdp->bg_used_dirs_count) - 1);
spin_unlock(&EXT3_SB(sb)->s_bgi[block_group].bg_ialloc_lock);
spin_unlock(sb_bgl_lock(sbi, block_group));
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (is_directory)
percpu_counter_dec(&sbi->s_dirs_counter);
}
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
......@@ -196,11 +201,14 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
static int find_group_dir(struct super_block *sb, struct inode *parent)
{
int ngroups = EXT3_SB(sb)->s_groups_count;
int avefreei = ext3_count_free_inodes(sb) / ngroups;
int freei, avefreei;
struct ext3_group_desc *desc, *best_desc = NULL;
struct buffer_head *bh;
int group, best_group = -1;
freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter);
avefreei = freei / ngroups;
for (group = 0; group < ngroups; group++) {
desc = ext3_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
......@@ -252,17 +260,20 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
struct ext3_super_block *es = sbi->s_es;
int ngroups = sbi->s_groups_count;
int inodes_per_group = EXT3_INODES_PER_GROUP(sb);
int freei = ext3_count_free_inodes(sb);
int avefreei = freei / ngroups;
int freeb = ext3_count_free_blocks(sb);
int avefreeb = freeb / ngroups;
int blocks_per_dir;
int ndirs = ext3_count_dirs(sb);
int freei, avefreei;
int freeb, avefreeb;
int blocks_per_dir, ndirs;
int max_debt, max_dirs, min_blocks, min_inodes;
int group = -1, i;
struct ext3_group_desc *desc;
struct buffer_head *bh;
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
avefreeb = freeb / ngroups;
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
if ((parent == sb->s_root->d_inode) ||
(parent->i_flags & EXT3_TOPDIR_FL)) {
int best_ndir = inodes_per_group;
......@@ -289,8 +300,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
goto fallback;
}
blocks_per_dir = (le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_free_blocks_count)) / ndirs;
blocks_per_dir = (le32_to_cpu(es->s_blocks_count) - freeb) / ndirs;
max_dirs = ndirs / ngroups + inodes_per_group / 16;
min_inodes = avefreei - inodes_per_group / 4;
......@@ -309,7 +319,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
desc = ext3_get_group_desc (sb, group, &bh);
if (!desc || !desc->bg_free_inodes_count)
continue;
if (sbi->s_bgi[group].bg_debts >= max_debt)
if (sbi->s_debts[group] >= max_debt)
continue;
if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs)
continue;
......@@ -416,13 +426,15 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
struct buffer_head *bitmap_bh = NULL;
struct buffer_head *bh2;
int group;
unsigned long ino;
unsigned long ino = 0;
struct inode * inode;
struct ext3_group_desc * gdp;
struct ext3_group_desc * gdp = NULL;
struct ext3_super_block * es;
struct ext3_inode_info *ei;
struct ext3_sb_info *sbi;
int err = 0;
struct inode *ret;
int i;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
......@@ -435,7 +447,7 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
ei = EXT3_I(inode);
es = EXT3_SB(sb)->s_es;
repeat:
sbi = EXT3_SB(sb);
if (S_ISDIR(mode)) {
if (test_opt (sb, OLDALLOC))
group = find_group_dir(sb, dir);
......@@ -448,46 +460,52 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
if (group == -1)
goto out;
for (i = 0; i < sbi->s_groups_count; i++) {
gdp = ext3_get_group_desc(sb, group, &bh2);
err = -EIO;
brelse(bitmap_bh);
bitmap_bh = read_inode_bitmap(sb, group);
if (!bitmap_bh)
goto fail;
gdp = ext3_get_group_desc (sb, group, &bh2);
if ((ino = ext3_find_first_zero_bit((unsigned long *)bitmap_bh->b_data,
EXT3_INODES_PER_GROUP(sb))) <
EXT3_INODES_PER_GROUP(sb)) {
ino = ext3_find_first_zero_bit((unsigned long *)
bitmap_bh->b_data, EXT3_INODES_PER_GROUP(sb));
if (ino < EXT3_INODES_PER_GROUP(sb)) {
BUFFER_TRACE(bitmap_bh, "get_write_access");
err = ext3_journal_get_write_access(handle, bitmap_bh);
if (err) goto fail;
if (ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
ino, bitmap_bh->b_data))
goto repeat;
BUFFER_TRACE(bitmap_bh, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bitmap_bh);
if (err) goto fail;
} else {
if (le16_to_cpu(gdp->bg_free_inodes_count) != 0) {
ext3_error (sb, "ext3_new_inode",
"Free inodes count corrupted in group %d",
group);
/* Is it really ENOSPC? */
err = -ENOSPC;
if (sb->s_flags & MS_RDONLY)
if (err)
goto fail;
BUFFER_TRACE(bh2, "get_write_access");
err = ext3_journal_get_write_access(handle, bh2);
if (err) goto fail;
gdp->bg_free_inodes_count = 0;
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (err) goto fail;
if (!ext3_set_bit_atomic(sb_bgl_lock(sbi, group),
ino, bitmap_bh->b_data)) {
/* we won it */
BUFFER_TRACE(bitmap_bh,
"call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle,
bitmap_bh);
if (err)
goto fail;
goto got;
}
/* we lost it */
journal_release_buffer(handle, bitmap_bh);
}
goto repeat;
/*
* This case is possible in concurrent environment. It is very
* rare. We cannot repeat the find_group_xxx() call because
* that will simply return the same blockgroup, because the
* group descriptor metadata has not yet been updated.
* So we just go onto the next blockgroup.
*/
if (++group == sbi->s_groups_count)
group = 0;
}
err = -ENOSPC;
goto out;
got:
ino += group * EXT3_INODES_PER_GROUP(sb) + 1;
if (ino < EXT3_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) {
ext3_error (sb, "ext3_new_inode",
......@@ -500,18 +518,21 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
BUFFER_TRACE(bh2, "get_write_access");
err = ext3_journal_get_write_access(handle, bh2);
if (err) goto fail;
spin_lock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
spin_lock(sb_bgl_lock(sbi, group));
gdp->bg_free_inodes_count =
cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1);
if (S_ISDIR(mode)) {
gdp->bg_used_dirs_count =
cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1);
}
spin_unlock(&EXT3_SB(sb)->s_bgi[group].bg_ialloc_lock);
spin_unlock(sb_bgl_lock(sbi, group));
BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata");
err = ext3_journal_dirty_metadata(handle, bh2);
if (err) goto fail;
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
sb->s_dirt = 1;
inode->i_uid = current->fsuid;
......
......@@ -460,7 +460,7 @@ void ext3_put_super (struct super_block * sb)
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
kfree(sbi->s_bgi);
kfree(sbi->s_debts);
brelse(sbi->s_sbh);
/* Debugging code just in case the in-memory inode orphan list
......@@ -902,7 +902,6 @@ static int ext3_check_descriptors (struct super_block * sb)
unsigned long block = le32_to_cpu(sbi->s_es->s_first_data_block);
struct ext3_group_desc * gdp = NULL;
unsigned long total_free;
unsigned int reserved = le32_to_cpu(sbi->s_es->s_r_blocks_count);
int desc_block = 0;
int i;
......@@ -958,25 +957,6 @@ static int ext3_check_descriptors (struct super_block * sb)
EXT3_SB(sb)->s_es->s_free_blocks_count = cpu_to_le32(total_free);
}
/* distribute reserved blocks over groups -bzzz */
for(i = sbi->s_groups_count - 1; reserved && total_free && i >= 0; i--) {
int free;
gdp = ext3_get_group_desc (sb, i, NULL);
if (!gdp) {
ext3_error (sb, "ext3_check_descriptors",
"cant get descriptor for group %d", i);
return 0;
}
free = le16_to_cpu(gdp->bg_free_blocks_count);
if (free > reserved)
free = reserved;
sbi->s_bgi[i].bg_reserved = free;
reserved -= free;
total_free -= free;
}
total_free = ext3_count_free_inodes(sb);
if (total_free != le32_to_cpu(EXT3_SB(sb)->s_es->s_free_inodes_count)) {
printk("EXT3-fs: invalid s_free_inodes_count %u (real %lu)\n",
......@@ -1346,17 +1326,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
printk (KERN_ERR "EXT3-fs: not enough memory\n");
goto failed_mount;
}
sbi->s_bgi = kmalloc(sbi->s_groups_count * sizeof(struct ext3_bg_info),
sbi->s_debts = kmalloc(sbi->s_groups_count * sizeof(u8),
GFP_KERNEL);
if (!sbi->s_bgi) {
if (!sbi->s_debts) {
printk("EXT3-fs: not enough memory to allocate s_bgi\n");
goto failed_mount2;
}
memset(sbi->s_bgi, 0, sbi->s_groups_count * sizeof(struct ext3_bg_info));
for (i = 0; i < sbi->s_groups_count; i++) {
spin_lock_init(&sbi->s_bgi[i].bg_balloc_lock);
spin_lock_init(&sbi->s_bgi[i].bg_ialloc_lock);
}
memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(u8));
percpu_counter_init(&sbi->s_freeblocks_counter);
percpu_counter_init(&sbi->s_freeinodes_counter);
percpu_counter_init(&sbi->s_dirs_counter);
bgl_lock_init(&sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logic_sb_block, i);
sbi->s_group_desc[i] = sb_bread(sb, block);
......@@ -1469,12 +1451,19 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered":
"writeback");
percpu_counter_mod(&sbi->s_freeblocks_counter,
ext3_count_free_blocks(sb));
percpu_counter_mod(&sbi->s_freeinodes_counter,
ext3_count_free_inodes(sb));
percpu_counter_mod(&sbi->s_dirs_counter,
ext3_count_dirs(sb));
return 0;
failed_mount3:
journal_destroy(sbi->s_journal);
failed_mount2:
kfree(sbi->s_bgi);
kfree(sbi->s_debts);
for (i = 0; i < db_count; i++)
brelse(sbi->s_group_desc[i]);
kfree(sbi->s_group_desc);
......
......@@ -19,15 +19,10 @@
#ifdef __KERNEL__
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#endif
struct ext3_bg_info {
u8 bg_debts;
spinlock_t bg_balloc_lock;
spinlock_t bg_ialloc_lock;
unsigned long bg_reserved;
} ____cacheline_aligned_in_smp;
/*
* third extended-fs super-block data in memory
*/
......@@ -57,7 +52,11 @@ struct ext3_sb_info {
u32 s_next_generation;
u32 s_hash_seed[4];
int s_def_hash_version;
struct ext3_bg_info *s_bgi;
u8 *s_debts;
struct percpu_counter s_freeblocks_counter;
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct blockgroup_lock s_blockgroup_lock;
/* Journaling */
struct inode * s_journal_inode;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment