Commit 149b3060 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Mostly performance and bug fixes, plus some cleanups.  The one new
  feature this merge window is a new ioctl EXT4_IOC_SWAP_BOOT which
  allows installation of a hidden inode designed for boot loaders."

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (50 commits)
  ext4: fix type-widening bug in inode table readahead code
  ext4: add check for inodes_count overflow in new resize ioctl
  ext4: fix Kconfig documentation for CONFIG_EXT4_DEBUG
  ext4: fix online resizing for ext3-compat file systems
  jbd2: trace when lock_buffer in do_get_write_access takes a long time
  ext4: mark metadata blocks using bh flags
  buffer: add BH_Prio and BH_Meta flags
  ext4: mark all metadata I/O with REQ_META
  ext4: fix readdir error in case inline_data+^dir_index.
  ext4: fix readdir error in the case of inline_data+dir_index
  jbd2: use kmem_cache_zalloc instead of kmem_cache_alloc/memset
  ext4: mext_insert_extents should update extent block checksum
  ext4: move quota initialization out of inode allocation transaction
  ext4: reserve xattr index for Rich ACL support
  jbd2: reduce journal_head size
  ext4: clear buffer_uninit flag when submitting IO
  ext4: use io_end for multiple bios
  ext4: make ext4_bio_write_page() use BH_Async_Write flags
  ext4: Use kstrtoul() instead of parse_strtoul()
  ext4: defragmentation code cleanup
  ...
parents b0ca4d01 0d606e2c
......@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
session_write_kbytes This file is read-only and shows the number of
kilobytes of data that have been written to this
filesystem since it was mounted.
reserved_clusters This is RW file and contains number of reserved
clusters in the file system which will be used
in the specific situations to avoid costly
zeroout, unexpected ENOSPC, or possible data
loss. The default is 2% or 4096 clusters,
whichever is smaller and this can be changed
however it can never exceed number of clusters
in the file system. If there is not enough space
for the reserved space when mounting the file
mount will _not_ fail.
..............................................................................
Ioctls
......@@ -587,6 +598,16 @@ Table of Ext4 specific ioctls
bitmaps and inode table, the userspace tool thus
just passes the new number of blocks.
EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
(like i_blocks, i_size, i_flags, ...) from
the specified inode with inode
EXT4_BOOT_LOADER_INO (#5). This is typically
used to store a boot loader in a secure part of
the filesystem, where it can't be changed by a
normal user by accident.
The data blocks of the previous boot loader
will be associated with the given inode.
..............................................................................
References
......
......@@ -2987,6 +2987,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
/* Take care of bh's that straddle the end of the device */
guard_bh_eod(rw, bio, bh);
if (buffer_meta(bh))
rw |= REQ_META;
if (buffer_prio(bh))
rw |= REQ_PRIO;
bio_get(bio);
submit_bio(rw, bio);
......
......@@ -71,4 +71,5 @@ config EXT4_DEBUG
Enables run-time debugging support for the ext4 filesystem.
If you select Y here, then you will be able to turn on debugging
with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug"
with a command such as:
echo 1 > /sys/module/ext4/parameters/mballoc_debug
......@@ -29,6 +29,23 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
* balloc.c contains the blocks allocation and deallocation routines
*/
/*
* Calculate block group number for a given block number
*/
ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block)
{
ext4_group_t group;
if (test_opt2(sb, STD_GROUP_SIZE))
group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
block) >>
(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
else
ext4_get_group_no_and_offset(sb, block, &group, NULL);
return group;
}
/*
* Calculate the block group number and offset into the block/cluster
* allocation bitmap, given a block number
......@@ -49,14 +66,18 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
}
static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
ext4_group_t block_group)
/*
* Check whether the 'block' lives within the 'block_group'. Returns 1 if so
* and 0 otherwise.
*/
static inline int ext4_block_in_group(struct super_block *sb,
ext4_fsblk_t block,
ext4_group_t block_group)
{
ext4_group_t actual_group;
ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
if (actual_group == block_group)
return 1;
return 0;
actual_group = ext4_get_group_number(sb, block);
return (actual_group == block_group) ? 1 : 0;
}
/* Return the number of clusters used for file system metadata; this
......@@ -420,7 +441,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
trace_ext4_read_block_bitmap_load(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ, bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
ext4_validate_block_bitmap(sb, desc, block_group, bh);
......@@ -478,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
s64 nclusters, unsigned int flags)
{
s64 free_clusters, dirty_clusters, root_clusters;
s64 free_clusters, dirty_clusters, rsv, resv_clusters;
struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
free_clusters = percpu_counter_read_positive(fcc);
dirty_clusters = percpu_counter_read_positive(dcc);
resv_clusters = atomic64_read(&sbi->s_resv_clusters);
/*
* r_blocks_count should always be multiple of the cluster ratio so
* we are safe to do a plane bit shift only.
*/
root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
resv_clusters;
if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
if (free_clusters - (nclusters + rsv + dirty_clusters) <
EXT4_FREECLUSTERS_WATERMARK) {
free_clusters = percpu_counter_sum_positive(fcc);
dirty_clusters = percpu_counter_sum_positive(dcc);
......@@ -499,15 +522,21 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
/* Check whether we have space after accounting for current
* dirty clusters & root reserved clusters.
*/
if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
if (free_clusters >= (rsv + nclusters + dirty_clusters))
return 1;
/* Hm, nope. Are (enough) root reserved clusters available? */
if (uid_eq(sbi->s_resuid, current_fsuid()) ||
(!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) ||
capable(CAP_SYS_RESOURCE) ||
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
(flags & EXT4_MB_USE_ROOT_BLOCKS)) {
if (free_clusters >= (nclusters + dirty_clusters +
resv_clusters))
return 1;
}
/* No free blocks. Let's see if we can dip into reserved pool */
if (flags & EXT4_MB_USE_RESERVED) {
if (free_clusters >= (nclusters + dirty_clusters))
return 1;
}
......
......@@ -46,7 +46,8 @@ static int is_dx_dir(struct inode *inode)
if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_COMPAT_DIR_INDEX) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1)))
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
return 1;
return 0;
......@@ -115,14 +116,6 @@ static int ext4_readdir(struct file *filp,
int ret = 0;
int dir_has_error = 0;
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
ret = ext4_read_inline_dir(filp, dirent, filldir,
&has_inline_data);
if (has_inline_data)
return ret;
}
if (is_dx_dir(inode)) {
err = ext4_dx_readdir(filp, dirent, filldir);
if (err != ERR_BAD_DX_DIR) {
......@@ -136,6 +129,15 @@ static int ext4_readdir(struct file *filp,
ext4_clear_inode_flag(file_inode(filp),
EXT4_INODE_INDEX);
}
if (ext4_has_inline_data(inode)) {
int has_inline_data = 1;
ret = ext4_read_inline_dir(filp, dirent, filldir,
&has_inline_data);
if (has_inline_data)
return ret;
}
stored = 0;
offset = filp->f_pos & (sb->s_blocksize - 1);
......
This diff is collapsed.
......@@ -270,5 +270,10 @@ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix,
0xffff);
}
#define ext4_ext_dirty(handle, inode, path) \
__ext4_ext_dirty(__func__, __LINE__, (handle), (inode), (path))
int __ext4_ext_dirty(const char *where, unsigned int line, handle_t *handle,
struct inode *inode, struct ext4_ext_path *path);
#endif /* _EXT4_EXTENTS */
......@@ -43,6 +43,8 @@ handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
{
journal_t *journal;
might_sleep();
trace_ext4_journal_start(sb, nblocks, _RET_IP_);
if (sb->s_flags & MS_RDONLY)
return ERR_PTR(-EROFS);
......@@ -113,6 +115,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
{
int err = 0;
might_sleep();
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err)
......@@ -209,6 +213,10 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
{
int err = 0;
might_sleep();
set_buffer_meta(bh);
set_buffer_prio(bh);
if (ext4_handle_valid(handle)) {
err = jbd2_journal_dirty_metadata(handle, bh);
if (err) {
......
......@@ -29,11 +29,13 @@
* block to complete the transaction.
*
* For extents-enabled fs we may have to allocate and modify up to
* 5 levels of tree + root which are stored in the inode. */
* 5 levels of tree, data block (for each of these we need bitmap + group
* summaries), root which is stored in the inode, sb
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
? 27U : 8U)
? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
......@@ -194,16 +196,20 @@ static inline void ext4_journal_callback_add(handle_t *handle,
* ext4_journal_callback_del: delete a registered callback
* @handle: active journal transaction handle on which callback was registered
* @jce: registered journal callback entry to unregister
* Return true if object was sucessfully removed
*/
static inline void ext4_journal_callback_del(handle_t *handle,
static inline bool ext4_journal_callback_try_del(handle_t *handle,
struct ext4_journal_cb_entry *jce)
{
bool deleted;
struct ext4_sb_info *sbi =
EXT4_SB(handle->h_transaction->t_journal->j_private);
spin_lock(&sbi->s_md_lock);
deleted = !list_empty(&jce->jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
return deleted;
}
int
......
This diff is collapsed.
......@@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (journal->j_flags & JBD2_BARRIER &&
!jbd2_trans_will_send_data_barrier(journal, commit_tid))
needs_barrier = true;
jbd2_log_start_commit(journal, commit_tid);
ret = jbd2_log_wait_commit(journal, commit_tid);
ret = jbd2_complete_transaction(journal, commit_tid);
if (needs_barrier) {
err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
if (!ret)
......
......@@ -166,7 +166,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
trace_ext4_load_inode_bitmap(sb, block_group);
bh->b_end_io = ext4_end_bitmap_read;
get_bh(bh);
submit_bh(READ, bh);
submit_bh(READ | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
put_bh(bh);
......@@ -666,6 +666,23 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ei = EXT4_I(inode);
sbi = EXT4_SB(sb);
/*
* Initalize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
dquot_initialize(inode);
if (!goal)
goal = sbi->s_inode_goal;
......@@ -697,7 +714,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
goto fail;
goto out;
/*
* Check free inodes count before loading bitmap.
......@@ -711,7 +728,7 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (!inode_bitmap_bh)
goto fail;
goto out;
repeat_in_this_group:
ino = ext4_find_next_zero_bit((unsigned long *)
......@@ -733,13 +750,16 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
handle_type, nblocks);
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
goto fail;
ext4_std_error(sb, err);
goto out;
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
......@@ -755,8 +775,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
got:
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
......@@ -768,7 +790,8 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
err = ext4_journal_get_write_access(handle, block_bitmap_bh);
if (err) {
brelse(block_bitmap_bh);
goto fail;
ext4_std_error(sb, err);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
......@@ -787,14 +810,18 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* Update the relevant bg descriptor fields */
if (ext4_has_group_desc_csum(sb)) {
......@@ -840,8 +867,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err)
goto fail;
if (err) {
ext4_std_error(sb, err);
goto out;
}
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
......@@ -851,16 +880,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = dir->i_gid;
} else
inode_init_owner(inode, dir, mode);
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
......@@ -889,7 +908,9 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
* twice.
*/
err = -EIO;
goto fail;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
goto out;
}
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
......@@ -899,7 +920,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
__u32 csum;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
......@@ -918,7 +938,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
dquot_initialize(inode);
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
......@@ -952,24 +971,17 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
goto really_out;
fail:
ext4_std_error(sb, err);
out:
iput(inode);
ret = ERR_PTR(err);
really_out:
brelse(inode_bitmap_bh);
return ret;
fail_free_drop:
dquot_free_inode(inode);
fail_drop:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
clear_nlink(inode);
unlock_new_inode(inode);
out:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
......
This diff is collapsed.
......@@ -19,7 +19,8 @@
#define EXT4_XATTR_SYSTEM_DATA "data"
#define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS))
#define EXT4_INLINE_DOTDOT_SIZE 4
#define EXT4_INLINE_DOTDOT_OFFSET 2
#define EXT4_INLINE_DOTDOT_SIZE 4
int ext4_get_inline_size(struct inode *inode)
{
......@@ -1289,6 +1290,120 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry,
return ret;
}
/*
* This function fills a red-black tree with information from an
* inlined dir. It returns the number directory entries loaded
* into the tree. If there is an error it is returned in err.
*/
int htree_inlinedir_to_tree(struct file *dir_file,
struct inode *dir, ext4_lblk_t block,
struct dx_hash_info *hinfo,
__u32 start_hash, __u32 start_minor_hash,
int *has_inline_data)
{
int err = 0, count = 0;
unsigned int parent_ino;
int pos;
struct ext4_dir_entry_2 *de;
struct inode *inode = file_inode(dir_file);
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
struct ext4_dir_entry_2 fake;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
return ret;
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
up_read(&EXT4_I(inode)->xattr_sem);
*has_inline_data = 0;
goto out;
}
inline_size = ext4_get_inline_size(inode);
dir_buf = kmalloc(inline_size, GFP_NOFS);
if (!dir_buf) {
ret = -ENOMEM;
up_read(&EXT4_I(inode)->xattr_sem);
goto out;
}
ret = ext4_read_inline_data(inode, dir_buf, inline_size, &iloc);
up_read(&EXT4_I(inode)->xattr_sem);
if (ret < 0)
goto out;
pos = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
while (pos < inline_size) {
/*
* As inlined dir doesn't store any information about '.' and
* only the inode number of '..' is stored, we have to handle
* them differently.
*/
if (pos == 0) {
fake.inode = cpu_to_le32(inode->i_ino);
fake.name_len = 1;
strcpy(fake.name, ".");
fake.rec_len = ext4_rec_len_to_disk(
EXT4_DIR_REC_LEN(fake.name_len),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_OFFSET;
} else if (pos == EXT4_INLINE_DOTDOT_OFFSET) {
fake.inode = cpu_to_le32(parent_ino);
fake.name_len = 2;
strcpy(fake.name, "..");
fake.rec_len = ext4_rec_len_to_disk(
EXT4_DIR_REC_LEN(fake.name_len),
inline_size);
ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
de = &fake;
pos = EXT4_INLINE_DOTDOT_SIZE;
} else {
de = (struct ext4_dir_entry_2 *)(dir_buf + pos);
pos += ext4_rec_len_from_disk(de->rec_len, inline_size);
if (ext4_check_dir_entry(inode, dir_file, de,
iloc.bh, dir_buf,
inline_size, pos)) {
ret = count;
goto out;
}
}
ext4fs_dirhash(de->name, de->name_len, hinfo);
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
(hinfo->minor_hash < start_minor_hash)))
continue;
if (de->inode == 0)
continue;
err = ext4_htree_store_dirent(dir_file,
hinfo->hash, hinfo->minor_hash, de);
if (err) {
count = err;
goto out;
}
count++;
}
ret = count;
out:
kfree(dir_buf);
brelse(iloc.bh);
return ret;
}
/*
* So this function is called when the volume is mkfsed with
* dir_index disabled. In order to keep f_pos persistent
* after we convert from an inlined dir to a blocked based,
* we just pretend that we are a normal dir and return the
* offset as if '.' and '..' really take place.
*
*/
int ext4_read_inline_dir(struct file *filp,
void *dirent, filldir_t filldir,
int *has_inline_data)
......@@ -1302,6 +1417,7 @@ int ext4_read_inline_dir(struct file *filp,
int ret, inline_size = 0;
struct ext4_iloc iloc;
void *dir_buf = NULL;
int dotdot_offset, dotdot_size, extra_offset, extra_size;
ret = ext4_get_inode_loc(inode, &iloc);
if (ret)
......@@ -1330,8 +1446,21 @@ int ext4_read_inline_dir(struct file *filp,
sb = inode->i_sb;
stored = 0;
parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
offset = filp->f_pos;
while (!error && !stored && filp->f_pos < inode->i_size) {
/*
* dotdot_offset and dotdot_size is the real offset and
* size for ".." and "." if the dir is block based while
* the real size for them are only EXT4_INLINE_DOTDOT_SIZE.
* So we will use extra_offset and extra_size to indicate them
* during the inline dir iteration.
*/
dotdot_offset = EXT4_DIR_REC_LEN(1);
dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
extra_size = extra_offset + inline_size;
while (!error && !stored && filp->f_pos < extra_size) {
revalidate:
/*
* If the version has changed since the last call to
......@@ -1340,15 +1469,23 @@ int ext4_read_inline_dir(struct file *filp,
* dir to make sure.
*/
if (filp->f_version != inode->i_version) {
for (i = 0;
i < inode->i_size && i < offset;) {
for (i = 0; i < extra_size && i < offset;) {
/*
* "." is with offset 0 and
* ".." is dotdot_offset.
*/
if (!i) {
/* skip "." and ".." if needed. */
i += EXT4_INLINE_DOTDOT_SIZE;
i = dotdot_offset;
continue;
} else if (i == dotdot_offset) {
i = dotdot_size;
continue;
}
/* for other entry, the real offset in
* the buf has to be tuned accordingly.
*/
de = (struct ext4_dir_entry_2 *)
(dir_buf + i);
(dir_buf + i - extra_offset);
/* It's too expensive to do a full
* dirent test each time round this
* loop, but we do have to test at
......@@ -1356,43 +1493,47 @@ int ext4_read_inline_dir(struct file *filp,
* failure will be detected in the
* dirent test below. */
if (ext4_rec_len_from_disk(de->rec_len,
inline_size) < EXT4_DIR_REC_LEN(1))
extra_size) < EXT4_DIR_REC_LEN(1))
break;
i += ext4_rec_len_from_disk(de->rec_len,
inline_size);
extra_size);
}
offset = i;
filp->f_pos = offset;
filp->f_version = inode->i_version;
}
while (!error && filp->f_pos < inode->i_size) {
while (!error && filp->f_pos < extra_size) {
if (filp->f_pos == 0) {
error = filldir(dirent, ".", 1, 0, inode->i_ino,
DT_DIR);
if (error)
break;
stored++;
filp->f_pos = dotdot_offset;
continue;
}
error = filldir(dirent, "..", 2, 0, parent_ino,
DT_DIR);
if (filp->f_pos == dotdot_offset) {
error = filldir(dirent, "..", 2,
dotdot_offset,
parent_ino, DT_DIR);
if (error)
break;
stored++;
filp->f_pos = offset = EXT4_INLINE_DOTDOT_SIZE;
filp->f_pos = dotdot_size;
continue;
}
de = (struct ext4_dir_entry_2 *)(dir_buf + offset);
de = (struct ext4_dir_entry_2 *)
(dir_buf + filp->f_pos - extra_offset);
if (ext4_check_dir_entry(inode, filp, de,
iloc.bh, dir_buf,
inline_size, offset)) {
extra_size, filp->f_pos)) {
ret = stored;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len,
inline_size);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
......@@ -1415,9 +1556,8 @@ int ext4_read_inline_dir(struct file *filp,
stored++;
}
filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
inline_size);
extra_size);
}
offset = 0;
}
out:
kfree(dir_buf);
......
This diff is collapsed.
......@@ -17,9 +17,201 @@
#include <asm/uaccess.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"
#define MAX_32_NUM ((((unsigned long long) 1) << 32) - 1)
/**
* Swap memory between @a and @b for @len bytes.
*
* @a: pointer to first memory area
* @b: pointer to second memory area
* @len: number of bytes to swap
*
*/
static void memswap(void *a, void *b, size_t len)
{
unsigned char *ap, *bp;
unsigned char tmp;
ap = (unsigned char *)a;
bp = (unsigned char *)b;
while (len-- > 0) {
tmp = *ap;
*ap = *bp;
*bp = tmp;
ap++;
bp++;
}
}
/**
* Swap i_data and associated attributes between @inode1 and @inode2.
* This function is used for the primary swap between inode1 and inode2
* and also to revert this primary swap in case of errors.
*
* Therefore you have to make sure, that calling this method twice
* will revert all changes.
*
* @inode1: pointer to first inode
* @inode2: pointer to second inode
*/
static void swap_inode_data(struct inode *inode1, struct inode *inode2)
{
loff_t isize;
struct ext4_inode_info *ei1;
struct ext4_inode_info *ei2;
ei1 = EXT4_I(inode1);
ei2 = EXT4_I(inode2);
memswap(&inode1->i_flags, &inode2->i_flags, sizeof(inode1->i_flags));
memswap(&inode1->i_version, &inode2->i_version,
sizeof(inode1->i_version));
memswap(&inode1->i_blocks, &inode2->i_blocks,
sizeof(inode1->i_blocks));
memswap(&inode1->i_bytes, &inode2->i_bytes, sizeof(inode1->i_bytes));
memswap(&inode1->i_atime, &inode2->i_atime, sizeof(inode1->i_atime));
memswap(&inode1->i_mtime, &inode2->i_mtime, sizeof(inode1->i_mtime));
memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree));
memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr));
isize = i_size_read(inode1);
i_size_write(inode1, i_size_read(inode2));
i_size_write(inode2, isize);
}
/**
* Swap the information from the given @inode and the inode
* EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
* important fields of the inodes.
*
* @sb: the super block of the filesystem
* @inode: the inode to swap with EXT4_BOOT_LOADER_INO
*
*/
static long swap_inode_boot_loader(struct super_block *sb,
struct inode *inode)
{
handle_t *handle;
int err;
struct inode *inode_bl;
struct ext4_inode_info *ei;
struct ext4_inode_info *ei_bl;
struct ext4_sb_info *sbi;
if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode)) {
err = -EINVAL;
goto swap_boot_out;
}
if (!inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) {
err = -EPERM;
goto swap_boot_out;
}
sbi = EXT4_SB(sb);
ei = EXT4_I(inode);
inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO);
if (IS_ERR(inode_bl)) {
err = PTR_ERR(inode_bl);
goto swap_boot_out;
}
ei_bl = EXT4_I(inode_bl);
filemap_flush(inode->i_mapping);
filemap_flush(inode_bl->i_mapping);
/* Protect orig inodes against a truncate and make sure,
* that only 1 swap_inode_boot_loader is running. */
ext4_inode_double_lock(inode, inode_bl);
truncate_inode_pages(&inode->i_data, 0);
truncate_inode_pages(&inode_bl->i_data, 0);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(inode);
ext4_inode_block_unlocked_dio(inode_bl);
inode_dio_wait(inode);
inode_dio_wait(inode_bl);
handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2);
if (IS_ERR(handle)) {
err = -EINVAL;
goto swap_boot_out;
}
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
if (inode_bl->i_nlink == 0) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
i_gid_write(inode_bl, 0);
inode_bl->i_flags = 0;
ei_bl->i_flags = 0;
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
if (EXT4_HAS_INCOMPAT_FEATURE(sb,
EXT4_FEATURE_INCOMPAT_EXTENTS)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data));
}
swap_inode_data(inode, inode_bl);
inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode);
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
inode_bl->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
ext4_warning(inode->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
} else {
err = ext4_mark_inode_dirty(handle, inode_bl);
if (err < 0) {
ext4_warning(inode_bl->i_sb,
"couldn't mark inode #%lu dirty (err %d)",
inode_bl->i_ino, err);
/* Revert all changes: */
swap_inode_data(inode, inode_bl);
ext4_mark_inode_dirty(handle, inode);
}
}
ext4_journal_stop(handle);
ext4_double_up_write_data_sem(inode, inode_bl);
ext4_inode_resume_unlocked_dio(inode);
ext4_inode_resume_unlocked_dio(inode_bl);
ext4_inode_double_unlock(inode, inode_bl);
iput(inode_bl);
swap_boot_out:
return err;
}
long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
......@@ -83,17 +275,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (!capable(CAP_SYS_RESOURCE))
goto flags_out;
}
if (oldflags & EXT4_EXTENTS_FL) {
/* We don't support clearning extent flags */
if (!(flags & EXT4_EXTENTS_FL)) {
err = -EOPNOTSUPP;
goto flags_out;
}
} else if (flags & EXT4_EXTENTS_FL) {
/* migrate the file */
if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
migrate = 1;
flags &= ~EXT4_EXTENTS_FL;
}
if (flags & EXT4_EOFBLOCKS_FL) {
/* we don't support adding EOFBLOCKS flag */
......@@ -137,8 +320,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
err = ext4_change_inode_journal_flag(inode, jflag);
if (err)
goto flags_out;
if (migrate)
err = ext4_ext_migrate(inode);
if (migrate) {
if (flags & EXT4_EXTENTS_FL)
err = ext4_ext_migrate(inode);
else
err = ext4_ind_migrate(inode);
}
flags_out:
mutex_unlock(&inode->i_mutex);
mnt_drop_write_file(filp);
......@@ -357,9 +545,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return err;
}
case EXT4_IOC_SWAP_BOOT:
if (!(filp->f_mode & FMODE_WRITE))
return -EBADF;
return swap_inode_boot_loader(sb, inode);
case EXT4_IOC_RESIZE_FS: {
ext4_fsblk_t n_blocks_count;
struct super_block *sb = inode->i_sb;
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
......
This diff is collapsed.
......@@ -426,7 +426,6 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
return retval;
}
return retval;
}
int ext4_ext_migrate(struct inode *inode)
......@@ -606,3 +605,64 @@ int ext4_ext_migrate(struct inode *inode)
return retval;
}
/*
* Migrate a simple extent-based inode to use the i_blocks[] array
*/
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
ext4_fsblk_t blk;
handle_t *handle;
int ret;
if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_INCOMPAT_EXTENTS) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
EXT4_FEATURE_RO_COMPAT_BIGALLOC))
return -EOPNOTSUPP;
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
if (IS_ERR(handle))
return PTR_ERR(handle);
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
if (ret)
goto errout;
eh = ext_inode_hdr(inode);
ex = EXT_FIRST_EXTENT(eh);
if (ext4_blocks_count(es) > EXT4_MAX_BLOCK_FILE_PHYS ||
eh->eh_depth != 0 || le16_to_cpu(eh->eh_entries) > 1) {
ret = -EOPNOTSUPP;
goto errout;
}
if (eh->eh_entries == 0)
blk = len = 0;
else {
len = le16_to_cpu(ex->ee_len);
blk = ext4_ext_pblock(ex);
if (len > EXT4_NDIR_BLOCKS) {
ret = -EOPNOTSUPP;
goto errout;
}
}
ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
memset(ei->i_data, 0, sizeof(ei->i_data));
for (i=0; i < len; i++)
ei->i_data[i] = cpu_to_le32(blk++);
ext4_mark_inode_dirty(handle, inode);
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
return ret;
}
......@@ -7,7 +7,7 @@
#include "ext4.h"
/* Checksumming functions */
static __u32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct mmp_struct, mmp_checksum);
......@@ -54,7 +54,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
submit_bh(WRITE_SYNC, bh);
submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
wait_on_buffer(bh);
sb_end_write(sb);
if (unlikely(!buffer_uptodate(bh)))
......@@ -86,7 +86,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
get_bh(*bh);
lock_buffer(*bh);
(*bh)->b_end_io = end_buffer_read_sync;
submit_bh(READ_SYNC, *bh);
submit_bh(READ_SYNC | REQ_META | REQ_PRIO, *bh);
wait_on_buffer(*bh);
if (!buffer_uptodate(*bh)) {
brelse(*bh);
......
......@@ -144,12 +144,13 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
}
/**
* double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
* ext4_double_down_write_data_sem - Acquire two inodes' write lock
* of i_data_sem
*
* Acquire write lock of i_data_sem of the two inodes
*/
static void
double_down_write_data_sem(struct inode *first, struct inode *second)
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
{
if (first < second) {
down_write(&EXT4_I(first)->i_data_sem);
......@@ -162,14 +163,15 @@ double_down_write_data_sem(struct inode *first, struct inode *second)
}
/**
* double_up_write_data_sem - Release two inodes' write lock of i_data_sem
* ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
*
* @orig_inode: original inode structure to be released its lock first
* @donor_inode: donor inode structure to be released its lock second
* Release write lock of i_data_sem of two inodes (orig and donor).
*/
static void
double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
struct inode *donor_inode)
{
up_write(&EXT4_I(orig_inode)->i_data_sem);
up_write(&EXT4_I(donor_inode)->i_data_sem);
......@@ -407,18 +409,7 @@ mext_insert_extents(handle_t *handle, struct inode *orig_inode,
mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
end_ext, eh, range_to_move);
if (depth) {
ret = ext4_handle_dirty_metadata(handle, orig_inode,
orig_path->p_bh);
if (ret)
return ret;
} else {
ret = ext4_mark_inode_dirty(handle, orig_inode);
if (ret < 0)
return ret;
}
return 0;
return ext4_ext_dirty(handle, orig_inode, orig_path);
}
/**
......@@ -737,6 +728,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
donor_off += dext_alen;
orig_off += dext_alen;
BUG_ON(replaced_count > count);
/* Already moved the expected blocks */
if (replaced_count >= count)
break;
......@@ -814,7 +806,13 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
page_cache_release(page[0]);
return -ENOMEM;
}
/*
* grab_cache_page_write_begin() may not wait on page's writeback if
* BDI not demand that. But it is reasonable to be very conservative
* here and explicitly wait on page's writeback
*/
wait_on_page_writeback(page[0]);
wait_on_page_writeback(page[1]);
if (inode1 > inode2) {
struct page *tmp;
tmp = page[0];
......@@ -856,7 +854,6 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
int err = 0;
err = ext4_get_block(inode, block, bh, 0);
if (err) {
SetPageError(page);
......@@ -976,7 +973,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* necessary, just swap data blocks between orig and donor.
*/
if (uninit) {
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
* fallback to data copying */
uninit = mext_check_coverage(orig_inode, orig_blk_offset,
......@@ -990,7 +987,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
goto drop_data_sem;
if (!uninit) {
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto data_copy;
}
if ((page_has_private(pagep[0]) &&
......@@ -1004,7 +1001,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
donor_inode, orig_blk_offset,
block_len_in_page, err);
drop_data_sem:
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
goto unlock_pages;
}
data_copy:
......@@ -1033,7 +1030,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
}
/* Perform all necessary steps similar write_begin()/write_end()
* but keeping in mind that i_size will not change */
*err = __block_write_begin(pagep[0], from, from + replaced_size,
*err = __block_write_begin(pagep[0], from, replaced_size,
ext4_get_block);
if (!*err)
*err = block_commit_write(pagep[0], from, from + replaced_size);
......@@ -1065,11 +1062,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
* Extents are swapped already, but we are not able to copy data.
* Try to swap extents to it's original places
*/
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
orig_blk_offset,
block_len_in_page, &err2);
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
if (replaced_count != block_len_in_page) {
EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
"Unable to copy data block,"
......@@ -1209,15 +1206,15 @@ mext_check_arguments(struct inode *orig_inode,
}
/**
* mext_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
* ext4_inode_double_lock - Lock i_mutex on both @inode1 and @inode2
*
* @inode1: the inode structure
* @inode2: the inode structure
*
* Lock two inodes' i_mutex
*/
static void
mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
void
ext4_inode_double_lock(struct inode *inode1, struct inode *inode2)
{
BUG_ON(inode1 == inode2);
if (inode1 < inode2) {
......@@ -1230,15 +1227,15 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2)
}
/**
* mext_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
* ext4_inode_double_unlock - Release i_mutex on both @inode1 and @inode2
*
* @inode1: the inode that is released first
* @inode2: the inode that is released second
*
*/
static void
mext_inode_double_unlock(struct inode *inode1, struct inode *inode2)
void
ext4_inode_double_unlock(struct inode *inode1, struct inode *inode2)
{
mutex_unlock(&inode1->i_mutex);
mutex_unlock(&inode2->i_mutex);
......@@ -1333,7 +1330,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
return -EINVAL;
}
/* Protect orig and donor inodes against a truncate */
mext_inode_double_lock(orig_inode, donor_inode);
ext4_inode_double_lock(orig_inode, donor_inode);
/* Wait for all existing dio workers */
ext4_inode_block_unlocked_dio(orig_inode);
......@@ -1342,7 +1339,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
inode_dio_wait(donor_inode);
/* Protect extent tree against block allocations via delalloc */
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* Check the filesystem environment whether move_extent can be done */
ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
donor_start, &len);
......@@ -1466,7 +1463,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
* b. racing with ->readpage, ->write_begin, and ext4_get_block
* in move_extent_per_page
*/
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
while (orig_page_offset <= seq_end_page) {
......@@ -1500,7 +1497,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
block_len_in_page = rest_blocks;
}
double_down_write_data_sem(orig_inode, donor_inode);
ext4_double_down_write_data_sem(orig_inode, donor_inode);
if (ret < 0)
break;
......@@ -1538,10 +1535,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
ext4_ext_drop_refs(holecheck_path);
kfree(holecheck_path);
}
double_up_write_data_sem(orig_inode, donor_inode);
ext4_double_up_write_data_sem(orig_inode, donor_inode);
ext4_inode_resume_unlocked_dio(orig_inode);
ext4_inode_resume_unlocked_dio(donor_inode);
mext_inode_double_unlock(orig_inode, donor_inode);
ext4_inode_double_unlock(orig_inode, donor_inode);
return ret;
}
......@@ -416,15 +416,16 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum, old_csum;
__u32 csum;
__le32 save_csum;
int size;
size = count_offset + (count * sizeof(struct dx_entry));
old_csum = t->dt_checksum;
save_csum = t->dt_checksum;
t->dt_checksum = 0;
csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
csum = ext4_chksum(sbi, csum, (__u8 *)t, sizeof(struct dx_tail));
t->dt_checksum = old_csum;
t->dt_checksum = save_csum;
return cpu_to_le32(csum);
}
......@@ -971,6 +972,17 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
hinfo.hash_version +=
EXT4_SB(dir->i_sb)->s_hash_unsigned;
hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed;
if (ext4_has_inline_data(dir)) {
int has_inline_data = 1;
count = htree_inlinedir_to_tree(dir_file, dir, 0,
&hinfo, start_hash,
start_minor_hash,
&has_inline_data);
if (has_inline_data) {
*next_hash = ~0;
return count;
}
}
count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo,
start_hash, start_minor_hash);
*next_hash = ~0;
......@@ -1455,24 +1467,6 @@ struct dentry *ext4_get_parent(struct dentry *child)
return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino));
}
#define S_SHIFT 12
static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
[S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = EXT4_FT_DIR,
[S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV,
[S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV,
[S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO,
[S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK,
[S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK,
};
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
/*
* Move count entries from end of map between two memory locations.
* Returns pointer to last entry moved.
......@@ -2251,8 +2245,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
......@@ -2286,8 +2279,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0,
NULL, EXT4_HT_DIR, credits);
......@@ -2396,8 +2388,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
dquot_initialize(dir);
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode,
&dentry->d_name,
......@@ -2826,8 +2817,7 @@ static int ext4_symlink(struct inode *dir,
* quota blocks, sb is already counted in previous macros).
*/
credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb);
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;
}
retry:
inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO,
......
This diff is collapsed.
......@@ -272,7 +272,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk)
goto next_group;
group_data[bb_index].block_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
......@@ -284,7 +284,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk >= last_blk)
goto next_group;
group_data[ib_index].inode_bitmap = start_blk++;
ext4_get_group_no_and_offset(sb, start_blk - 1, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count--;
if (flexbg_size > 1)
......@@ -296,7 +296,7 @@ static int ext4_alloc_group_tables(struct super_block *sb,
if (start_blk + EXT4_SB(sb)->s_itb_per_group > last_blk)
goto next_group;
group_data[it_index].inode_table = start_blk;
ext4_get_group_no_and_offset(sb, start_blk, &group, NULL);
group = ext4_get_group_number(sb, start_blk - 1);
group -= group_data[0].group;
group_data[group].free_blocks_count -=
EXT4_SB(sb)->s_itb_per_group;
......@@ -392,7 +392,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
ext4_group_t group;
int err;
ext4_get_group_no_and_offset(sb, block, &group, NULL);
group = ext4_get_group_number(sb, block);
start = ext4_group_first_block_no(sb, group);
group -= flex_gd->groups[0].group;
......@@ -1341,6 +1341,8 @@ static void ext4_update_super(struct super_block *sb,
/* Update the global fs size fields */
sbi->s_groups_count += flex_gd->count;
sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count,
(EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb)));
/* Update the reserved block counts only once the new group is
* active. */
......@@ -1879,7 +1881,11 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
/* Nothing need to do */
return 0;
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
n_group = ext4_get_group_number(sb, n_blocks_count - 1);
if (n_group > (0xFFFFFFFFUL / EXT4_INODES_PER_GROUP(sb))) {
ext4_warning(sb, "resize would cause inodes_count overflow");
return -EINVAL;
}
ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
......
......@@ -81,6 +81,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
static struct file_system_type ext2_fs_type = {
......@@ -353,10 +354,13 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
int error = is_journal_aborted(journal);
struct ext4_journal_cb_entry *jce, *tmp;
struct ext4_journal_cb_entry *jce;
BUG_ON(txn->t_state == T_FINISHED);
spin_lock(&sbi->s_md_lock);
list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
while (!list_empty(&txn->t_private_list)) {
jce = list_entry(txn->t_private_list.next,
struct ext4_journal_cb_entry, jce_list);
list_del_init(&jce->jce_list);
spin_unlock(&sbi->s_md_lock);
jce->jce_func(sb, jce, error);
......@@ -1948,16 +1952,16 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
if ((sbi->s_es->s_feature_ro_compat &
cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
/* Use new metadata_csum algorithm */
__u16 old_csum;
__le16 save_csum;
__u32 csum32;
old_csum = gdp->bg_checksum;
save_csum = gdp->bg_checksum;
gdp->bg_checksum = 0;
csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
sbi->s_desc_size);
gdp->bg_checksum = old_csum;
gdp->bg_checksum = save_csum;
crc = csum32 & 0xFFFF;
goto out;
......@@ -2379,17 +2383,15 @@ struct ext4_attr {
int offset;
};
static int parse_strtoul(const char *buf,
unsigned long max, unsigned long *value)
static int parse_strtoull(const char *buf,
unsigned long long max, unsigned long long *value)
{
char *endp;
*value = simple_strtoul(skip_spaces(buf), &endp, 0);
endp = skip_spaces(endp);
if (*endp || *value > max)
return -EINVAL;
int ret;
return 0;
ret = kstrtoull(skip_spaces(buf), 0, value);
if (!ret && *value > max)
ret = -EINVAL;
return ret;
}
static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
......@@ -2431,11 +2433,13 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
const char *buf, size_t count)
{
unsigned long t;
int ret;
if (parse_strtoul(buf, 0x40000000, &t))
return -EINVAL;
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
if (t && !is_power_of_2(t))
if (t && (!is_power_of_2(t) || t > 0x40000000))
return -EINVAL;
sbi->s_inode_readahead_blks = t;
......@@ -2456,13 +2460,36 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
{
unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
unsigned long t;
int ret;
if (parse_strtoul(buf, 0xffffffff, &t))
return -EINVAL;
ret = kstrtoul(skip_spaces(buf), 0, &t);
if (ret)
return ret;
*ui = t;
return count;
}
static ssize_t reserved_clusters_show(struct ext4_attr *a,
struct ext4_sb_info *sbi, char *buf)
{
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
}
static ssize_t reserved_clusters_store(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
{
unsigned long long val;
int ret;
if (parse_strtoull(buf, -1ULL, &val))
return -EINVAL;
ret = ext4_reserve_clusters(sbi, val);
return ret ? ret : count;
}
static ssize_t trigger_test_error(struct ext4_attr *a,
struct ext4_sb_info *sbi,
const char *buf, size_t count)
......@@ -2500,6 +2527,7 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
EXT4_RO_ATTR(delayed_allocation_blocks);
EXT4_RO_ATTR(session_write_kbytes);
EXT4_RO_ATTR(lifetime_write_kbytes);
EXT4_RW_ATTR(reserved_clusters);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
inode_readahead_blks_store, s_inode_readahead_blks);
EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
......@@ -2517,6 +2545,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(delayed_allocation_blocks),
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
......@@ -3192,6 +3221,40 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
{
ext4_fsblk_t resv_clusters;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
* out of space like for example punch hole, or converting
* uninitialized extents in delalloc path. In most cases such
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
return resv_clusters;
}
static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
{
ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
sbi->s_cluster_bits;
if (count >= clusters)
return -EINVAL;
atomic64_set(&sbi->s_resv_clusters, count);
return 0;
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
char *orig_data = kstrdup(data, GFP_KERNEL);
......@@ -3526,6 +3589,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
/* Do we have standard group size of blocksize * 8 blocks ? */
if (sbi->s_blocks_per_group == blocksize << 3)
set_opt2(sb, STD_GROUP_SIZE);
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
......@@ -3698,6 +3765,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_err_report.function = print_daily_error_info;
sbi->s_err_report.data = (unsigned long) sb;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
err = percpu_counter_init(&sbi->s_freeclusters_counter,
ext4_count_free_clusters(sb));
if (!err) {
......@@ -3723,9 +3793,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_max_writeback_mb_bump = 128;
sbi->s_extent_max_zeroout_kb = 32;
/* Register extent status tree shrinker */
ext4_es_register_shrinker(sb);
/*
* set up enough so that it can read an inode
*/
......@@ -3911,6 +3978,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"available");
}
err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
if (err) {
ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
"reserved pool", ext4_calculate_resv_clusters(sbi));
goto failed_mount4a;
}
err = ext4_setup_system_zone(sb);
if (err) {
ext4_msg(sb, KERN_ERR, "failed to initialize system "
......@@ -4010,6 +4084,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_journal = NULL;
}
failed_mount3:
ext4_es_unregister_shrinker(sb);
del_timer(&sbi->s_err_report);
if (sbi->s_flex_groups)
ext4_kvfree(sbi->s_flex_groups);
......@@ -4177,7 +4252,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
goto out_bdev;
}
journal->j_private = sb;
ll_rw_block(READ, 1, &journal->j_sb_buffer);
ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &journal->j_sb_buffer);
wait_on_buffer(journal->j_sb_buffer);
if (!buffer_uptodate(journal->j_sb_buffer)) {
ext4_msg(sb, KERN_ERR, "I/O error on journal device");
......@@ -4742,9 +4817,10 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
struct super_block *sb = dentry->d_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_fsblk_t overhead = 0;
ext4_fsblk_t overhead = 0, resv_blocks;
u64 fsid;
s64 bfree;
resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
if (!test_opt(sb, MINIX_DF))
overhead = sbi->s_overhead;
......@@ -4756,8 +4832,9 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
/* prevent underflow in case that few free space is available */
buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
if (buf->f_bfree < ext4_r_blocks_count(es))
buf->f_bavail = buf->f_bfree -
(ext4_r_blocks_count(es) + resv_blocks);
if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
buf->f_bavail = 0;
buf->f_files = le32_to_cpu(es->s_inodes_count);
buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
......@@ -4945,6 +5022,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
return PTR_ERR(qf_inode);
}
/* Don't account quota for quota files to avoid recursion */
qf_inode->i_flags |= S_NOQUOTA;
err = dquot_enable(qf_inode, type, format_id, flags);
iput(qf_inode);
......
......@@ -122,17 +122,18 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
struct ext4_xattr_header *hdr)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum, old;
__u32 csum;
__le32 save_csum;
__le64 dsk_block_nr = cpu_to_le64(block_nr);
old = hdr->h_checksum;
save_csum = hdr->h_checksum;
hdr->h_checksum = 0;
block_nr = cpu_to_le64(block_nr);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&block_nr,
sizeof(block_nr));
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
EXT4_BLOCK_SIZE(inode->i_sb));
hdr->h_checksum = old;
hdr->h_checksum = save_csum;
return cpu_to_le32(csum);
}
......
......@@ -22,6 +22,7 @@
#define EXT4_XATTR_INDEX_LUSTRE 5
#define EXT4_XATTR_INDEX_SECURITY 6
#define EXT4_XATTR_INDEX_SYSTEM 7
#define EXT4_XATTR_INDEX_RICHACL 8
struct ext4_xattr_header {
__le32 h_magic; /* magic number for identification */
......
......@@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int space_left = 0;
int first_tag = 0;
int tag_flag;
int i, to_free = 0;
int i;
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
......@@ -1134,7 +1134,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
spin_unlock(&journal->j_history_lock);
commit_transaction->t_state = T_FINISHED;
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
journal->j_committing_transaction = NULL;
......@@ -1149,38 +1149,44 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_average_commit_time*3) / 4;
else
journal->j_average_commit_time = commit_time;
write_unlock(&journal->j_state_lock);
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
to_free = 1;
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
if (journal->j_checkpoint_transactions == NULL) {
journal->j_checkpoint_transactions = commit_transaction;
commit_transaction->t_cpnext = commit_transaction;
commit_transaction->t_cpprev = commit_transaction;
} else {
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction->t_cpnext =
journal->j_checkpoint_transactions;
commit_transaction->t_cpprev =
commit_transaction->t_cpnext->t_cpprev;
commit_transaction->t_cpnext->t_cpprev =
commit_transaction;
commit_transaction->t_cpprev->t_cpnext =
commit_transaction;
}
}
spin_unlock(&journal->j_list_lock);
/* Drop all spin_locks because commit_callback may be block.
* __journal_remove_checkpoint() can not destroy transaction
* under us because it is not marked as T_FINISHED yet */
if (journal->j_commit_callback)
journal->j_commit_callback(journal, commit_transaction);
trace_jbd2_end_commit(journal, commit_transaction);
jbd_debug(1, "JBD2: commit %d complete, head %d\n",
journal->j_commit_sequence, journal->j_tail_sequence);
if (to_free)
jbd2_journal_free_transaction(commit_transaction);
write_lock(&journal->j_state_lock);
spin_lock(&journal->j_list_lock);
commit_transaction->t_state = T_FINISHED;
/* Recheck checkpoint lists after j_list_lock was dropped */
if (commit_transaction->t_checkpoint_list == NULL &&
commit_transaction->t_checkpoint_io_list == NULL) {
__jbd2_journal_drop_transaction(journal, commit_transaction);
jbd2_journal_free_transaction(commit_transaction);
}
spin_unlock(&journal->j_list_lock);
write_unlock(&journal->j_state_lock);
wake_up(&journal->j_wait_done_commit);
}
......@@ -707,6 +707,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err;
}
/*
* When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start
* committing that transaction before waiting for it to complete. If
* the transaction id is stale, it is by definition already completed,
* so just return SUCCESS.
*/
int jbd2_complete_transaction(journal_t *journal, tid_t tid)
{
int need_to_wait = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid) {
if (journal->j_commit_request != tid) {
/* transaction not yet started, so request it */
read_unlock(&journal->j_state_lock);
jbd2_log_start_commit(journal, tid);
goto wait_commit;
}
} else if (!(journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid))
need_to_wait = 0;
read_unlock(&journal->j_state_lock);
if (!need_to_wait)
return 0;
wait_commit:
return jbd2_log_wait_commit(journal, tid);
}
EXPORT_SYMBOL(jbd2_complete_transaction);
/*
* Log buffer allocation routines:
*/
......
......@@ -332,7 +332,6 @@ static handle_t *new_handle(int nblocks)
handle_t *handle = jbd2_alloc_handle(GFP_NOFS);
if (!handle)
return NULL;
memset(handle, 0, sizeof(*handle));
handle->h_buffer_credits = nblocks;
handle->h_ref = 1;
......@@ -640,6 +639,7 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
int error;
char *frozen_buffer = NULL;
int need_copy = 0;
unsigned long start_lock, time_lock;
if (is_handle_aborted(handle))
return -EROFS;
......@@ -655,9 +655,16 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
/* @@@ Need to check for errors here at some point. */
start_lock = jiffies;
lock_buffer(bh);
jbd_lock_bh_state(bh);
/* If it takes too long to lock the buffer, trace it */
time_lock = jbd2_time_diff(start_lock, jiffies);
if (time_lock > HZ/10)
trace_jbd2_lock_buffer_stall(bh->b_bdev->bd_dev,
jiffies_to_msecs(time_lock));
/* We now hold the buffer lock so it is safe to query the buffer
* state. Is the buffer dirty?
*
......
......@@ -34,6 +34,8 @@ enum bh_state_bits {
BH_Write_EIO, /* I/O error on write */
BH_Unwritten, /* Buffer is allocated on disk but not written */
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
......@@ -124,6 +126,8 @@ BUFFER_FNS(Delay, delay)
BUFFER_FNS(Boundary, boundary)
BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
......
......@@ -480,6 +480,7 @@ struct transaction_s
T_COMMIT,
T_COMMIT_DFLUSH,
T_COMMIT_JFLUSH,
T_COMMIT_CALLBACK,
T_FINISHED
} t_state;
......@@ -1144,7 +1145,7 @@ extern struct kmem_cache *jbd2_handle_cache;
static inline handle_t *jbd2_alloc_handle(gfp_t gfp_flags)
{
return kmem_cache_alloc(jbd2_handle_cache, gfp_flags);
return kmem_cache_zalloc(jbd2_handle_cache, gfp_flags);
}
static inline void jbd2_free_handle(handle_t *handle)
......@@ -1200,6 +1201,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_journal_force_commit_nested(journal_t *journal);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment