Commit d4f03186 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 bugfixes from Ted Ts'o:
 "Ext4 bug fixes for 3.17, to provide better handling of memory
  allocation failures, and to fix some journaling bugs involving
  journal checksums and FALLOC_FL_ZERO_RANGE"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix same-dir rename when inline data directory overflows
  jbd2: fix descriptor block size handling errors with journal_csum
  jbd2: fix infinite loop when recovering corrupt journal blocks
  ext4: update i_disksize coherently with block allocation on error path
  ext4: fix transaction issues for ext4_fallocate and ext_zero_range
  ext4: fix incorect journal credits reservation in ext4_zero_range
  ext4: move i_size,i_disksize update routines to helper function
  ext4: fix BUG_ON in mb_free_blocks()
  ext4: propagate errors up to ext4_find_entry()'s callers
parents ef13c8af d80d448c
......@@ -1825,7 +1825,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
/*
* Special error return code only used by dx_probe() and its callers.
*/
#define ERR_BAD_DX_DIR -75000
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
/*
* Timeout and state flag for lazy initialization inode thread.
......@@ -2454,6 +2454,22 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
up_write(&EXT4_I(inode)->i_data_sem);
}
/* Update i_size, i_disksize. Requires i_mutex to avoid races with truncate */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
int changed = 0;
if (newsize > inode->i_size) {
i_size_write(inode, newsize);
changed = 1;
}
if (newsize > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, newsize);
changed |= 2;
}
return changed;
}
struct ext4_group_info {
unsigned long bb_state;
struct rb_root bb_free_root;
......
......@@ -4665,7 +4665,8 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
}
static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
ext4_lblk_t len, int flags, int mode)
ext4_lblk_t len, loff_t new_size,
int flags, int mode)
{
struct inode *inode = file_inode(file);
handle_t *handle;
......@@ -4674,8 +4675,10 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int retries = 0;
struct ext4_map_blocks map;
unsigned int credits;
loff_t epos;
map.m_lblk = offset;
map.m_len = len;
/*
* Don't normalize the request if it can fit in one extent so
* that it doesn't get unnecessarily split into multiple
......@@ -4690,9 +4693,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
credits = ext4_chunk_trans_blocks(inode, len);
retry:
while (ret >= 0 && ret < len) {
map.m_lblk = map.m_lblk + ret;
map.m_len = len = len - ret;
while (ret >= 0 && len) {
handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
credits);
if (IS_ERR(handle)) {
......@@ -4709,6 +4710,21 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
ret2 = ext4_journal_stop(handle);
break;
}
map.m_lblk += ret;
map.m_len = len = len - ret;
epos = (loff_t)map.m_lblk << inode->i_blkbits;
inode->i_ctime = ext4_current_time(inode);
if (new_size) {
if (epos > new_size)
epos = new_size;
if (ext4_update_inode_size(inode, epos) & 0x1)
inode->i_mtime = inode->i_ctime;
} else {
if (epos > inode->i_size)
ext4_set_inode_flag(inode,
EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
ret2 = ext4_journal_stop(handle);
if (ret2)
break;
......@@ -4731,7 +4747,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
loff_t new_size = 0;
int ret = 0;
int flags;
int partial;
int credits;
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
struct address_space *mapping = inode->i_mapping;
......@@ -4771,7 +4788,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (start < offset || end > offset + len)
return -EINVAL;
partial = (offset + len) & ((1 << blkbits) - 1);
partial_begin = offset & ((1 << blkbits) - 1);
partial_end = (offset + len) & ((1 << blkbits) - 1);
lblk = start >> blkbits;
max_blocks = (end >> blkbits);
......@@ -4805,7 +4823,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* If we have a partial block after EOF we have to allocate
* the entire block.
*/
if (partial)
if (partial_end)
max_blocks += 1;
}
......@@ -4813,6 +4831,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
......@@ -4825,13 +4844,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if (ret)
goto out_dio;
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags,
mode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
if (ret)
goto out_dio;
}
if (!partial_begin && !partial_end)
goto out_dio;
handle = ext4_journal_start(inode, EXT4_HT_MISC, 4);
/*
* In worst case we have to writeout two nonadjacent unwritten
* blocks and update the inode
*/
credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1;
if (ext4_should_journal_data(inode))
credits += 2;
handle = ext4_journal_start(inode, EXT4_HT_MISC, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
ext4_std_error(inode->i_sb, ret);
......@@ -4839,12 +4867,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
}
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
if (new_size) {
if (new_size > i_size_read(inode))
i_size_write(inode, new_size);
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
ext4_update_inode_size(inode, new_size);
} else {
/*
* Mark that we allocate beyond EOF so the subsequent truncate
......@@ -4853,7 +4877,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
/* Zero out partial block at the edges of the range */
......@@ -4880,13 +4903,11 @@ static long ext4_zero_range(struct file *file, loff_t offset,
long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
handle_t *handle;
loff_t new_size = 0;
unsigned int max_blocks;
int ret = 0;
int flags;
ext4_lblk_t lblk;
struct timespec tv;
unsigned int blkbits = inode->i_blkbits;
/* Return error if mode is not supported */
......@@ -4937,36 +4958,15 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
goto out;
}
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, flags, mode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
if (ret)
goto out;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
tv = inode->i_ctime = ext4_current_time(inode);
if (new_size) {
if (new_size > i_size_read(inode)) {
i_size_write(inode, new_size);
inode->i_mtime = tv;
if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) {
ret = jbd2_complete_transaction(EXT4_SB(inode->i_sb)->s_journal,
EXT4_I(inode)->i_sync_tid);
}
if (new_size > EXT4_I(inode)->i_disksize)
ext4_update_i_disksize(inode, new_size);
} else {
/*
* Mark that we allocate beyond EOF so the subsequent truncate
* can proceed even if the new size is the same as i_size.
*/
if ((offset + len) > i_size_read(inode))
ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
}
ext4_mark_inode_dirty(handle, inode);
if (file->f_flags & O_SYNC)
ext4_handle_sync(handle);
ext4_journal_stop(handle);
out:
mutex_unlock(&inode->i_mutex);
trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
......
......@@ -1055,27 +1055,11 @@ static int ext4_write_end(struct file *file,
} else
copied = block_write_end(file, mapping, pos,
len, copied, page, fsdata);
/*
* No need to use i_size_read() here, the i_size
* cannot change under us because we hole i_mutex.
*
* But it's important to update i_size while still holding page lock:
* it's important to update i_size while still holding page lock:
* page writeout could otherwise come in and zero beyond i_size.
*/
if (pos + copied > inode->i_size) {
i_size_write(inode, pos + copied);
i_size_changed = 1;
}
if (pos + copied > EXT4_I(inode)->i_disksize) {
/* We need to mark inode dirty even if
* new_i_size is less that inode->i_size
* but greater than i_disksize. (hint delalloc)
*/
ext4_update_i_disksize(inode, (pos + copied));
i_size_changed = 1;
}
i_size_changed = ext4_update_inode_size(inode, pos + copied);
unlock_page(page);
page_cache_release(page);
......@@ -1123,7 +1107,7 @@ static int ext4_journalled_write_end(struct file *file,
int ret = 0, ret2;
int partial = 0;
unsigned from, to;
loff_t new_i_size;
int size_changed = 0;
trace_ext4_journalled_write_end(inode, pos, len, copied);
from = pos & (PAGE_CACHE_SIZE - 1);
......@@ -1146,20 +1130,18 @@ static int ext4_journalled_write_end(struct file *file,
if (!partial)
SetPageUptodate(page);
}
new_i_size = pos + copied;
if (new_i_size > inode->i_size)
i_size_write(inode, pos+copied);
size_changed = ext4_update_inode_size(inode, pos + copied);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
if (new_i_size > EXT4_I(inode)->i_disksize) {
ext4_update_i_disksize(inode, new_i_size);
unlock_page(page);
page_cache_release(page);
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
if (!ret)
ret = ret2;
}
unlock_page(page);
page_cache_release(page);
if (pos + len > inode->i_size && ext4_can_truncate(inode))
/* if we have allocated more blocks and copied
* less. We will have blocks allocated outside
......@@ -2095,6 +2077,7 @@ static int mpage_map_and_submit_extent(handle_t *handle,
struct ext4_map_blocks *map = &mpd->map;
int err;
loff_t disksize;
int progress = 0;
mpd->io_submit.io_end->offset =
((loff_t)map->m_lblk) << inode->i_blkbits;
......@@ -2111,8 +2094,11 @@ static int mpage_map_and_submit_extent(handle_t *handle,
* is non-zero, a commit should free up blocks.
*/
if ((err == -ENOMEM) ||
(err == -ENOSPC && ext4_count_free_clusters(sb)))
(err == -ENOSPC && ext4_count_free_clusters(sb))) {
if (progress)
goto update_disksize;
return err;
}
ext4_msg(sb, KERN_CRIT,
"Delayed block allocation failed for "
"inode %lu at logical offset %llu with"
......@@ -2129,15 +2115,17 @@ static int mpage_map_and_submit_extent(handle_t *handle,
*give_up_on_write = true;
return err;
}
progress = 1;
/*
* Update buffer state, submit mapped pages, and get us new
* extent to map
*/
err = mpage_map_and_submit_buffers(mpd);
if (err < 0)
return err;
goto update_disksize;
} while (map->m_len);
update_disksize:
/*
* Update on-disk size after IO is submitted. Races with
* truncate are avoided by checking i_size under i_data_sem.
......
......@@ -1412,6 +1412,8 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
int last = first + count - 1;
struct super_block *sb = e4b->bd_sb;
if (WARN_ON(count == 0))
return;
BUG_ON(last >= (sb->s_blocksize << 3));
assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
/* Don't bother if the block group is corrupt. */
......@@ -3221,6 +3223,8 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
int err;
if (pa == NULL) {
if (ac->ac_f_ex.fe_len == 0)
return;
err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
if (err) {
/*
......@@ -3235,6 +3239,7 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
ac->ac_f_ex.fe_len);
ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
ext4_mb_unload_buddy(&e4b);
return;
}
if (pa->pa_type == MB_INODE_PA)
......
......@@ -1227,7 +1227,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
buffer */
int num = 0;
ext4_lblk_t nblocks;
int i, err;
int i, err = 0;
int namelen;
*res_dir = NULL;
......@@ -1264,7 +1264,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
* return. Otherwise, fall back to doing a search the
* old fashioned way.
*/
if (bh || (err != ERR_BAD_DX_DIR))
if (err == -ENOENT)
return NULL;
if (err && err != ERR_BAD_DX_DIR)
return ERR_PTR(err);
if (bh)
return bh;
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
"falling back\n"));
......@@ -1295,6 +1299,11 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
}
num++;
bh = ext4_getblk(NULL, dir, b++, 0, &err);
if (unlikely(err)) {
if (ra_max == 0)
return ERR_PTR(err);
break;
}
bh_use[ra_max] = bh;
if (bh)
ll_rw_block(READ | REQ_META | REQ_PRIO,
......@@ -1417,6 +1426,8 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
return ERR_PTR(-ENAMETOOLONG);
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
return (struct dentry *) bh;
inode = NULL;
if (bh) {
__u32 ino = le32_to_cpu(de->inode);
......@@ -1450,6 +1461,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
struct buffer_head *bh;
bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL);
if (IS_ERR(bh))
return (struct dentry *) bh;
if (!bh)
return ERR_PTR(-ENOENT);
ino = le32_to_cpu(de->inode);
......@@ -2727,6 +2740,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
goto end_rmdir;
......@@ -2794,6 +2809,8 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
retval = -ENOENT;
bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (!bh)
goto end_unlink;
......@@ -3121,6 +3138,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
struct ext4_dir_entry_2 *de;
bh = ext4_find_entry(dir, d_name, &de, NULL);
if (IS_ERR(bh))
return PTR_ERR(bh);
if (bh) {
retval = ext4_delete_entry(handle, dir, de, bh);
brelse(bh);
......@@ -3128,7 +3147,8 @@ static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
return retval;
}
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent,
int force_reread)
{
int retval;
/*
......@@ -3140,7 +3160,8 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent)
if (le32_to_cpu(ent->de->inode) != ent->inode->i_ino ||
ent->de->name_len != ent->dentry->d_name.len ||
strncmp(ent->de->name, ent->dentry->d_name.name,
ent->de->name_len)) {
ent->de->name_len) ||
force_reread) {
retval = ext4_find_delete_entry(handle, ent->dir,
&ent->dentry->d_name);
} else {
......@@ -3191,6 +3212,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
.dentry = new_dentry,
.inode = new_dentry->d_inode,
};
int force_reread;
int retval;
dquot_initialize(old.dir);
......@@ -3202,6 +3224,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dquot_initialize(new.inode);
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
......@@ -3214,6 +3238,10 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
goto end_rename;
}
if (new.bh) {
if (!new.inode) {
brelse(new.bh);
......@@ -3246,6 +3274,15 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
}
/*
* If we're renaming a file within an inline_data dir and adding or
* setting the new dirent causes a conversion from inline_data to
* extents/blockmap, we need to force the dirent delete code to
* re-read the directory, or else we end up trying to delete a dirent
* from what is now the extent tree root (or a block map).
*/
force_reread = (new.dir->i_ino == old.dir->i_ino &&
ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA));
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
if (retval)
......@@ -3256,6 +3293,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (retval)
goto end_rename;
}
if (force_reread)
force_reread = !ext4_test_inode_flag(new.dir,
EXT4_INODE_INLINE_DATA);
/*
* Like most other Unix systems, set the ctime for inodes on a
......@@ -3267,7 +3307,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
/*
* ok, that's it
*/
ext4_rename_delete(handle, &old);
ext4_rename_delete(handle, &old, force_reread);
if (new.inode) {
ext4_dec_count(handle, new.inode);
......@@ -3330,6 +3370,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name,
&old.de, &old.inlined);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
......@@ -3342,6 +3384,10 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
goto end_rename;
}
/* RENAME_EXCHANGE case: old *and* new must both exist */
if (!new.bh || le32_to_cpu(new.de->inode) != new.inode->i_ino)
......
......@@ -3181,9 +3181,9 @@ static int set_journal_csum_feature_set(struct super_block *sb)
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
/* journal checksum v2 */
/* journal checksum v3 */
compat = 0;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3;
} else {
/* journal checksum v1 */
compat = JBD2_FEATURE_COMPAT_CHECKSUM;
......@@ -3205,6 +3205,7 @@ static int set_journal_csum_feature_set(struct super_block *sb)
jbd2_journal_clear_features(sbi->s_journal,
JBD2_FEATURE_COMPAT_CHECKSUM, 0,
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
JBD2_FEATURE_INCOMPAT_CSUM_V3 |
JBD2_FEATURE_INCOMPAT_CSUM_V2);
}
......
......@@ -97,7 +97,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
struct commit_header *h;
__u32 csum;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return;
h = (struct commit_header *)(bh->b_data);
......@@ -313,11 +313,11 @@ static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
return checksum;
}
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
if (tag_bytes > JBD2_TAG_SIZE32)
if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
......@@ -327,7 +327,7 @@ static void jbd2_descr_block_csum_set(journal_t *j,
struct jbd2_journal_block_tail *tail;
__u32 csum;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
......@@ -340,12 +340,13 @@ static void jbd2_descr_block_csum_set(journal_t *j,
static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
struct buffer_head *bh, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
struct page *page = bh->b_page;
__u8 *addr;
__u32 csum32;
__be32 seq;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return;
seq = cpu_to_be32(sequence);
......@@ -355,7 +356,9 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
/* We only have space to store the lower 16 bits of the crc32c. */
if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
tag3->t_checksum = cpu_to_be32(csum32);
else
tag->t_checksum = cpu_to_be16(csum32);
}
/*
......@@ -396,7 +399,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
LIST_HEAD(io_bufs);
LIST_HEAD(log_bufs);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_block_tail);
/*
......@@ -690,7 +693,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
tag_flag |= JBD2_FLAG_SAME_UUID;
tag = (journal_block_tag_t *) tagp;
write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
tag->t_flags = cpu_to_be16(tag_flag);
jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
commit_transaction->t_tid);
......
......@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
......@@ -145,7 +145,7 @@ static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
return sb->s_checksum == jbd2_superblock_csum(j, sb);
......@@ -153,7 +153,7 @@ static int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
static void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
{
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return;
sb->s_checksum = jbd2_superblock_csum(j, sb);
......@@ -1522,21 +1522,29 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
if (jbd2_journal_has_csum_v2or3(journal) &&
JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
"at the same time!\n");
goto out;
}
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
/* Can't have checksum v2 and v3 at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
"at the same time!\n");
goto out;
}
if (!jbd2_verify_csum_type(journal, sb)) {
printk(KERN_ERR "JBD2: Unknown checksum type\n");
goto out;
}
/* Load the checksum driver */
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
if (jbd2_journal_has_csum_v2or3(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
......@@ -1553,7 +1561,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Precompute checksum seed for all metadata */
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
......@@ -1813,8 +1821,14 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
return 0;
/* Asking for checksumming v2 and v1? Only give them v2. */
if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
/* If enabling v2 checksums, turn on v3 instead */
if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2) {
incompat &= ~JBD2_FEATURE_INCOMPAT_CSUM_V2;
incompat |= JBD2_FEATURE_INCOMPAT_CSUM_V3;
}
/* Asking for checksumming v3 and v1? Only give them v3. */
if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V3 &&
compat & JBD2_FEATURE_COMPAT_CHECKSUM)
compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
......@@ -1823,8 +1837,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
sb = journal->j_superblock;
/* If enabling v2 checksums, update superblock */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
/* If enabling v3 checksums, update superblock */
if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
......@@ -1842,8 +1856,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
}
/* Precompute checksum seed for all metadata */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0,
sb->s_uuid,
sizeof(sb->s_uuid));
......@@ -1852,7 +1865,8 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
/* If enabling v1 checksums, downgrade superblock */
if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
sb->s_feature_incompat &=
~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2 |
JBD2_FEATURE_INCOMPAT_CSUM_V3);
sb->s_feature_compat |= cpu_to_be32(compat);
sb->s_feature_ro_compat |= cpu_to_be32(ro);
......@@ -2165,16 +2179,20 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
*/
size_t journal_tag_bytes(journal_t *journal)
{
journal_block_tag_t tag;
size_t x = 0;
size_t sz;
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
return sizeof(journal_block_tag3_t);
sz = sizeof(journal_block_tag_t);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
x += sizeof(tag.t_checksum);
sz += sizeof(__u16);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
return x + JBD2_TAG_SIZE64;
return sz;
else
return x + JBD2_TAG_SIZE32;
return sz - sizeof(__u32);
}
/*
......
......@@ -181,7 +181,7 @@ static int jbd2_descr_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
......@@ -205,7 +205,7 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
int nr = 0, size = journal->j_blocksize;
int tag_bytes = journal_tag_bytes(journal);
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
size -= sizeof(struct jbd2_journal_block_tail);
tagp = &bh->b_data[sizeof(journal_header_t)];
......@@ -338,10 +338,11 @@ int jbd2_journal_skip_recovery(journal_t *journal)
return err;
}
static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag)
static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
if (tag_bytes > JBD2_TAG_SIZE32)
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
......@@ -384,7 +385,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
__be32 provided;
__u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
h = buf;
......@@ -399,16 +400,20 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
void *buf, __u32 sequence)
{
journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
__u32 csum32;
__be32 seq;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
seq = cpu_to_be32(sequence);
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
}
......@@ -426,6 +431,7 @@ static int do_one_pass(journal_t *journal,
int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
int descr_csum_size = 0;
int block_error = 0;
/*
* First thing is to establish what we expect to find in the log
......@@ -512,8 +518,7 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
descr_csum_size =
sizeof(struct jbd2_journal_block_tail);
if (descr_csum_size > 0 &&
......@@ -574,7 +579,7 @@ static int do_one_pass(journal_t *journal,
unsigned long long blocknr;
J_ASSERT(obh != NULL);
blocknr = read_tag_block(tag_bytes,
blocknr = read_tag_block(journal,
tag);
/* If the block has been
......@@ -598,7 +603,8 @@ static int do_one_pass(journal_t *journal,
"checksum recovering "
"block %llu in log\n",
blocknr);
continue;
block_error = 1;
goto skip_write;
}
/* Find a buffer for the new
......@@ -797,7 +803,8 @@ static int do_one_pass(journal_t *journal,
success = -EIO;
}
}
if (block_error && success == 0)
success = -EIO;
return success;
failed:
......@@ -811,7 +818,7 @@ static int jbd2_revoke_block_csum_verify(journal_t *j,
__be32 provided;
__u32 calculated;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return 1;
tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
......
......@@ -91,8 +91,8 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/bio.h>
#endif
#include <linux/log2.h>
#endif
static struct kmem_cache *jbd2_revoke_record_cache;
static struct kmem_cache *jbd2_revoke_table_cache;
......@@ -597,7 +597,7 @@ static void write_one_revoke_record(journal_t *journal,
offset = *offsetp;
/* Do we need to leave space at the end for a checksum? */
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
/* Make sure we have a descriptor with space left for the record */
......@@ -644,7 +644,7 @@ static void jbd2_revoke_csum_set(journal_t *j, struct buffer_head *bh)
struct jbd2_journal_revoke_tail *tail;
__u32 csum;
if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
if (!jbd2_journal_has_csum_v2or3(j))
return;
tail = (struct jbd2_journal_revoke_tail *)(bh->b_data + j->j_blocksize -
......
......@@ -159,7 +159,11 @@ typedef struct journal_header_s
* journal_block_tag (in the descriptor). The other h_chksum* fields are
* not used.
*
* Checksum v1 and v2 are mutually exclusive features.
* If FEATURE_INCOMPAT_CSUM_V3 is set, the descriptor block uses
* journal_block_tag3_t to store a full 32-bit checksum. Everything else
* is the same as v2.
*
* Checksum v1, v2, and v3 are mutually exclusive features.
*/
struct commit_header {
__be32 h_magic;
......@@ -179,6 +183,14 @@ struct commit_header {
* raw struct shouldn't be used for pointer math or sizeof() - use
* journal_tag_bytes(journal) instead to compute this.
*/
typedef struct journal_block_tag3_s
{
__be32 t_blocknr; /* The on-disk block number */
__be32 t_flags; /* See below */
__be32 t_blocknr_high; /* most-significant high 32bits. */
__be32 t_checksum; /* crc32c(uuid+seq+block) */
} journal_block_tag3_t;
typedef struct journal_block_tag_s
{
__be32 t_blocknr; /* The on-disk block number */
......@@ -187,9 +199,6 @@ typedef struct journal_block_tag_s
__be32 t_blocknr_high; /* most-significant high 32bits. */
} journal_block_tag_t;
#define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high))
#define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t))
/* Tail of descriptor block, for checksumming */
struct jbd2_journal_block_tail {
__be32 t_checksum; /* crc32c(uuid+descr_block) */
......@@ -284,6 +293,7 @@ typedef struct journal_superblock_s
#define JBD2_FEATURE_INCOMPAT_64BIT 0x00000002
#define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
#define JBD2_FEATURE_INCOMPAT_CSUM_V2 0x00000008
#define JBD2_FEATURE_INCOMPAT_CSUM_V3 0x00000010
/* Features known to this kernel version: */
#define JBD2_KNOWN_COMPAT_FEATURES JBD2_FEATURE_COMPAT_CHECKSUM
......@@ -291,7 +301,8 @@ typedef struct journal_superblock_s
#define JBD2_KNOWN_INCOMPAT_FEATURES (JBD2_FEATURE_INCOMPAT_REVOKE | \
JBD2_FEATURE_INCOMPAT_64BIT | \
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
JBD2_FEATURE_INCOMPAT_CSUM_V2)
JBD2_FEATURE_INCOMPAT_CSUM_V2 | \
JBD2_FEATURE_INCOMPAT_CSUM_V3)
#ifdef __KERNEL__
......@@ -1296,6 +1307,15 @@ static inline int tid_geq(tid_t x, tid_t y)
extern int jbd2_journal_blocks_per_page(struct inode *inode);
extern size_t journal_tag_bytes(journal_t *journal);
static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
{
if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) ||
JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
return 1;
return 0;
}
/*
* We reserve t_outstanding_credits >> JBD2_CONTROL_BLOCKS_SHIFT for
* transaction control blocks.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment