Commit 51ed42a8 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Many cleanups and bug fixes in ext4, especially for the fast commit
  feature.

  Also some performance improvements; in particular, improving IOPS and
  throughput on fast devices running Async Direct I/O by up to 20% by
  optimizing jbd2_transaction_committed()"

* tag 'ext4_for_linus-6.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (40 commits)
  ext4: make sure the first directory block is not a hole
  ext4: check dot and dotdot of dx_root before making dir indexed
  ext4: sanity check for NULL pointer after ext4_force_shutdown
  jbd2: increase maximum transaction size
  jbd2: drop pointless shrinker batch initialization
  jbd2: avoid infinite transaction commit loop
  jbd2: precompute number of transaction descriptor blocks
  jbd2: make jbd2_journal_get_max_txn_bufs() internal
  jbd2: avoid mount failed when commit block is partial submitted
  ext4: avoid writing unitialized memory to disk in EA inodes
  ext4: don't track ranges in fast_commit if inode has inlined data
  ext4: fix possible tid_t sequence overflows
  ext4: use ext4_update_inode_fsync_trans() helper in inode creation
  ext4: add missing MODULE_DESCRIPTION()
  jbd2: add missing MODULE_DESCRIPTION()
  ext4: use memtostr_pad() for s_volume_name
  jbd2: speed up jbd2_transaction_committed()
  ext4: make ext4_da_map_blocks() buffer_head unaware
  ext4: make ext4_insert_delayed_block() insert multi-blocks
  ext4: factor out a helper to check the cluster allocation state
  ...
parents dddebdec f9ca5159
......@@ -2184,6 +2184,8 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to)
struct buffer_head *bh, *head;
bh = head = folio_buffers(folio);
if (!bh)
return;
blocksize = bh->b_size;
block_start = 0;
......
......@@ -72,7 +72,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks,
{
struct ext4_system_zone *new_entry, *entry;
struct rb_node **n = &system_blks->root.rb_node, *node;
struct rb_node *parent = NULL, *new_node = NULL;
struct rb_node *parent = NULL, *new_node;
while (*n) {
parent = *n;
......
......@@ -1347,7 +1347,7 @@ struct ext4_super_block {
/*60*/ __le32 s_feature_incompat; /* incompatible feature set */
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
/*78*/ char s_volume_name[EXT4_LABEL_MAX]; /* volume name */
/*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
......
......@@ -310,6 +310,8 @@ void ext4_es_find_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es)
{
es->es_lblk = es->es_len = es->es_pblk = 0;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
......@@ -2052,34 +2054,49 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
}
/*
* ext4_es_insert_delayed_block - adds a delayed block to the extents status
* tree, adding a pending reservation where
* needed
* ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
* status tree, adding a pending reservation
* where needed
*
* @inode - file containing the newly added block
* @lblk - logical block to be added
* @allocated - indicates whether a physical cluster has been allocated for
* the logical cluster that contains the block
* @lblk - start logical block to be added
* @len - length of blocks to be added
* @lclu_allocated/end_allocated - indicates whether a physical cluster has
* been allocated for the logical cluster
* that contains the start/end block. Note that
* end_allocated should always be set to false
* if the start and the end block are in the
* same cluster
*/
void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated)
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated,
bool end_allocated)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
int err1 = 0, err2 = 0, err3 = 0;
struct extent_status *es1 = NULL;
struct extent_status *es2 = NULL;
struct pending_reservation *pr = NULL;
struct pending_reservation *pr1 = NULL;
struct pending_reservation *pr2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len)
return;
WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
end_allocated);
newes.es_lblk = lblk;
newes.es_len = 1;
newes.es_len = len;
ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
trace_ext4_es_insert_delayed_block(inode, &newes, allocated);
trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
end_allocated);
ext4_es_insert_extent_check(inode, &newes);
......@@ -2088,11 +2105,15 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
es1 = __es_alloc_extent(true);
if ((err1 || err2) && !es2)
es2 = __es_alloc_extent(true);
if ((err1 || err2 || err3) && allocated && !pr)
pr = __alloc_pending(true);
if (err1 || err2 || err3) {
if (lclu_allocated && !pr1)
pr1 = __alloc_pending(true);
if (end_allocated && !pr2)
pr2 = __alloc_pending(true);
}
write_lock(&EXT4_I(inode)->i_es_lock);
err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
if (err1 != 0)
goto error;
/* Free preallocated extent if it didn't get used. */
......@@ -2112,13 +2133,22 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
es2 = NULL;
}
if (allocated) {
err3 = __insert_pending(inode, lblk, &pr);
if (lclu_allocated) {
err3 = __insert_pending(inode, lblk, &pr1);
if (err3 != 0)
goto error;
if (pr) {
__free_pending(pr);
pr = NULL;
if (pr1) {
__free_pending(pr1);
pr1 = NULL;
}
}
if (end_allocated) {
err3 = __insert_pending(inode, end, &pr2);
if (err3 != 0)
goto error;
if (pr2) {
__free_pending(pr2);
pr2 = NULL;
}
}
error:
......
......@@ -249,8 +249,9 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated);
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, bool lclu_allocated,
bool end_allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
......
......@@ -353,7 +353,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
read_unlock(&sbi->s_journal->j_state_lock);
}
spin_lock(&sbi->s_fc_lock);
if (sbi->s_fc_ineligible_tid < tid)
if (tid_gt(tid, sbi->s_fc_ineligible_tid))
sbi->s_fc_ineligible_tid = tid;
spin_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
......@@ -649,6 +649,12 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
if (ext4_has_inline_data(inode)) {
ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR,
handle);
return;
}
args.start = start;
args.end = end;
......@@ -1207,7 +1213,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
if (ret == -EALREADY) {
/* There was an ongoing commit, check if we need to restart */
if (atomic_read(&sbi->s_fc_subtid) <= subtid &&
commit_tid > journal->j_commit_sequence)
tid_gt(commit_tid, journal->j_commit_sequence))
goto restart_fc;
ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0,
commit_tid);
......@@ -1282,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_del_init(&iter->i_fc_list);
ext4_clear_inode_state(&iter->vfs_inode,
EXT4_STATE_FC_COMMITTING);
if (iter->i_sync_tid <= tid)
if (tid_geq(tid, iter->i_sync_tid))
ext4_fc_reset_inode(&iter->vfs_inode);
/* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
smp_mb();
......@@ -1313,7 +1319,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
list_splice_init(&sbi->s_fc_q[FC_Q_STAGING],
&sbi->s_fc_q[FC_Q_MAIN]);
if (tid >= sbi->s_fc_ineligible_tid) {
if (tid_geq(tid, sbi->s_fc_ineligible_tid)) {
sbi->s_fc_ineligible_tid = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
}
......
......@@ -1336,10 +1336,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
}
}
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
......
......@@ -1410,7 +1410,11 @@ int ext4_inlinedir_to_tree(struct file *dir_file,
hinfo->hash = EXT4_DIRENT_HASH(de);
hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de);
} else {
ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
err = ext4fs_dirhash(dir, de->name, de->name_len, hinfo);
if (err) {
ret = err;
goto out;
}
}
if ((hinfo->hash < start_hash) ||
((hinfo->hash == start_hash) &&
......
......@@ -279,4 +279,5 @@ static struct kunit_suite ext4_inode_test_suite = {
kunit_test_suites(&ext4_inode_test_suite);
MODULE_DESCRIPTION("KUnit test of ext4 inode timestamp decoding");
MODULE_LICENSE("GPL v2");
This diff is collapsed.
......@@ -1151,7 +1151,7 @@ static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label
BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX);
lock_buffer(sbi->s_sbh);
strscpy_pad(label, sbi->s_es->s_volume_name);
memtostr_pad(label, sbi->s_es->s_volume_name);
unlock_buffer(sbi->s_sbh);
if (copy_to_user(user_label, label, sizeof(label)))
......
......@@ -151,10 +151,11 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
if (!bh && (type == INDEX || type == DIRENT_HTREE)) {
/* The first directory block must not be a hole. */
if (!bh && (type == INDEX || type == DIRENT_HTREE || block == 0)) {
ext4_error_inode(inode, func, line, block,
"Directory hole found for htree %s block",
(type == INDEX) ? "index" : "leaf");
"Directory hole found for htree %s block %u",
(type == INDEX) ? "index" : "leaf", block);
return ERR_PTR(-EFSCORRUPTED);
}
if (!bh)
......@@ -2172,6 +2173,52 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
return err ? err : err2;
}
static bool ext4_check_dx_root(struct inode *dir, struct dx_root *root)
{
struct fake_dirent *fde;
const char *error_msg;
unsigned int rlen;
unsigned int blocksize = dir->i_sb->s_blocksize;
char *blockend = (char *)root + dir->i_sb->s_blocksize;
fde = &root->dot;
if (unlikely(fde->name_len != 1)) {
error_msg = "invalid name_len for '.'";
goto corrupted;
}
if (unlikely(strncmp(root->dot_name, ".", fde->name_len))) {
error_msg = "invalid name for '.'";
goto corrupted;
}
rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
if (unlikely((char *)fde + rlen >= blockend)) {
error_msg = "invalid rec_len for '.'";
goto corrupted;
}
fde = &root->dotdot;
if (unlikely(fde->name_len != 2)) {
error_msg = "invalid name_len for '..'";
goto corrupted;
}
if (unlikely(strncmp(root->dotdot_name, "..", fde->name_len))) {
error_msg = "invalid name for '..'";
goto corrupted;
}
rlen = ext4_rec_len_from_disk(fde->rec_len, blocksize);
if (unlikely((char *)fde + rlen >= blockend)) {
error_msg = "invalid rec_len for '..'";
goto corrupted;
}
return true;
corrupted:
EXT4_ERROR_INODE(dir, "Corrupt dir, %s, running e2fsck is recommended",
error_msg);
return false;
}
/*
* This converts a one block unindexed directory to a 3 block indexed
* directory, and adds the dentry to the indexed directory.
......@@ -2206,17 +2253,17 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
brelse(bh);
return retval;
}
root = (struct dx_root *) bh->b_data;
if (!ext4_check_dx_root(dir, root)) {
brelse(bh);
return -EFSCORRUPTED;
}
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
return -EFSCORRUPTED;
}
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
/* Allocate new block for the 0th block's dirents */
......@@ -3038,10 +3085,7 @@ bool ext4_empty_dir(struct inode *inode)
EXT4_ERROR_INODE(inode, "invalid size");
return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh))
return false;
......@@ -3483,10 +3527,7 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle,
struct ext4_dir_entry_2 *de;
unsigned int offset;
/* The first directory block must not be a hole, so
* treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
bh = ext4_read_dirblock(inode, 0, EITHER);
if (IS_ERR(bh)) {
*retval = PTR_ERR(bh);
return NULL;
......
......@@ -1327,6 +1327,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_group_desc_free(sbi);
ext4_flex_groups_free(sbi);
WARN_ON_ONCE(!(sbi->s_mount_state & EXT4_ERROR_FS) &&
percpu_counter_sum(&sbi->s_dirtyclusters_counter));
ext4_percpu_param_destroy(sbi);
#ifdef CONFIG_QUOTA
for (int i = 0; i < EXT4_MAXQUOTAS; i++)
......@@ -1457,7 +1460,8 @@ static void ext4_destroy_inode(struct inode *inode)
dump_stack();
}
if (EXT4_I(inode)->i_reserved_data_blocks)
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ERROR_FS) &&
WARN_ON_ONCE(EXT4_I(inode)->i_reserved_data_blocks))
ext4_msg(inode->i_sb, KERN_ERR,
"Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!",
inode->i_ino, EXT4_I(inode),
......
......@@ -1433,6 +1433,12 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
goto out;
memcpy(bh->b_data, buf, csize);
/*
* Zero out block tail to avoid writing uninitialized memory
* to disk.
*/
if (csize < blocksize)
memset(bh->b_data + csize, 0, blocksize - csize);
set_buffer_uptodate(bh);
ext4_handle_dirty_metadata(handle, ea_inode, bh);
......
......@@ -353,7 +353,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
struct buffer_head *descriptor;
struct buffer_head **wbuf = journal->j_wbuf;
int bufs;
int flags;
int escape;
int err;
unsigned long long blocknr;
ktime_t start_time;
......@@ -660,10 +660,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
set_bit(BH_JWrite, &jh2bh(jh)->b_state);
JBUFFER_TRACE(jh, "ph3: write metadata");
flags = jbd2_journal_write_metadata_buffer(commit_transaction,
escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
if (flags < 0) {
jbd2_journal_abort(journal, flags);
if (escape < 0) {
jbd2_journal_abort(journal, escape);
continue;
}
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
......@@ -672,7 +672,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
buffer */
tag_flag = 0;
if (flags & 1)
if (escape)
tag_flag |= JBD2_FLAG_ESCAPE;
if (!first_tag)
tag_flag |= JBD2_FLAG_SAME_UUID;
......@@ -766,7 +766,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (first_block < journal->j_tail)
freed += journal->j_last - journal->j_first;
/* Update tail only if we free significant amount of space */
if (freed < jbd2_journal_get_max_txn_bufs(journal))
if (freed < journal->j_max_transaction_buffers)
update_tail = 0;
}
J_ASSERT(commit_transaction->t_state == T_COMMIT);
......@@ -1107,7 +1107,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
commit_transaction->t_state = T_COMMIT_CALLBACK;
J_ASSERT(commit_transaction == journal->j_committing_transaction);
journal->j_commit_sequence = commit_transaction->t_tid;
WRITE_ONCE(journal->j_commit_sequence, commit_transaction->t_tid);
journal->j_committing_transaction = NULL;
commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
......
......@@ -220,19 +220,12 @@ static int kjournald2(void *arg)
* so we don't sleep
*/
DEFINE_WAIT(wait);
int should_sleep = 1;
prepare_to_wait(&journal->j_wait_commit, &wait,
TASK_INTERRUPTIBLE);
if (journal->j_commit_sequence != journal->j_commit_request)
should_sleep = 0;
transaction = journal->j_running_transaction;
if (transaction && time_after_eq(jiffies,
transaction->t_expires))
should_sleep = 0;
if (journal->j_flags & JBD2_UNMOUNT)
should_sleep = 0;
if (should_sleep) {
if (transaction == NULL ||
time_before(jiffies, transaction->t_expires)) {
write_unlock(&journal->j_state_lock);
schedule();
write_lock(&journal->j_state_lock);
......@@ -316,11 +309,8 @@ static void journal_kill_thread(journal_t *journal)
*
* Return value:
* <0: Error
* >=0: Finished OK
*
* On success:
* Bit 0 set == escape performed on the data
* Bit 1 set == buffer copy-out performed (kfree the data after IO)
* =0: Finished OK without escape
* =1: Finished OK with escape
*/
int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
......@@ -328,7 +318,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct buffer_head **bh_out,
sector_t blocknr)
{
int need_copy_out = 0;
int done_copy_out = 0;
int do_escape = 0;
char *mapped_data;
......@@ -355,7 +344,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
atomic_set(&new_bh->b_count, 1);
spin_lock(&jh_in->b_state_lock);
repeat:
/*
* If a new transaction has already done a buffer copy-out, then
* we use that version of the data for the commit.
......@@ -365,8 +353,8 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
} else {
new_folio = jh2bh(jh_in)->b_folio;
new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data);
new_folio = bh_in->b_folio;
new_offset = offset_in_folio(new_folio, bh_in->b_data);
}
mapped_data = kmap_local_folio(new_folio, new_offset);
......@@ -383,54 +371,52 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
/*
* Check for escaping
*/
if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) {
need_copy_out = 1;
if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER))
do_escape = 1;
}
kunmap_local(mapped_data);
/*
* Do we need to do a data copy?
*/
if (need_copy_out && !done_copy_out) {
if (do_escape && !done_copy_out) {
char *tmp;
spin_unlock(&jh_in->b_state_lock);
tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
if (!tmp) {
brelse(new_bh);
free_buffer_head(new_bh);
return -ENOMEM;
}
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
goto repeat;
goto copy_done;
}
jh_in->b_frozen_data = tmp;
memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size);
new_folio = virt_to_folio(tmp);
new_offset = offset_in_folio(new_folio, tmp);
done_copy_out = 1;
/*
* This isn't strictly necessary, as we're using frozen
* data for the escaping, but it keeps consistency with
* b_frozen_data usage.
*/
jh_in->b_frozen_triggers = jh_in->b_triggers;
copy_done:
new_folio = virt_to_folio(jh_in->b_frozen_data);
new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data);
done_copy_out = 1;
}
/*
* Did we need to do an escaping? Now we've done all the
* copying, we can finally do so.
* b_frozen_data is from jbd2_alloc() which always provides an
* address from the direct kernels mapping.
*/
if (do_escape) {
mapped_data = kmap_local_folio(new_folio, new_offset);
*((unsigned int *)mapped_data) = 0;
kunmap_local(mapped_data);
}
if (do_escape)
*((unsigned int *)jh_in->b_frozen_data) = 0;
folio_set_bh(new_bh, new_folio, new_offset);
new_bh->b_size = bh_in->b_size;
......@@ -454,7 +440,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
set_buffer_shadow(bh_in);
spin_unlock(&jh_in->b_state_lock);
return do_escape | (done_copy_out << 1);
return do_escape;
}
/*
......@@ -789,17 +775,7 @@ EXPORT_SYMBOL(jbd2_fc_end_commit_fallback);
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
int ret = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid)
ret = 0;
if (journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid)
ret = 0;
read_unlock(&journal->j_state_lock);
return ret;
return tid_geq(READ_ONCE(journal->j_commit_sequence), tid);
}
EXPORT_SYMBOL(jbd2_transaction_committed);
......@@ -1451,6 +1427,48 @@ static int journal_revoke_records_per_block(journal_t *journal)
return space / record_size;
}
static int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
return (journal->j_total_len - journal->j_fc_wbufsize) / 3;
}
/*
* Base amount of descriptor blocks we reserve for each transaction.
*/
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
int tag_space = journal->j_blocksize - sizeof(journal_header_t);
int tags_per_block;
/* Subtract UUID */
tag_space -= 16;
if (jbd2_journal_has_csum_v2or3(journal))
tag_space -= sizeof(struct jbd2_journal_block_tail);
/* Commit code leaves a slack space of 16 bytes at the end of block */
tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
/*
* Revoke descriptors are accounted separately so we need to reserve
* space for commit block and normal transaction descriptor blocks.
*/
return 1 + DIV_ROUND_UP(jbd2_journal_get_max_txn_bufs(journal),
tags_per_block);
}
/*
* Initialize number of blocks each transaction reserves for its bookkeeping
* and maximum number of blocks a transaction can use. This needs to be called
* after the journal size and the fastcommit area size are initialized.
*/
static void jbd2_journal_init_transaction_limits(journal_t *journal)
{
journal->j_revoke_records_per_block =
journal_revoke_records_per_block(journal);
journal->j_transaction_overhead_buffers =
jbd2_descriptor_blocks_per_trans(journal);
journal->j_max_transaction_buffers =
jbd2_journal_get_max_txn_bufs(journal);
}
/*
* Load the on-disk journal superblock and read the key fields into the
* journal_t.
......@@ -1492,8 +1510,8 @@ static int journal_load_superblock(journal_t *journal)
if (jbd2_journal_has_csum_v2or3(journal))
journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
sizeof(sb->s_uuid));
journal->j_revoke_records_per_block =
journal_revoke_records_per_block(journal);
/* After journal features are set, we can compute transaction limits */
jbd2_journal_init_transaction_limits(journal);
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
......@@ -1599,7 +1617,6 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan;
journal->j_shrinker->count_objects = jbd2_journal_shrink_count;
journal->j_shrinker->batch = journal->j_max_transaction_buffers;
journal->j_shrinker->private_data = journal;
shrinker_register(journal->j_shrinker);
......@@ -1743,8 +1760,6 @@ static int journal_reset(journal_t *journal)
journal->j_commit_sequence = journal->j_transaction_sequence - 1;
journal->j_commit_request = journal->j_commit_sequence;
journal->j_max_transaction_buffers = jbd2_journal_get_max_txn_bufs(journal);
/*
* Now that journal recovery is done, turn fast commits off here. This
* way, if fast commit was enabled before the crash but if now FS has
......@@ -2285,8 +2300,6 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
journal->j_fc_first = journal->j_last + 1;
journal->j_fc_off = 0;
journal->j_free = journal->j_last - journal->j_first;
journal->j_max_transaction_buffers =
jbd2_journal_get_max_txn_bufs(journal);
return 0;
}
......@@ -2374,8 +2387,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_feature_ro_compat |= cpu_to_be32(ro);
sb->s_feature_incompat |= cpu_to_be32(incompat);
unlock_buffer(journal->j_sb_buffer);
journal->j_revoke_records_per_block =
journal_revoke_records_per_block(journal);
jbd2_journal_init_transaction_limits(journal);
return 1;
#undef COMPAT_FEATURE_ON
......@@ -2406,8 +2418,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
sb->s_feature_compat &= ~cpu_to_be32(compat);
sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
sb->s_feature_incompat &= ~cpu_to_be32(incompat);
journal->j_revoke_records_per_block =
journal_revoke_records_per_block(journal);
jbd2_journal_init_transaction_limits(journal);
}
EXPORT_SYMBOL(jbd2_journal_clear_features);
......
......@@ -19,6 +19,7 @@
#include <linux/errno.h>
#include <linux/crc32.h>
#include <linux/blkdev.h>
#include <linux/string_choices.h>
#endif
/*
......@@ -374,7 +375,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
be32_to_cpu(journal->j_superblock->s_sequence);
jbd2_debug(1,
"JBD2: ignoring %d transaction%s from the journal.\n",
dropped, (dropped == 1) ? "" : "s");
dropped, str_plural(dropped));
#endif
journal->j_transaction_sequence = ++info.end_transaction;
journal->j_head = info.head_block;
......@@ -443,6 +444,27 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
return provided == cpu_to_be32(calculated);
}
static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
{
struct commit_header *h;
__be32 provided;
__u32 calculated;
void *tmpbuf;
tmpbuf = kzalloc(j->j_blocksize, GFP_KERNEL);
if (!tmpbuf)
return false;
memcpy(tmpbuf, buf, sizeof(struct commit_header));
h = tmpbuf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
kfree(tmpbuf);
return provided == cpu_to_be32(calculated);
}
static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
journal_block_tag3_t *tag3,
void *buf, __u32 sequence)
......@@ -810,6 +832,13 @@ static int do_one_pass(journal_t *journal,
if (pass == PASS_SCAN &&
!jbd2_commit_block_csum_verify(journal,
bh->b_data)) {
if (jbd2_commit_block_csum_verify_partial(
journal,
bh->b_data)) {
pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
next_commit_ID, next_log_block);
goto chksum_ok;
}
chksum_error:
if (commit_time < last_trans_commit_time)
goto ignore_crc_mismatch;
......@@ -824,6 +853,7 @@ static int do_one_pass(journal_t *journal,
}
}
if (pass == PASS_SCAN) {
chksum_ok:
last_trans_commit_time = commit_time;
head_block = next_log_block;
}
......@@ -843,6 +873,7 @@ static int do_one_pass(journal_t *journal,
next_log_block);
need_check_commit_time = true;
}
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
if (pass != PASS_REVOKE) {
......
......@@ -62,28 +62,6 @@ void jbd2_journal_free_transaction(transaction_t *transaction)
kmem_cache_free(transaction_cache, transaction);
}
/*
* Base amount of descriptor blocks we reserve for each transaction.
*/
static int jbd2_descriptor_blocks_per_trans(journal_t *journal)
{
int tag_space = journal->j_blocksize - sizeof(journal_header_t);
int tags_per_block;
/* Subtract UUID */
tag_space -= 16;
if (jbd2_journal_has_csum_v2or3(journal))
tag_space -= sizeof(struct jbd2_journal_block_tail);
/* Commit code leaves a slack space of 16 bytes at the end of block */
tags_per_block = (tag_space - 16) / journal_tag_bytes(journal);
/*
* Revoke descriptors are accounted separately so we need to reserve
* space for commit block and normal transaction descriptor blocks.
*/
return 1 + DIV_ROUND_UP(journal->j_max_transaction_buffers,
tags_per_block);
}
/*
* jbd2_get_transaction: obtain a new transaction_t object.
*
......@@ -109,7 +87,7 @@ static void jbd2_get_transaction(journal_t *journal,
transaction->t_expires = jiffies + journal->j_commit_interval;
atomic_set(&transaction->t_updates, 0);
atomic_set(&transaction->t_outstanding_credits,
jbd2_descriptor_blocks_per_trans(journal) +
journal->j_transaction_overhead_buffers +
atomic_read(&journal->j_reserved_credits));
atomic_set(&transaction->t_outstanding_revokes, 0);
atomic_set(&transaction->t_handle_count, 0);
......@@ -213,6 +191,13 @@ static void sub_reserved_credits(journal_t *journal, int blocks)
wake_up(&journal->j_wait_reserved);
}
/* Maximum number of blocks for user transaction payload */
static int jbd2_max_user_trans_buffers(journal_t *journal)
{
return journal->j_max_transaction_buffers -
journal->j_transaction_overhead_buffers;
}
/*
* Wait until we can add credits for handle to the running transaction. Called
* with j_state_lock held for reading. Returns 0 if handle joined the running
......@@ -262,12 +247,12 @@ __must_hold(&journal->j_state_lock)
* big to fit this handle? Wait until reserved credits are freed.
*/
if (atomic_read(&journal->j_reserved_credits) + total >
journal->j_max_transaction_buffers) {
jbd2_max_user_trans_buffers(journal)) {
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + total <=
journal->j_max_transaction_buffers);
jbd2_max_user_trans_buffers(journal));
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
......@@ -307,14 +292,14 @@ __must_hold(&journal->j_state_lock)
needed = atomic_add_return(rsv_blocks, &journal->j_reserved_credits);
/* We allow at most half of a transaction to be reserved */
if (needed > journal->j_max_transaction_buffers / 2) {
if (needed > jbd2_max_user_trans_buffers(journal) / 2) {
sub_reserved_credits(journal, rsv_blocks);
atomic_sub(total, &t->t_outstanding_credits);
read_unlock(&journal->j_state_lock);
jbd2_might_wait_for_commit(journal);
wait_event(journal->j_wait_reserved,
atomic_read(&journal->j_reserved_credits) + rsv_blocks
<= journal->j_max_transaction_buffers / 2);
<= jbd2_max_user_trans_buffers(journal) / 2);
__acquire(&journal->j_state_lock); /* fake out sparse */
return 1;
}
......@@ -344,12 +329,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
* size and limit the number of total credits to not exceed maximum
* transaction size per operation.
*/
if ((rsv_blocks > journal->j_max_transaction_buffers / 2) ||
(rsv_blocks + blocks > journal->j_max_transaction_buffers)) {
if (rsv_blocks > jbd2_max_user_trans_buffers(journal) / 2 ||
rsv_blocks + blocks > jbd2_max_user_trans_buffers(journal)) {
printk(KERN_ERR "JBD2: %s wants too many credits "
"credits:%d rsv_credits:%d max:%d\n",
current->comm, blocks, rsv_blocks,
journal->j_max_transaction_buffers);
jbd2_max_user_trans_buffers(journal));
WARN_ON(1);
return -ENOSPC;
}
......
......@@ -1085,6 +1085,13 @@ struct journal_s
*/
int j_revoke_records_per_block;
/**
* @j_transaction_overhead:
*
* Number of blocks each transaction needs for its own bookkeeping
*/
int j_transaction_overhead_buffers;
/**
* @j_commit_interval:
*
......@@ -1660,11 +1667,6 @@ int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode);
int jbd2_fc_wait_bufs(journal_t *journal, int num_blks);
int jbd2_fc_release_bufs(journal_t *journal);
static inline int jbd2_journal_get_max_txn_bufs(journal_t *journal)
{
return (journal->j_total_len - journal->j_fc_wbufsize) / 4;
}
/*
* is_journal_abort
*
......
......@@ -1246,14 +1246,15 @@ TRACE_EVENT(ext4_da_update_reserve_space,
);
TRACE_EVENT(ext4_da_reserve_space,
TP_PROTO(struct inode *inode),
TP_PROTO(struct inode *inode, int nr_resv),
TP_ARGS(inode),
TP_ARGS(inode, nr_resv),
TP_STRUCT__entry(
__field( dev_t, dev )
__field( ino_t, ino )
__field( __u64, i_blocks )
__field( int, reserve_blocks )
__field( int, reserved_data_blocks )
__field( __u16, mode )
),
......@@ -1262,16 +1263,17 @@ TRACE_EVENT(ext4_da_reserve_space,
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->i_blocks = inode->i_blocks;
__entry->reserve_blocks = nr_resv;
__entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
__entry->mode = inode->i_mode;
),
TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu "
TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d"
"reserved_data_blocks %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->mode, __entry->i_blocks,
__entry->reserved_data_blocks)
__entry->reserve_blocks, __entry->reserved_data_blocks)
);
TRACE_EVENT(ext4_da_release_space,
......@@ -2478,11 +2480,11 @@ TRACE_EVENT(ext4_es_shrink,
__entry->scan_time, __entry->nr_skipped, __entry->retried)
);
TRACE_EVENT(ext4_es_insert_delayed_block,
TRACE_EVENT(ext4_es_insert_delayed_extent,
TP_PROTO(struct inode *inode, struct extent_status *es,
bool allocated),
bool lclu_allocated, bool end_allocated),
TP_ARGS(inode, es, allocated),
TP_ARGS(inode, es, lclu_allocated, end_allocated),
TP_STRUCT__entry(
__field( dev_t, dev )
......@@ -2491,7 +2493,8 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
__field( ext4_lblk_t, len )
__field( ext4_fsblk_t, pblk )
__field( char, status )
__field( bool, allocated )
__field( bool, lclu_allocated )
__field( bool, end_allocated )
),
TP_fast_assign(
......@@ -2501,16 +2504,17 @@ TRACE_EVENT(ext4_es_insert_delayed_block,
__entry->len = es->es_len;
__entry->pblk = ext4_es_show_pblock(es);
__entry->status = ext4_es_status(es);
__entry->allocated = allocated;
__entry->lclu_allocated = lclu_allocated;
__entry->end_allocated = end_allocated;
),
TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
"allocated %d",
"allocated %d %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long) __entry->ino,
__entry->lblk, __entry->len,
__entry->pblk, show_extent_status(__entry->status),
__entry->allocated)
__entry->lclu_allocated, __entry->end_allocated)
);
/* fsmap traces */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment