Commit 9b03992f authored by Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Fix some bugs in converting ext4 to use the new mount API, as well as
  more bug fixes and clean ups in the ext4 fast_commit feature (most
  notably, in the tracepoints).

  In the jbd2 layer, the t_handle_lock spinlock has been removed, with
  the last place where it was actually needed replaced with an atomic
  cmpxchg"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (35 commits)
  ext4: fix kernel doc warnings
  ext4: fix remaining two trace events to use same printk convention
  ext4: add commit tid info in ext4_fc_commit_start/stop trace events
  ext4: add commit_tid info in jbd debug log
  ext4: add transaction tid info in fc_track events
  ext4: add new trace event in ext4_fc_cleanup
  ext4: return early for non-eligible fast_commit track events
  ext4: do not call FC trace event in ext4_fc_commit() if FS does not support FC
  ext4: convert ext4_fc_track_dentry type events to use event class
  ext4: fix ext4_fc_stats trace point
  ext4: remove unused enum EXT4_FC_COMMIT_FAILED
  ext4: warn when dirtying page w/o buffers in data=journal mode
  doc: fixed a typo in ext4 documentation
  ext4: make mb_optimize_scan performance mount option work with extents
  ext4: make mb_optimize_scan option work with set/unset mount cmd
  ext4: don't BUG if someone dirty pages without asking ext4 first
  ext4: remove redundant assignment to variable split_flag1
  ext4: fix underflow in ext4_max_bitmap_size()
  ext4: fix ext4_mb_clear_bb() kernel-doc comment
  ext4: fix fs corruption when tring to remove a non-empty directory with IO error
  ...
parents 14705fda 919adbfe
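As an aside on the jbd2 change called out in the pull message: t_max_wait is now maintained with a compare-and-swap retry loop instead of t_handle_lock. The snippet below is a minimal standalone sketch of that "lock-free maximum" pattern, written with C11 atomics purely for illustration; it is not the kernel code, which uses the cmpxchg() helper on the transaction structure, and the update_max_wait()/max_wait names are made up for the example.

/*
 * Illustration only: publish a new maximum into a shared counter
 * without taking a lock, retrying the CAS until either our value
 * is installed or another thread has stored a larger one.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long max_wait;

static void update_max_wait(unsigned long newts)
{
        unsigned long oldts = atomic_load(&max_wait);

        while (oldts < newts &&
               !atomic_compare_exchange_weak(&max_wait, &oldts, newts))
                ;       /* a failed CAS refreshed oldts; re-check and retry */
}

int main(void)
{
        update_max_wait(10);
        update_max_wait(7);     /* smaller value: leaves the maximum alone */
        update_max_wait(25);
        printf("max_wait = %lu\n", atomic_load(&max_wait));     /* prints 25 */
        return 0;
}

The loop only retries while the candidate is still larger than the published maximum, so a concurrent writer that wins the race simply ends the loop early.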
@@ -39,7 +39,7 @@ For 32-bit filesystems, limits are as follows:
      - 4TiB
      - 8TiB
      - 16TiB
-     - 256PiB
+     - 256TiB
    * - Blocks Per Block Group
      - 8,192
      - 16,384
...
@@ -411,6 +411,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
  * ext4_read_block_bitmap_nowait()
  * @sb: super block
  * @block_group: given block group
+ * @ignore_locked: ignore locked buffers
  *
  * Read the bitmap for a given block_group,and validate the
  * bits for block/inode/inode tables are set in the bitmaps
...
@@ -292,15 +292,10 @@ void ext4_release_system_zone(struct super_block *sb)
 	call_rcu(&system_blks->rcu, ext4_destroy_system_zone);
 }

-/*
- * Returns 1 if the passed-in block region (start_blk,
- * start_blk+count) is valid; 0 if some part of the block region
- * overlaps with some other filesystem metadata blocks.
- */
-int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
-			   unsigned int count)
+int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+			ext4_fsblk_t start_blk, unsigned int count)
 {
-	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_system_blocks *system_blks;
 	struct ext4_system_zone *entry;
 	struct rb_node *n;
@@ -329,7 +324,9 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
 		else if (start_blk >= (entry->start_blk + entry->count))
 			n = n->rb_right;
 		else {
-			ret = (entry->ino == inode->i_ino);
+			ret = 0;
+			if (inode)
+				ret = (entry->ino == inode->i_ino);
 			break;
 		}
 	}
@@ -338,6 +335,17 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
 	return ret;
 }

+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with some other filesystem metadata blocks.
+ */
+int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk,
+			   unsigned int count)
+{
+	return ext4_sb_block_valid(inode->i_sb, inode, start_blk, count);
+}
+
 int ext4_check_blockref(const char *function, unsigned int line,
 			struct inode *inode, __le32 *p, unsigned int max)
 {
...
@@ -1046,6 +1046,8 @@ struct ext4_inode_info {

 	/* Fast commit related info */

+	/* For tracking dentry create updates */
+	struct list_head i_fc_dilist;
 	struct list_head i_fc_list;	/*
 					 * inodes that need fast commit
 					 * protected by sbi->s_fc_lock.
@@ -1279,7 +1281,7 @@ struct ext4_inode_info {
 #define ext4_find_next_zero_bit		find_next_zero_bit_le
 #define ext4_find_next_bit		find_next_bit_le

-extern void ext4_set_bits(void *bm, int cur, int len);
+extern void mb_set_bits(void *bm, int cur, int len);

 /*
  * Maximal mount counts between two filesystem checks
@@ -3707,6 +3709,9 @@ extern int ext4_inode_block_valid(struct inode *inode,
 				  unsigned int count);
 extern int ext4_check_blockref(const char *, unsigned int,
 			       struct inode *, __le32 *, unsigned int);
+extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
+			       ext4_fsblk_t start_blk, unsigned int count);

 /* extents.c */
 struct ext4_ext_path;
...
@@ -3368,7 +3368,6 @@ static int ext4_split_extent(handle_t *handle,
 		return -EFSCORRUPTED;
 	}
 	unwritten = ext4_ext_is_unwritten(ex);
-	split_flag1 = 0;
 	if (map->m_lblk >= ee_block) {
 		split_flag1 = split_flag & EXT4_EXT_DATA_VALID2;
...
This diff is collapsed.
@@ -93,7 +93,6 @@ enum {
 	EXT4_FC_REASON_RENAME_DIR,
 	EXT4_FC_REASON_FALLOC_RANGE,
 	EXT4_FC_REASON_INODE_JOURNAL_DATA,
-	EXT4_FC_COMMIT_FAILED,
 	EXT4_FC_REASON_MAX
 };

@@ -109,6 +108,7 @@ struct ext4_fc_dentry_update {
 	struct qstr fcd_name;	/* Dirent name */
 	unsigned char fcd_iname[DNAME_INLINE_LEN];	/* Dirent name string */
 	struct list_head fcd_list;
+	struct list_head fcd_dilist;
 };

 struct ext4_fc_stats {
...
@@ -1783,19 +1783,20 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
 	void *inline_pos;
 	unsigned int offset;
 	struct ext4_dir_entry_2 *de;
-	bool ret = true;
+	bool ret = false;

 	err = ext4_get_inode_loc(dir, &iloc);
 	if (err) {
 		EXT4_ERROR_INODE_ERR(dir, -err,
 				     "error %d getting inode %lu block",
 				     err, dir->i_ino);
-		return true;
+		return false;
 	}

 	down_read(&EXT4_I(dir)->xattr_sem);
 	if (!ext4_has_inline_data(dir)) {
 		*has_inline_data = 0;
+		ret = true;
 		goto out;
 	}

@@ -1804,7 +1805,6 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
 		ext4_warning(dir->i_sb,
 			     "bad inline directory (dir #%lu) - no `..'",
 			     dir->i_ino);
-		ret = true;
 		goto out;
 	}

@@ -1823,16 +1823,15 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
 					 dir->i_ino, le32_to_cpu(de->inode),
 					 le16_to_cpu(de->rec_len), de->name_len,
 					 inline_size);
-			ret = true;
 			goto out;
 		}
 		if (le32_to_cpu(de->inode)) {
-			ret = false;
 			goto out;
 		}
 		offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
 	}

+	ret = true;
 out:
 	up_read(&EXT4_I(dir)->xattr_sem);
 	brelse(iloc.bh);
...
@@ -1993,6 +1993,15 @@ static int ext4_writepage(struct page *page,
 	else
 		len = PAGE_SIZE;

+	/* Should never happen but for bugs in other kernel subsystems */
+	if (!page_has_buffers(page)) {
+		ext4_warning_inode(inode,
+		   "page %lu does not have buffers attached", page->index);
+		ClearPageDirty(page);
+		unlock_page(page);
+		return 0;
+	}
+
 	page_bufs = page_buffers(page);
 	/*
 	 * We cannot do block allocation or other extent handling in this
@@ -2594,6 +2603,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
 			wait_on_page_writeback(page);
 			BUG_ON(PageWriteback(page));

+			/*
+			 * Should never happen but for buggy code in
+			 * other subsystems that call
+			 * set_page_dirty() without properly warning
+			 * the file system first.  See [1] for more
+			 * information.
+			 *
+			 * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+			 */
+			if (!page_has_buffers(page)) {
+				ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+				ClearPageDirty(page);
+				unlock_page(page);
+				continue;
+			}
+
 			if (mpd->map.m_len == 0)
 				mpd->first_page = page->index;
 			mpd->next_page = page->index + 1;
@@ -3548,10 +3573,11 @@ const struct iomap_ops ext4_iomap_report_ops = {
 };

 /*
- * Pages can be marked dirty completely asynchronously from ext4's journalling
- * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do
- * much here because ->set_page_dirty is called under VFS locks.  The page is
- * not necessarily locked.
+ * Whenever the page is being dirtied, corresponding buffers should already be
+ * attached to the transaction (we take care of this in ext4_page_mkwrite() and
+ * ext4_write_begin()). However we cannot move buffers to dirty transaction
+ * lists here because ->set_page_dirty is called under VFS locks and the page
+ * is not necessarily locked.
  *
  * We cannot just dirty the page and leave attached buffers clean, because the
  * buffers' dirty state is "definitive".  We cannot just set the buffers dirty
@@ -3562,6 +3588,7 @@ const struct iomap_ops ext4_iomap_report_ops = {
  */
 static int ext4_journalled_set_page_dirty(struct page *page)
 {
+	WARN_ON_ONCE(!page_has_buffers(page));
 	SetPageChecked(page);
 	return __set_page_dirty_nobuffers(page);
 }
...
@@ -269,7 +269,7 @@ int ext4_update_superblocks_fn(struct super_block *sb,
 	return err ? err : 0;
 }

-/**
+/*
  * Swap memory between @a and @b for @len bytes.
  *
  * @a:          pointer to first memory area
@@ -290,7 +290,7 @@ static void memswap(void *a, void *b, size_t len)
 	}
 }

-/**
+/*
  * Swap i_data and associated attributes between @inode1 and @inode2.
  * This function is used for the primary swap between inode1 and inode2
  * and also to revert this primary swap in case of errors.
@@ -344,7 +344,7 @@ void ext4_reset_inode_seed(struct inode *inode)
 	ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
 }

-/**
+/*
  * Swap the information from the given @inode and the inode
  * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other
  * important fields of the inodes.
...
This diff is collapsed.
@@ -2997,14 +2997,14 @@ bool ext4_empty_dir(struct inode *inode)
 	if (inode->i_size < ext4_dir_rec_len(1, NULL) +
 					ext4_dir_rec_len(2, NULL)) {
 		EXT4_ERROR_INODE(inode, "invalid size");
-		return true;
+		return false;
 	}
 	/* The first directory block must not be a hole,
 	 * so treat it as DIRENT_HTREE
 	 */
 	bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
 	if (IS_ERR(bh))
-		return true;
+		return false;

 	de = (struct ext4_dir_entry_2 *) bh->b_data;
 	if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -3012,7 +3012,7 @@ bool ext4_empty_dir(struct inode *inode)
 	    le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
 		ext4_warning_inode(inode, "directory missing '.'");
 		brelse(bh);
-		return true;
+		return false;
 	}
 	offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
 	de = ext4_next_entry(de, sb->s_blocksize);
@@ -3021,7 +3021,7 @@ bool ext4_empty_dir(struct inode *inode)
 	    le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
 		ext4_warning_inode(inode, "directory missing '..'");
 		brelse(bh);
-		return true;
+		return false;
 	}
 	offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
 	while (offset < inode->i_size) {
@@ -3035,7 +3035,7 @@ bool ext4_empty_dir(struct inode *inode)
 				continue;
 			}
 			if (IS_ERR(bh))
-				return true;
+				return false;
 		}
 		de = (struct ext4_dir_entry_2 *) (bh->b_data +
 					(offset & (sb->s_blocksize - 1)));
@@ -3891,12 +3891,19 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
 		ext4_fc_mark_ineligible(old.inode->i_sb,
 			EXT4_FC_REASON_RENAME_DIR, handle);
 	} else {
+		struct super_block *sb = old.inode->i_sb;
+
 		if (new.inode)
 			ext4_fc_track_unlink(handle, new.dentry);
-		__ext4_fc_track_link(handle, old.inode, new.dentry);
-		__ext4_fc_track_unlink(handle, old.inode, old.dentry);
-		if (whiteout)
-			__ext4_fc_track_create(handle, whiteout, old.dentry);
+		if (test_opt2(sb, JOURNAL_FAST_COMMIT) &&
+		    !(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
+		    !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE))) {
+			__ext4_fc_track_link(handle, old.inode, new.dentry);
+			__ext4_fc_track_unlink(handle, old.inode, old.dentry);
+			if (whiteout)
+				__ext4_fc_track_create(handle, whiteout,
+						       old.dentry);
+		}
 	}

 	if (new.inode) {
...
@@ -14,6 +14,7 @@

 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>

 #include "ext4_jbd2.h"

@@ -483,7 +484,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 		}
 		ext4_debug("mark block bitmap %#04llx (+%llu/%u)\n",
 			   first_cluster, first_cluster - start, count2);
-		ext4_set_bits(bh->b_data, first_cluster - start, count2);
+		mb_set_bits(bh->b_data, first_cluster - start, count2);
 		err = ext4_handle_dirty_metadata(handle, NULL, bh);
 		brelse(bh);
@@ -632,7 +633,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
 		if (overhead != 0) {
 			ext4_debug("mark backup superblock %#04llx (+0)\n",
 				   start);
-			ext4_set_bits(bh->b_data, 0,
+			mb_set_bits(bh->b_data, 0,
 				      EXT4_NUM_B2C(sbi, overhead));
 		}
 		ext4_mark_bitmap_end(EXT4_B2C(sbi, group_data[i].blocks_count),
@@ -2100,7 +2101,7 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
 	 */
 	while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
 					      flexbg_size)) {
-		if (jiffies - last_update_time > HZ * 10) {
+		if (time_is_before_jiffies(last_update_time + HZ * 10)) {
 			if (last_update_time)
 				ext4_msg(sb, KERN_INFO,
 					 "resized to %llu blocks",
...
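The last hunk above swaps an open-coded jiffies comparison for time_is_before_jiffies(), which boils down to time_after(jiffies, timestamp). The sketch below is a simplified userspace model, not the real include/linux/jiffies.h macros; model_time_after() and the sample values are illustrative only. It shows why the signed-difference comparison the helpers use stays correct when the tick counter wraps, whereas a direct unsigned comparison against a computed deadline does not.

#include <stdio.h>
#include <limits.h>

typedef unsigned long jif_t;    /* stand-in for the kernel's jiffies counter */

/* simplified model of time_after(a, b): "a is later than b", wrap-safe */
static int model_time_after(jif_t a, jif_t b)
{
        return (long)(b - a) < 0;
}

int main(void)
{
        jif_t last = ULONG_MAX - 49;    /* timestamp taken 50 ticks before wrap */
        jif_t now = 60;                 /* counter wrapped; ~110 ticks elapsed */
        jif_t deadline = last + 10;     /* 10-tick timeout, long expired by now */

        /* direct unsigned comparison is fooled once the counter wraps ... */
        printf("now > deadline           : %d\n", now > deadline);
        /* ... the signed-difference form used by the helpers is not */
        printf("time_after(now, deadline): %d\n",
               model_time_after(now, deadline));
        return 0;
}

Note that the original `jiffies - last_update_time > HZ * 10` form was already wrap-safe thanks to unsigned subtraction; the helper mainly states the intent explicitly and avoids the easy-to-write broken variant shown first.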
@@ -2021,12 +2021,12 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, char *arg)
 #define EXT4_SPEC_s_commit_interval		(1 << 16)
 #define EXT4_SPEC_s_fc_debug_max_replay		(1 << 17)
 #define EXT4_SPEC_s_sb_block			(1 << 18)
+#define EXT4_SPEC_mb_optimize_scan		(1 << 19)

 struct ext4_fs_context {
 	char		*s_qf_names[EXT4_MAXQUOTAS];
 	char		*test_dummy_enc_arg;
 	int		s_jquota_fmt;	/* Format of quota to use */
-	int		mb_optimize_scan;
 #ifdef CONFIG_EXT4_DEBUG
 	int		s_fc_debug_max_replay;
 #endif
@@ -2045,8 +2045,8 @@ struct ext4_fs_context {
 	unsigned int	mask_s_mount_opt;
 	unsigned int	vals_s_mount_opt2;
 	unsigned int	mask_s_mount_opt2;
-	unsigned int	vals_s_mount_flags;
-	unsigned int	mask_s_mount_flags;
+	unsigned long	vals_s_mount_flags;
+	unsigned long	mask_s_mount_flags;
 	unsigned int	opt_flags;	/* MOPT flags */
 	unsigned int	spec;
 	u32		s_max_batch_time;
@@ -2149,23 +2149,36 @@ static inline void ctx_set_##name(struct ext4_fs_context *ctx,		\
 {									\
 	ctx->mask_s_##name |= flag;					\
 	ctx->vals_s_##name |= flag;					\
-}									\
+}

+#define EXT4_CLEAR_CTX(name)						\
 static inline void ctx_clear_##name(struct ext4_fs_context *ctx,	\
 				    unsigned long flag)			\
 {									\
 	ctx->mask_s_##name |= flag;					\
 	ctx->vals_s_##name &= ~flag;					\
-}									\
+}

+#define EXT4_TEST_CTX(name)						\
 static inline unsigned long						\
 ctx_test_##name(struct ext4_fs_context *ctx, unsigned long flag)	\
 {									\
 	return (ctx->vals_s_##name & flag);				\
-}									\
+}

-EXT4_SET_CTX(flags);
+EXT4_SET_CTX(flags); /* set only */
 EXT4_SET_CTX(mount_opt);
+EXT4_CLEAR_CTX(mount_opt);
+EXT4_TEST_CTX(mount_opt);
 EXT4_SET_CTX(mount_opt2);
-EXT4_SET_CTX(mount_flags);
+EXT4_CLEAR_CTX(mount_opt2);
+EXT4_TEST_CTX(mount_opt2);
+
+static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit)
+{
+	set_bit(bit, &ctx->mask_s_mount_flags);
+	set_bit(bit, &ctx->vals_s_mount_flags);
+}

 static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
@@ -2235,7 +2248,7 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 			 param->key);
 		return 0;
 	case Opt_abort:
-		ctx_set_mount_flags(ctx, EXT4_MF_FS_ABORTED);
+		ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED);
 		return 0;
 	case Opt_i_version:
 		ext4_msg(NULL, KERN_WARNING, deprecated_msg, param->key, "5.20");
@@ -2451,12 +2464,17 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param)
 			ctx_clear_mount_opt(ctx, m->mount_opt);
 		return 0;
 	case Opt_mb_optimize_scan:
-		if (result.int_32 != 0 && result.int_32 != 1) {
+		if (result.int_32 == 1) {
+			ctx_set_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+		} else if (result.int_32 == 0) {
+			ctx_clear_mount_opt2(ctx, EXT4_MOUNT2_MB_OPTIMIZE_SCAN);
+			ctx->spec |= EXT4_SPEC_mb_optimize_scan;
+		} else {
 			ext4_msg(NULL, KERN_WARNING,
 				 "mb_optimize_scan should be set to 0 or 1.");
 			return -EINVAL;
 		}
-		ctx->mb_optimize_scan = result.int_32;
 		return 0;
 	}
@@ -3468,8 +3486,9 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
  */
 static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 {
-	unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS;
+	loff_t upper_limit, res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
+	unsigned int ppb = 1 << (bits - 2);

 	/*
 	 * This is calculated to be the largest file size for a dense, block
@@ -3501,27 +3520,42 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 	}

+	/* Compute how many blocks we can address by block tree */
+	res += ppb;
+	res += ppb * ppb;
+	res += ((loff_t)ppb) * ppb * ppb;
+	/* Compute how many metadata blocks are needed */
+	meta_blocks = 1;
+	meta_blocks += 1 + ppb;
+	meta_blocks += 1 + ppb + ppb * ppb;
+	/* Does block tree limit file size? */
+	if (res + meta_blocks <= upper_limit)
+		goto check_lfs;
+
+	res = upper_limit;
+	/* How many metadata blocks are needed for addressing upper_limit? */
+	upper_limit -= EXT4_NDIR_BLOCKS;
 	/* indirect blocks */
 	meta_blocks = 1;
+	upper_limit -= ppb;
 	/* double indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2));
-	/* tripple indirect blocks */
-	meta_blocks += 1 + (1LL << (bits-2)) + (1LL << (2*(bits-2)));
-
-	upper_limit -= meta_blocks;
-	upper_limit <<= bits;
-
-	res += 1LL << (bits-2);
-	res += 1LL << (2*(bits-2));
-	res += 1LL << (3*(bits-2));
+	if (upper_limit < ppb * ppb) {
+		meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb);
+		res -= meta_blocks;
+		goto check_lfs;
+	}
+	meta_blocks += 1 + ppb;
+	upper_limit -= ppb * ppb;
+	/* tripple indirect blocks for the rest */
+	meta_blocks += 1 + DIV_ROUND_UP_ULL(upper_limit, ppb) +
+		       DIV_ROUND_UP_ULL(upper_limit, ppb*ppb);
+	res -= meta_blocks;
+check_lfs:
 	res <<= bits;
-	if (res > upper_limit)
-		res = upper_limit;
-
 	if (res > MAX_LFS_FILESIZE)
 		res = MAX_LFS_FILESIZE;

-	return (loff_t)res;
+	return res;
 }

 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
@@ -4369,7 +4403,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)

 	/* Set defaults for the variables that will be set during parsing */
 	ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
-	ctx->mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN;

 	sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
 	sbi->s_sectors_written_start =
@@ -5320,12 +5353,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 	 * turned off by passing "mb_optimize_scan=0". This can also be
 	 * turned on forcefully by passing "mb_optimize_scan=1".
 	 */
-	if (ctx->mb_optimize_scan == 1)
-		set_opt2(sb, MB_OPTIMIZE_SCAN);
-	else if (ctx->mb_optimize_scan == 0)
-		clear_opt2(sb, MB_OPTIMIZE_SCAN);
-	else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
-		set_opt2(sb, MB_OPTIMIZE_SCAN);
+	if (!(ctx->spec & EXT4_SPEC_mb_optimize_scan)) {
+		if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD)
+			set_opt2(sb, MB_OPTIMIZE_SCAN);
+		else
+			clear_opt2(sb, MB_OPTIMIZE_SCAN);
+	}

 	err = ext4_mb_init(sb);
 	if (err) {
...
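For a feel of the numbers behind the reworked ext4_max_bitmap_size() hunk above: with 4 KiB blocks there are 1 << (12 - 2) = 1024 block pointers per block, so the 12 direct pointers plus the single/double/triple indirect trees address roughly 4 TiB of data. The program below only redoes that arithmetic for illustration; it is not the kernel function, and it omits the clamping against the i_blocks-derived upper_limit and MAX_LFS_FILESIZE that the real code performs.

/* Illustration of the block-tree arithmetic for one block size. */
#include <stdio.h>

#define EXT4_NDIR_BLOCKS 12     /* direct block pointers in the inode */

int main(void)
{
        int bits = 12;                                  /* 4 KiB blocks */
        unsigned long long ppb = 1ULL << (bits - 2);    /* pointers per block */

        unsigned long long data = EXT4_NDIR_BLOCKS + ppb + ppb * ppb +
                                  ppb * ppb * ppb;
        unsigned long long meta = 1 +                   /* single indirect   */
                                  (1 + ppb) +           /* double indirect   */
                                  (1 + ppb + ppb * ppb);/* triple indirect   */

        printf("pointers per block : %llu\n", ppb);
        printf("addressable blocks : %llu (~%.2f TiB of data)\n",
               data, (double)(data << bits) / (1ULL << 40));
        printf("metadata blocks    : %llu\n", meta);
        return 0;
}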
@@ -107,7 +107,6 @@ static void jbd2_get_transaction(journal_t *journal,
 	transaction->t_start_time = ktime_get();
 	transaction->t_tid = journal->j_transaction_sequence++;
 	transaction->t_expires = jiffies + journal->j_commit_interval;
-	spin_lock_init(&transaction->t_handle_lock);
 	atomic_set(&transaction->t_updates, 0);
 	atomic_set(&transaction->t_outstanding_credits,
 		   jbd2_descriptor_blocks_per_trans(journal) +
@@ -139,26 +138,22 @@ static void jbd2_get_transaction(journal_t *journal,
 /*
  * Update transaction's maximum wait time, if debugging is enabled.
  *
- * In order for t_max_wait to be reliable, it must be protected by a
- * lock.  But doing so will mean that start_this_handle() can not be
- * run in parallel on SMP systems, which limits our scalability.  So
- * unless debugging is enabled, we no longer update t_max_wait, which
- * means that maximum wait time reported by the jbd2_run_stats
- * tracepoint will always be zero.
+ * t_max_wait is carefully updated here with use of atomic compare exchange.
+ * Note that there could be multiple threads trying to do this simultaneously
+ * hence using cmpxchg to avoid any use of locks in this case.
+ * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug.
  */
 static inline void update_t_max_wait(transaction_t *transaction,
 						unsigned long ts)
 {
-#ifdef CONFIG_JBD2_DEBUG
-	if (jbd2_journal_enable_debug &&
-	    time_after(transaction->t_start, ts)) {
-		ts = jbd2_time_diff(ts, transaction->t_start);
-		spin_lock(&transaction->t_handle_lock);
-		if (ts > transaction->t_max_wait)
-			transaction->t_max_wait = ts;
-		spin_unlock(&transaction->t_handle_lock);
+	unsigned long oldts, newts;
+
+	if (time_after(transaction->t_start, ts)) {
+		newts = jbd2_time_diff(ts, transaction->t_start);
+		oldts = READ_ONCE(transaction->t_max_wait);
+		while (oldts < newts)
+			oldts = cmpxchg(&transaction->t_max_wait, oldts, newts);
 	}
-#endif
 }

 /*
@@ -690,7 +685,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 			DIV_ROUND_UP(
 			handle->h_revoke_credits_requested,
 			journal->j_revoke_records_per_block);
-	spin_lock(&transaction->t_handle_lock);
 	wanted = atomic_add_return(nblocks,
 				   &transaction->t_outstanding_credits);

@@ -698,7 +692,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 		jbd_debug(3, "denied handle %p %d blocks: "
 			  "transaction too large\n", handle, nblocks);
 		atomic_sub(nblocks, &transaction->t_outstanding_credits);
-		goto unlock;
+		goto error_out;
 	}

 	trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev,
@@ -714,8 +708,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks, int revoke_records)
 	result = 0;

 	jbd_debug(3, "extended handle %p by %d\n", handle, nblocks);
-unlock:
-	spin_unlock(&transaction->t_handle_lock);
 error_out:
 	read_unlock(&journal->j_state_lock);
 	return result;
@@ -842,27 +834,35 @@ EXPORT_SYMBOL(jbd2_journal_restart);
  */
 void jbd2_journal_wait_updates(journal_t *journal)
 {
-	transaction_t *commit_transaction = journal->j_running_transaction;
+	DEFINE_WAIT(wait);

-	if (!commit_transaction)
-		return;
+	while (1) {
+		/*
+		 * Note that the running transaction can get freed under us if
+		 * this transaction is getting committed in
+		 * jbd2_journal_commit_transaction() ->
+		 * jbd2_journal_free_transaction(). This can only happen when we
+		 * release j_state_lock -> schedule() -> acquire j_state_lock.
+		 * Hence we should everytime retrieve new j_running_transaction
+		 * value (after j_state_lock release acquire cycle), else it may
+		 * lead to use-after-free of old freed transaction.
+		 */
+		transaction_t *transaction = journal->j_running_transaction;

-	spin_lock(&commit_transaction->t_handle_lock);
-	while (atomic_read(&commit_transaction->t_updates)) {
-		DEFINE_WAIT(wait);
+		if (!transaction)
+			break;

 		prepare_to_wait(&journal->j_wait_updates, &wait,
 				TASK_UNINTERRUPTIBLE);
-		if (atomic_read(&commit_transaction->t_updates)) {
-			spin_unlock(&commit_transaction->t_handle_lock);
-			write_unlock(&journal->j_state_lock);
-			schedule();
-			write_lock(&journal->j_state_lock);
-			spin_lock(&commit_transaction->t_handle_lock);
+		if (!atomic_read(&transaction->t_updates)) {
+			finish_wait(&journal->j_wait_updates, &wait);
+			break;
 		}
+		write_unlock(&journal->j_state_lock);
+		schedule();
 		finish_wait(&journal->j_wait_updates, &wait);
+		write_lock(&journal->j_state_lock);
 	}
-	spin_unlock(&commit_transaction->t_handle_lock);
 }

 /**
@@ -877,8 +877,6 @@ void jbd2_journal_wait_updates(journal_t *journal)
  */
 void jbd2_journal_lock_updates(journal_t *journal)
 {
-	DEFINE_WAIT(wait);
-
 	jbd2_might_wait_for_commit(journal);

 	write_lock(&journal->j_state_lock);
...
@@ -554,9 +554,6 @@ struct transaction_chp_stats_s {
  *    ->j_list_lock
  *
  * j_state_lock
- *    ->t_handle_lock
- *
- * j_state_lock
  *    ->j_list_lock		(journal_unmap_buffer)
  *
  */
...
This diff is collapsed.