Commit 02f310fc authored by Jan Kara, committed by Theodore Ts'o

ext4: Speedup ext4 orphan inode handling

Ext4 orphan inode handling is a bottleneck for workloads which heavily
truncate / unlink small files since it contends on the global
s_orphan_lock mutex (and generally it's difficult to improve scalability
of the on-disk linked list of orphaned inodes).

This patch implements a new way of handling orphan inodes. Instead of
linking an orphaned inode into a linked list, we store its inode number in
a new special file which we call "orphan file". Only if there's no more
space in the orphan file (too many inodes are currently orphaned) we
fall back to using old style linked list. Currently we protect
operations in the orphan file with a spinlock for simplicity but even in
this setting we can substantially reduce the length of the critical
section and thus speedup some workloads. In the next patch we improve
this by making orphan handling lockless.

Note that the change is backwards compatible when the filesystem is
clean - the existence of the orphan file is a compat feature, and we set
another ro-compat feature indicating that the orphan file needs scanning
for orphaned inodes when mounting the filesystem read-write. This
ro-compat feature gets cleared on unmount / remount read-only.

Some performance data from 80 CPU Xeon Server with 512 GB of RAM,
filesystem located on SSD, average of 5 runs:

stress-orphan (microbenchmark truncating files byte-by-byte from N
processes in parallel)

Threads Time            Time
        Vanilla         Patched
  1       1.057200        0.945600
  2       1.680400        1.331800
  4       2.547000        1.995000
  8       7.049400        6.424200
 16      14.827800       14.937600
 32      40.948200       33.038200
 64      87.787400       60.823600
128     206.504000      122.941400

So we can see significant wins all over the board.
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210816095713.16537-3-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent 25c6d98f
......@@ -1034,7 +1034,14 @@ struct ext4_inode_info {
*/
struct rw_semaphore xattr_sem;
struct list_head i_orphan; /* unlinked but open inodes */
/*
* Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
* i_orphan is used.
*/
union {
struct list_head i_orphan; /* unlinked but open inodes */
unsigned int i_orphan_idx; /* Index in orphan file */
};
/* Fast commit related info */
......@@ -1428,7 +1435,8 @@ struct ext4_super_block {
__u8 s_last_error_errcode;
__le16 s_encoding; /* Filename charset encoding */
__le16 s_encoding_flags; /* Filename charset encoding flags */
__le32 s_reserved[95]; /* Padding to the end of the block */
__le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */
__le32 s_reserved[94]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
......@@ -1449,6 +1457,7 @@ struct ext4_super_block {
/*
 * Types of ext4 journal triggers.
 *
 * EXT4_JTR_NONE doubles as the number of real trigger types, so new types
 * must be added before it.
 */
enum ext4_journal_trigger_type {
/* Orphan file blocks; ext4_fill_super() binds ext4_orphan_file_block_trigger() to it */
EXT4_JTR_ORPHAN_FILE,
EXT4_JTR_NONE /* This must be the last entry for indexing to work! */
};
......@@ -1465,6 +1474,36 @@ static inline struct ext4_journal_trigger *EXT4_TRIGGER(
return container_of(trigger, struct ext4_journal_trigger, tr_triggers);
}
/*
 * Magic number for orphan file blocks; presumably the value expected in
 * ob_magic of the block tail - TODO confirm against the orphan file code.
 */
#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04
/*
 * Structure at the tail of orphan block. The space in front of it is what
 * ext4_inodes_per_orphan_block() divides into u32-sized entries.
 */
struct ext4_orphan_block_tail {
__le32 ob_magic; /* Block magic (little-endian) */
__le32 ob_checksum; /* Block checksum (little-endian) */
};
/*
 * Number of orphan entries that fit into one block of the orphan file.
 *
 * Every block ends with a struct ext4_orphan_block_tail; whatever is left of
 * sb->s_blocksize in front of the tail is split into u32-sized slots.
 */
static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
{
	const unsigned long tail_bytes = sizeof(struct ext4_orphan_block_tail);
	const unsigned long entry_bytes = sizeof(u32);

	return (sb->s_blocksize - tail_bytes) / entry_bytes;
}
/*
 * Per-block bookkeeping for the orphan file; struct ext4_orphan_info keeps
 * an array of these in of_binfo.
 */
struct ext4_orphan_block {
int ob_free_entries; /* Number of free orphan entries in block */
struct buffer_head *ob_bh; /* Buffer for orphan block */
};
/*
 * Info about orphan file. Embedded in struct ext4_sb_info as
 * s_orphan_info.
 */
struct ext4_orphan_info {
spinlock_t of_lock; /* Protects operations in the orphan file */
int of_blocks; /* Number of orphan blocks in a file */
__u32 of_csum_seed; /* Checksum seed for orphan file */
struct ext4_orphan_block *of_binfo; /* Array with info about orphan
* file blocks */
};
/*
* fourth extended-fs super-block data in memory
*/
......@@ -1519,9 +1558,11 @@ struct ext4_sb_info {
/* Journaling */
struct journal_s *s_journal;
struct list_head s_orphan;
struct mutex s_orphan_lock;
unsigned long s_ext4_flags; /* Ext4 superblock flags */
struct mutex s_orphan_lock; /* Protects on disk list changes */
struct list_head s_orphan; /* List of orphaned inodes in on disk
list */
struct ext4_orphan_info s_orphan_info;
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
......@@ -1859,6 +1900,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
#define EXT4_INODE_BIT_FNS(name, field, offset) \
......@@ -1960,6 +2002,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
*/
#define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800
#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002
......@@ -1980,6 +2023,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000
#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be
non-empty */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
......@@ -2063,6 +2108,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES)
EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
......@@ -2077,6 +2123,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY)
EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT)
EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
......@@ -2110,7 +2157,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \
EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
......@@ -2135,7 +2183,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA |\
EXT4_FEATURE_RO_COMPAT_PROJECT |\
EXT4_FEATURE_RO_COMPAT_VERITY)
EXT4_FEATURE_RO_COMPAT_VERITY |\
EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
......@@ -2185,7 +2234,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi)
return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
}
/*
* Default values for user and/or group using reserved blocks
*/
......@@ -3768,6 +3816,13 @@ extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern void ext4_orphan_cleanup(struct super_block *sb,
struct ext4_super_block *es);
/*
 * Release the orphan file info; called from ext4_put_super() and from the
 * ext4_fill_super() error path once ext4_init_orphan_info() has succeeded.
 */
extern void ext4_release_orphan_info(struct super_block *sb);
/*
 * Set up the orphan file info during mount; ext4_fill_super() treats a
 * non-zero return as a mount failure.
 */
extern int ext4_init_orphan_info(struct super_block *sb);
/*
 * Callers use the result as a boolean: non-zero means the orphan file holds
 * no entries (presumably also when there is no orphan file - TODO confirm).
 */
extern int ext4_orphan_file_empty(struct super_block *sb);
/*
 * Journal trigger callback that ext4_fill_super() registers for
 * EXT4_JTR_ORPHAN_FILE through ext4_setup_csum_trigger().
 */
extern void ext4_orphan_file_block_trigger(
struct jbd2_buffer_trigger_type *triggers,
struct buffer_head *bh,
void *data, size_t size);
/*
* Add new method to test whether block and inode bitmaps are properly
......
......@@ -4624,7 +4624,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
ino == le32_to_cpu(es->s_usr_quota_inum) ||
ino == le32_to_cpu(es->s_grp_quota_inum) ||
ino == le32_to_cpu(es->s_prj_quota_inum))) ||
ino == le32_to_cpu(es->s_prj_quota_inum) ||
ino == le32_to_cpu(es->s_orphan_file_inum))) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
......
This diff is collapsed.
......@@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb)
flush_work(&sbi->s_error_work);
destroy_workqueue(sbi->rsv_conversion_wq);
ext4_release_orphan_info(sb);
/*
* Unregister sysfs before destroying jbd2 journal.
......@@ -1199,6 +1200,7 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb) && !aborted) {
ext4_clear_feature_journal_needs_recovery(sb);
ext4_clear_feature_orphan_present(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!sb_rdonly(sb))
......@@ -2684,8 +2686,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
ext4_update_tstamp(es, s_mtime);
if (sbi->s_journal)
if (sbi->s_journal) {
ext4_set_feature_journal_needs_recovery(sb);
if (ext4_has_feature_orphan_file(sb))
ext4_set_feature_orphan_present(sb);
}
err = ext4_commit_super(sb);
done:
......@@ -3960,6 +3965,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
silent = 1;
goto cantfind_ext4;
}
ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE,
ext4_orphan_file_block_trigger);
/* Load the checksum driver */
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
......@@ -4624,6 +4631,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
ext4_has_feature_orphan_present(sb) ||
ext4_has_feature_journal_needs_recovery(sb));
if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb))
......@@ -4922,12 +4930,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (err)
goto failed_mount7;
err = ext4_init_orphan_info(sb);
if (err)
goto failed_mount8;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) {
err = ext4_enable_quotas(sb);
if (err)
goto failed_mount8;
goto failed_mount9;
}
#endif /* CONFIG_QUOTA */
......@@ -4946,7 +4957,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
goto failed_mount8;
goto failed_mount9;
}
if (EXT4_SB(sb)->s_journal) {
if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
......@@ -4992,6 +5003,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
goto failed_mount;
failed_mount9:
ext4_release_orphan_info(sb);
failed_mount8:
ext4_unregister_sysfs(sb);
kobject_put(&sbi->s_kobj);
......@@ -5502,8 +5515,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
if (err < 0)
goto out;
if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) {
if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) ||
ext4_has_feature_orphan_present(sb))) {
if (!ext4_orphan_file_empty(sb)) {
ext4_error(sb, "Orphan file not empty on read-only fs.");
err = -EFSCORRUPTED;
goto out;
}
ext4_clear_feature_journal_needs_recovery(sb);
ext4_clear_feature_orphan_present(sb);
ext4_commit_super(sb);
}
out:
......@@ -5646,6 +5666,8 @@ static int ext4_freeze(struct super_block *sb)
/* Journal blocked and flushed, clear needs_recovery flag. */
ext4_clear_feature_journal_needs_recovery(sb);
if (ext4_orphan_file_empty(sb))
ext4_clear_feature_orphan_present(sb);
}
error = ext4_commit_super(sb);
......@@ -5668,6 +5690,8 @@ static int ext4_unfreeze(struct super_block *sb)
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
ext4_set_feature_journal_needs_recovery(sb);
if (ext4_has_feature_orphan_file(sb))
ext4_set_feature_orphan_present(sb);
}
ext4_commit_super(sb);
......@@ -5871,7 +5895,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
* around from a previously readonly bdev mount,
* require a full umount/remount for now.
*/
if (es->s_last_orphan) {
if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) {
ext4_msg(sb, KERN_WARNING, "Couldn't "
"remount RDWR because of unprocessed "
"orphan inode list. Please "
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment