Commit 555a6e8c authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
 "Various bug fixes and cleanups for ext4; no new features this cycle"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (29 commits)
  ext4: remove unnecessary wbc parameter from ext4_bio_write_page
  ext4: avoid s_mb_prefetch to be zero in individual scenarios
  ext4: defer saving error info from atomic context
  ext4: simplify ext4 error translation
  ext4: move functions in super.c
  ext4: make ext4_abort() use __ext4_error()
  ext4: standardize error message in ext4_protect_reserved_inode()
  ext4: remove redundant sb checksum recomputation
  ext4: don't remount read-only with errors=continue on reboot
  ext4: fix deadlock with fs freezing and EA inodes
  jbd2: add a helper to find out number of fast commit blocks
  ext4: make fast_commit.h byte identical with e2fsprogs/fast_commit.h
  ext4: fix fall-through warnings for Clang
  ext4: add docs about fast commit idempotence
  ext4: remove the unused EXT4_CURRENT_REV macro
  ext4: fix an IS_ERR() vs NULL check
  ext4: check for invalid block size early when mounting a file system
  ext4: fix a memory leak of ext4_free_data
  ext4: delete nonsensical (commented-out) code inside ext4_xattr_block_set()
  ext4: update ext4_data_block_valid related comments
  ...
parents 2f2fce3d be993933
......@@ -681,3 +681,53 @@ Here is the list of supported tags and their meanings:
- Stores the TID of the commit, CRC of the fast commit of which this tag
represents the end of
Fast Commit Replay Idempotence
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Fast commit tags are idempotent in nature provided the recovery code follows
certain rules. The guiding principle that the commit path follows while
committing is that it stores the result of a particular operation instead of
storing the procedure.
Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
was associated with inode 10. During fast commit, instead of storing this
operation as a procedure "rename a to b", we store the resulting file system
state as a "series" of outcomes:
- Link dirent b to inode 10
- Unlink dirent a
- Inode 10 with valid refcount
Now when recovery code runs, it needs to "enforce" this state on the file
system. This is what guarantees idempotence of fast commit replay.
Let's take an example of a procedure that is not idempotent and see how fast
commits make it idempotent. Consider the following sequence of operations:
1) rm A
2) mv B A
3) read A
If we store this sequence of operations as is then the replay is not idempotent.
Let's say while in replay, we crash after (2). During the second replay,
file A (which was actually created as a result of "mv B A" operation) would get
deleted. Thus, file named A would be absent when we try to read A. So, this
sequence of operations is not idempotent. However, as mentioned above, instead
of storing the procedure, fast commits store the outcome of each procedure. Thus
the fast commit log for the above procedure would be as follows:
(Let's assume dirent A was linked to inode 10 and dirent B was linked to
inode 11 before the replay)
1) Unlink A
2) Link A to inode 11
3) Unlink B
4) Inode 11
If we crash after (3) we will have file A linked to inode 11. During the second
replay, we will remove file A (inode 11). But we will create it back and make
it point to inode 11. We won't find B, so we'll just skip that step. At this
point, the refcount for inode 11 is not reliable, but that gets fixed by the
replay of last inode 11 tag. Thus, by converting a non-idempotent procedure
into a series of idempotent outcomes, fast commits ensured idempotence during
the replay.
......@@ -185,7 +185,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t start, tmp;
J_ASSERT_BH(bh, buffer_locked(bh));
ASSERT(buffer_locked(bh));
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
......
......@@ -176,12 +176,10 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
err = add_system_zone(system_blks, map.m_pblk, n, ino);
if (err < 0) {
if (err == -EFSCORRUPTED) {
__ext4_error(sb, __func__, __LINE__,
-err, map.m_pblk,
"blocks %llu-%llu from inode %u overlap system zone",
map.m_pblk,
map.m_pblk + map.m_len - 1,
ino);
EXT4_ERROR_INODE_ERR(inode, -err,
"blocks %llu-%llu from inode overlap system zone",
map.m_pblk,
map.m_pblk + map.m_len - 1);
}
break;
}
......@@ -206,7 +204,7 @@ static void ext4_destroy_system_zone(struct rcu_head *rcu)
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
* racing with ext4_data_block_valid() calls reading system_blks rbtree
* racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. That's why we first build the rbtree and then
* swap it in place.
*/
......@@ -258,7 +256,7 @@ int ext4_setup_system_zone(struct super_block *sb)
/*
* System blks rbtree complete, announce it once to prevent racing
* with ext4_data_block_valid() accessing the rbtree at the same
* with ext4_inode_block_valid() accessing the rbtree at the same
* time.
*/
rcu_assign_pointer(sbi->s_system_blks, system_blks);
......@@ -278,7 +276,7 @@ int ext4_setup_system_zone(struct super_block *sb)
*
* The update of system_blks pointer in this function is protected by
* sb->s_umount semaphore. However we have to be careful as we can be
* racing with ext4_data_block_valid() calls reading system_blks rbtree
* racing with ext4_inode_block_valid() calls reading system_blks rbtree
* protected only by RCU. So we first clear the system_blks pointer and
* then free the rbtree only after RCU grace period expires.
*/
......
......@@ -98,6 +98,16 @@
#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif
/*
 * ASSERT() - fatal run-time assertion.
 *
 * Evaluates @cond exactly once.  When it is false, logs the enclosing
 * function, file, line and the source text of the condition at KERN_EMERG
 * level and then calls BUG().  Wrapped in do { } while (0) so it behaves as a
 * single statement, e.g. in an unbraced if/else.
 */
#define ASSERT(cond)							\
do {									\
	if (unlikely(!(cond))) {					\
		printk(KERN_EMERG					\
		       "Assertion failure in %s() at %s:%d: '%s'\n",	\
		       __func__, __FILE__, __LINE__, #cond);		\
		BUG();							\
	}								\
} while (0)
/* data type for block offset of block group */
typedef int ext4_grpblk_t;
......@@ -1619,6 +1629,27 @@ struct ext4_sb_info {
errseq_t s_bdev_wb_err;
spinlock_t s_bdev_wb_lock;
/* Information about errors that happened during this mount */
spinlock_t s_error_lock;
int s_add_error_count;
int s_first_error_code;
__u32 s_first_error_line;
__u32 s_first_error_ino;
__u64 s_first_error_block;
const char *s_first_error_func;
time64_t s_first_error_time;
int s_last_error_code;
__u32 s_last_error_line;
__u32 s_last_error_ino;
__u64 s_last_error_block;
const char *s_last_error_func;
time64_t s_last_error_time;
/*
* If we are in a context where we cannot update error information in
* the on-disk superblock, we queue this work to do it.
*/
struct work_struct s_error_work;
/* Ext4 fast commit stuff */
atomic_t s_fc_subtid;
atomic_t s_fc_ineligible_updates;
......@@ -1858,7 +1889,6 @@ static inline bool ext4_verity_in_progress(struct inode *inode)
#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */
#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */
#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV
#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV
#define EXT4_GOOD_OLD_INODE_SIZE 128
......@@ -2952,9 +2982,9 @@ extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
ext4_group_t block_group,
unsigned int flags);
extern __printf(6, 7)
void __ext4_error(struct super_block *, const char *, unsigned int, int, __u64,
const char *, ...);
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
ext4_fsblk_t, int, const char *, ...);
......@@ -2963,9 +2993,6 @@ void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
const char *, ...);
extern void __ext4_std_error(struct super_block *, const char *,
unsigned int, int);
extern __printf(5, 6)
void __ext4_abort(struct super_block *, const char *, unsigned int, int,
const char *, ...);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
const char *, ...);
......@@ -2995,6 +3022,9 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define EXT4_ERROR_FILE(file, block, fmt, a...) \
ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)
/*
 * ext4_abort() - report an error through __ext4_error(), tagging it with the
 * calling function name and line number.  Always passes true for
 * __ext4_error()'s bool argument and 0 for its __u64 argument.
 * NOTE(review): the bool presumably requests a forced abort of the file
 * system -- confirm against the __ext4_error() definition.
 */
#define ext4_abort(sb, err, fmt, a...) \
__ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)
#ifdef CONFIG_PRINTK
#define ext4_error_inode(inode, func, line, block, fmt, ...) \
......@@ -3005,11 +3035,11 @@ void __ext4_grp_locked_error(const char *, unsigned int,
#define ext4_error_file(file, func, line, block, fmt, ...) \
__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...) \
__ext4_error((sb), __func__, __LINE__, 0, 0, (fmt), ##__VA_ARGS__)
__ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \
##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...) \
__ext4_error((sb), __func__, __LINE__, (err), 0, (fmt), ##__VA_ARGS__)
#define ext4_abort(sb, err, fmt, ...) \
__ext4_abort((sb), __func__, __LINE__, (err), (fmt), ##__VA_ARGS__)
__ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \
##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...) \
__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...) \
......@@ -3042,17 +3072,12 @@ do { \
#define ext4_error(sb, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
__ext4_error(sb, "", 0, 0, 0, " "); \
__ext4_error(sb, "", 0, false, 0, 0, " "); \
} while (0)
#define ext4_error_err(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
__ext4_error(sb, "", 0, err, 0, " "); \
} while (0)
#define ext4_abort(sb, err, fmt, ...) \
do { \
no_printk(fmt, ##__VA_ARGS__); \
__ext4_abort(sb, "", 0, err, " "); \
__ext4_error(sb, "", 0, false, err, 0, " "); \
} while (0)
#define ext4_warning(sb, fmt, ...) \
do { \
......@@ -3361,6 +3386,21 @@ static inline void ext4_unlock_group(struct super_block *sb,
spin_unlock(ext4_group_lock_ptr(sb, group));
}
#ifdef CONFIG_QUOTA
static inline bool ext4_quota_capable(struct super_block *sb)
{
return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
}
static inline bool ext4_is_quota_journalled(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
return (ext4_has_feature_quota(sb) ||
sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
}
#endif
/*
* Block validity checking
*/
......@@ -3609,7 +3649,6 @@ extern void ext4_io_submit(struct ext4_io_submit *io);
extern int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
struct writeback_control *wbc,
bool keep_towrite);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);
......
......@@ -296,8 +296,8 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
if (err) {
ext4_journal_abort_handle(where, line, __func__,
bh, handle, err);
__ext4_abort(inode->i_sb, where, line, -err,
"error %d when attempting revoke", err);
__ext4_error(inode->i_sb, where, line, true, -err, 0,
"error %d when attempting revoke", err);
}
BUFFER_TRACE(bh, "exit");
return err;
......
......@@ -86,17 +86,14 @@
#ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
ext4_has_feature_quota(sb)) ? 1 : 0)
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
ext4_has_feature_quota(sb)) ?\
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
ext4_has_feature_quota(sb)) ?\
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
......
......@@ -5815,8 +5815,8 @@ int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
int ret;
path = ext4_find_extent(inode, start, NULL, 0);
if (!path)
return -EINVAL;
if (IS_ERR(path))
return PTR_ERR(path);
ex = path[path->p_depth].p_ext;
if (!ex) {
ret = -EFSCORRUPTED;
......@@ -5988,7 +5988,6 @@ int ext4_ext_replay_set_iblocks(struct inode *inode)
kfree(path);
break;
}
ex = path2[path2->p_depth].p_ext;
for (i = 0; i <= max(path->p_depth, path2->p_depth); i++) {
cmp1 = cmp2 = 0;
if (i <= path->p_depth)
......
......@@ -103,8 +103,69 @@
*
* Replay code should thus check for all the valid tails in the FC area.
*
* Fast Commit Replay Idempotence
* ------------------------------
*
* Fast commit tags are idempotent in nature provided the recovery code follows
* certain rules. The guiding principle that the commit path follows while
* committing is that it stores the result of a particular operation instead of
* storing the procedure.
*
* Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a'
* was associated with inode 10. During fast commit, instead of storing this
* operation as a procedure "rename a to b", we store the resulting file system
* state as a "series" of outcomes:
*
* - Link dirent b to inode 10
* - Unlink dirent a
* - Inode <10> with valid refcount
*
* Now when recovery code runs, it needs to "enforce" this state on the file
* system. This is what guarantees idempotence of fast commit replay.
*
* Let's take an example of a procedure that is not idempotent and see how fast
* commits make it idempotent. Consider the following sequence of operations:
*
* rm A; mv B A; read A
* (x) (y) (z)
*
* (x), (y) and (z) are the points at which we can crash. If we store this
* sequence of operations as is then the replay is not idempotent. Let's say
* while in replay, we crash at (z). During the second replay, file A (which was
* actually created as a result of "mv B A" operation) would get deleted. Thus,
* file named A would be absent when we try to read A. So, this sequence of
* operations is not idempotent. However, as mentioned above, instead of storing
* the procedure, fast commits store the outcome of each procedure. Thus the
* fast commit log for the above procedure would be as follows:
*
* (Let's assume dirent A was linked to inode 10 and dirent B was linked to
* inode 11 before the replay)
*
* [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11]
* (w) (x) (y) (z)
*
* If we crash at (z), we will have file A linked to inode 11. During the second
* replay, we will remove file A (inode 11). But we will create it back and make
* it point to inode 11. We won't find B, so we'll just skip that step. At this
* point, the refcount for inode 11 is not reliable, but that gets fixed by the
* replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
* TODOs
* -----
*
* 0) Fast commit replay path hardening: Fast commit replay code should use
* journal handles to make sure all the updates it does during the replay
* path are atomic. With that if we crash during fast commit replay, after
* trying to do recovery again, we will find a file system where fast commit
* area is invalid (because new full commit would be found). In order to deal
* with that, fast commit replay code should ensure that the "FC_REPLAY"
* superblock state is persisted before starting the replay, so that after
* the crash, fast commit recovery code can look at that flag and perform
* fast commit recovery even if that area is invalidated by later full
* commits.
*
* 1) Make fast commit atomic updates more fine grained. Today, a fast commit
* eligible update must be protected within ext4_fc_start_update() and
* ext4_fc_stop_update(). These routines are called at much higher
......@@ -1220,18 +1281,6 @@ static void ext4_fc_cleanup(journal_t *journal, int full)
/* Ext4 Replay Path Routines */
/* Return the value length of a tlv, converted from little-endian to CPU order */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	int len = le16_to_cpu(tl->fc_len);

	return len;
}
/* Get a pointer to "value" of a tlv */
static inline u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
return (u8 *)tl + sizeof(*tl);
}
/* Helper struct for dentry replay routines */
struct dentry_info_args {
int parent_ino, dname_len, ino, inode_len;
......@@ -1770,32 +1819,6 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl)
return 0;
}
/*
 * Map a fast commit tag value to a printable name.  Any tag that is not
 * listed below yields "TAG_ERROR".
 */
static inline const char *tag2str(u16 tag)
{
	const char *name = "TAG_ERROR";

	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		name = "TAG_ADD_RANGE";
		break;
	case EXT4_FC_TAG_DEL_RANGE:
		name = "TAG_DEL_RANGE";
		break;
	case EXT4_FC_TAG_CREAT:
		name = "TAG_CREAT_DENTRY";
		break;
	case EXT4_FC_TAG_LINK:
		name = "TAG_ADD_ENTRY";
		break;
	case EXT4_FC_TAG_UNLINK:
		name = "TAG_DEL_ENTRY";
		break;
	case EXT4_FC_TAG_INODE:
		name = "TAG_INODE";
		break;
	case EXT4_FC_TAG_PAD:
		name = "TAG_PAD";
		break;
	case EXT4_FC_TAG_TAIL:
		name = "TAG_TAIL";
		break;
	case EXT4_FC_TAG_HEAD:
		name = "TAG_HEAD";
		break;
	default:
		break;
	}

	return name;
}
static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb)
{
struct ext4_fc_replay_state *state;
......
......@@ -3,6 +3,11 @@
#ifndef __FAST_COMMIT_H__
#define __FAST_COMMIT_H__
/*
* Note this file is present in e2fsprogs/lib/ext2fs/fast_commit.h and
* linux/fs/ext4/fast_commit.h. These files should always be byte identical.
*/
/* Fast commit tags */
#define EXT4_FC_TAG_ADD_RANGE 0x0001
#define EXT4_FC_TAG_DEL_RANGE 0x0002
......@@ -50,7 +55,7 @@ struct ext4_fc_del_range {
struct ext4_fc_dentry_info {
__le32 fc_parent_ino;
__le32 fc_ino;
u8 fc_dname[0];
__u8 fc_dname[0];
};
/* Value structure for EXT4_FC_TAG_INODE and EXT4_FC_TAG_INODE_PARTIAL. */
......@@ -65,19 +70,6 @@ struct ext4_fc_tail {
__le32 fc_crc;
};
/*
 * In-memory record of one dentry update (create / unlink / link) performed
 * on the file system; used by the fast commit code.  Records carry a
 * struct list_head (fcd_list) so they can be kept on a list.
 * NOTE(review): fcd_iname presumably provides inline backing storage for
 * short names referenced by fcd_name -- confirm against the code that fills
 * this structure.
 */
struct ext4_fc_dentry_update {
int fcd_op; /* Type of update create / unlink / link */
int fcd_parent; /* Parent inode number */
int fcd_ino; /* Inode number */
struct qstr fcd_name; /* Dirent name */
unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
struct list_head fcd_list;
};
/*
* Fast commit reason codes
*/
......@@ -107,6 +99,20 @@ enum {
EXT4_FC_REASON_MAX
};
#ifdef __KERNEL__
/*
 * In-memory record of one dentry update (create / unlink / link) performed
 * on the file system; used by the fast commit code.  Records carry a
 * struct list_head (fcd_list) so they can be kept on a list.
 * NOTE(review): fcd_iname presumably provides inline backing storage for
 * short names referenced by fcd_name -- confirm against the code that fills
 * this structure.
 */
struct ext4_fc_dentry_update {
int fcd_op; /* Type of update create / unlink / link */
int fcd_parent; /* Parent inode number */
int fcd_ino; /* Inode number */
struct qstr fcd_name; /* Dirent name */
unsigned char fcd_iname[DNAME_INLINE_LEN]; /* Dirent name string */
struct list_head fcd_list;
};
struct ext4_fc_stats {
unsigned int fc_ineligible_reason_count[EXT4_FC_REASON_MAX];
unsigned long fc_num_commits;
......@@ -145,13 +151,51 @@ struct ext4_fc_replay_state {
};
#define region_last(__region) (((__region)->lblk) + ((__region)->len) - 1)
#endif
/*
 * fc_for_each_tl() - iterate over the tag-length records in [__start, __end).
 * @__start: address of the first record
 * @__end:   address one past the last byte to examine
 * @__tl:    struct ext4_fc_tl * lvalue used as the cursor
 *
 * Each step advances the cursor by the fixed header size plus the record's
 * fc_len (converted with le16_to_cpu()).  The cursor is the caller-supplied
 * @__tl rather than a hard-coded variable named "tl", so the macro works
 * regardless of what the caller names its iterator; existing callers that
 * pass "tl" are unaffected.
 */
#define fc_for_each_tl(__start, __end, __tl)				\
	for ((__tl) = (struct ext4_fc_tl *)(__start);			\
	     (__u8 *)(__tl) < (__u8 *)(__end);				\
	     (__tl) = (struct ext4_fc_tl *)((__u8 *)(__tl) +		\
					sizeof(struct ext4_fc_tl) +	\
					le16_to_cpu((__tl)->fc_len)))
/*
 * Map a fast commit tag value to a printable name.  Any tag that is not
 * listed below yields "ERROR".
 */
static inline const char *tag2str(__u16 tag)
{
	const char *name = "ERROR";

	switch (tag) {
	case EXT4_FC_TAG_ADD_RANGE:
		name = "ADD_RANGE";
		break;
	case EXT4_FC_TAG_DEL_RANGE:
		name = "DEL_RANGE";
		break;
	case EXT4_FC_TAG_CREAT:
		name = "CREAT_DENTRY";
		break;
	case EXT4_FC_TAG_LINK:
		name = "ADD_ENTRY";
		break;
	case EXT4_FC_TAG_UNLINK:
		name = "DEL_ENTRY";
		break;
	case EXT4_FC_TAG_INODE:
		name = "INODE";
		break;
	case EXT4_FC_TAG_PAD:
		name = "PAD";
		break;
	case EXT4_FC_TAG_TAIL:
		name = "TAIL";
		break;
	case EXT4_FC_TAG_HEAD:
		name = "HEAD";
		break;
	default:
		break;
	}

	return name;
}
/* Return the value length of a tlv, converted from little-endian to CPU order */
static inline int ext4_fc_tag_len(struct ext4_fc_tl *tl)
{
	int len = le16_to_cpu(tl->fc_len);

	return len;
}
/* Get a pointer to "value" of a tlv */
static inline __u8 *ext4_fc_tag_val(struct ext4_fc_tl *tl)
{
return (__u8 *)tl + sizeof(*tl);
}
#endif /* __FAST_COMMIT_H__ */
......@@ -136,7 +136,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (unlikely(ext4_forced_shutdown(sbi)))
return -EIO;
J_ASSERT(ext4_journal_current_handle() == NULL);
ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file_enter(file, datasync);
......
......@@ -534,8 +534,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_fsblk_t first_block = 0;
trace_ext4_ind_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
depth = ext4_block_to_path(inode, map->m_lblk, offsets,
&blocks_to_boundary);
......
......@@ -175,6 +175,7 @@ void ext4_evict_inode(struct inode *inode)
*/
int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
bool freeze_protected = false;
trace_ext4_evict_inode(inode);
......@@ -232,9 +233,14 @@ void ext4_evict_inode(struct inode *inode)
/*
* Protect us against freezing - iput() caller didn't have to have any
* protection against it
* protection against it. When we are in a running transaction though,
* we are already protected against freezing and we cannot grab further
* protection due to lock ordering constraints.
*/
sb_start_intwrite(inode->i_sb);
if (!ext4_journal_current_handle()) {
sb_start_intwrite(inode->i_sb);
freeze_protected = true;
}
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
......@@ -253,7 +259,8 @@ void ext4_evict_inode(struct inode *inode)
* cleaned up.
*/
ext4_orphan_del(NULL, inode);
sb_end_intwrite(inode->i_sb);
if (freeze_protected)
sb_end_intwrite(inode->i_sb);
goto no_delete;
}
......@@ -294,7 +301,8 @@ void ext4_evict_inode(struct inode *inode)
stop_handle:
ext4_journal_stop(handle);
ext4_orphan_del(NULL, inode);
sb_end_intwrite(inode->i_sb);
if (freeze_protected)
sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
goto no_delete;
}
......@@ -323,7 +331,8 @@ void ext4_evict_inode(struct inode *inode)
else
ext4_free_inode(handle, inode);
ext4_journal_stop(handle);
sb_end_intwrite(inode->i_sb);
if (freeze_protected)
sb_end_intwrite(inode->i_sb);
ext4_xattr_inode_array_free(ea_inode_array);
return;
no_delete:
......@@ -830,8 +839,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
int create = map_flags & EXT4_GET_BLOCKS_CREATE;
int err;
J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| handle != NULL || create == 0);
ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| handle != NULL || create == 0);
map.m_lblk = block;
map.m_len = 1;
......@@ -846,9 +855,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
J_ASSERT(create != 0);
J_ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| (handle != NULL));
ASSERT(create != 0);
ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
|| (handle != NULL));
/*
* Now that we do not always journal data, we should
......@@ -2055,7 +2064,7 @@ static int ext4_writepage(struct page *page,
unlock_page(page);
return -ENOMEM;
}
ret = ext4_bio_write_page(&io_submit, page, len, wbc, keep_towrite);
ret = ext4_bio_write_page(&io_submit, page, len, keep_towrite);
ext4_io_submit(&io_submit);
/* Drop io_end reference we got from init */
ext4_put_io_end_defer(io_submit.io_end);
......@@ -2089,7 +2098,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
len = size & ~PAGE_MASK;
else
len = PAGE_SIZE;
err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false);
err = ext4_bio_write_page(&mpd->io_submit, page, len, false);
if (!err)
mpd->wbc->nr_to_write--;
mpd->first_page++;
......@@ -4610,7 +4619,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
(ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
return ERR_PTR(-ESTALE);
__ext4_error(sb, function, line, EFSCORRUPTED, 0,
__ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
"inode #%lu: comm %s: iget: illegal inode #",
ino, current->comm);
return ERR_PTR(-EFSCORRUPTED);
......
......@@ -822,24 +822,6 @@ void ext4_mb_generate_buddy(struct super_block *sb,
spin_unlock(&sbi->s_bal_lock);
}
/*
 * Rebuild the buddy data of @e4b from its block bitmap: set every bit in each
 * buddy map of order >= 1, zero the fragment count and the per-order counters,
 * then regenerate everything via ext4_mb_generate_buddy().
 */
static void mb_regenerate_buddy(struct ext4_buddy *e4b)
{
	void *buddy;
	int count;
	int order;

	/* Walk orders upward until mb_find_buddy() reports there are no more. */
	for (order = 1; ; order++) {
		buddy = mb_find_buddy(e4b, order, &count);
		if (!buddy)
			break;
		ext4_set_bits(buddy, 0, count);
	}

	e4b->bd_info->bb_fragments = 0;
	memset(e4b->bd_info->bb_counters, 0,
	       sizeof(*e4b->bd_info->bb_counters) *
	       (e4b->bd_sb->s_blocksize_bits + 2));

	ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
			       e4b->bd_bitmap, e4b->bd_group);
}
/* The buddy information is attached to the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
......@@ -1307,22 +1289,18 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
{
int order = 1;
int bb_incr = 1 << (e4b->bd_blkbits - 1);
int order = 1, max;
void *bb;
BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
bb = e4b->bd_buddy;
while (order <= e4b->bd_blkbits + 1) {
block = block >> 1;
if (!mb_test_bit(block, bb)) {
bb = mb_find_buddy(e4b, order, &max);
if (!mb_test_bit(block >> order, bb)) {
/* this block is part of buddy of order 'order' */
return order;
}
bb += bb_incr;
bb_incr >>= 1;
order++;
}
return 0;
......@@ -1512,7 +1490,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_regenerate_buddy(e4b);
goto done;
}
......@@ -2395,9 +2372,9 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
nr = sbi->s_mb_prefetch;
if (ext4_has_feature_flex_bg(sb)) {
nr = (group / sbi->s_mb_prefetch) *
sbi->s_mb_prefetch;
nr = nr + sbi->s_mb_prefetch - group;
nr = 1 << sbi->s_log_groups_per_flex;
nr -= group & (nr - 1);
nr = min(nr, sbi->s_mb_prefetch);
}
prefetch_grp = ext4_mb_prefetch(sb, group,
nr, &prefetch_ios);
......@@ -2733,7 +2710,8 @@ static int ext4_mb_init_backend(struct super_block *sb)
if (ext4_has_feature_flex_bg(sb)) {
/* a single flex group is supposed to be read by a single IO */
sbi->s_mb_prefetch = 1 << sbi->s_es->s_log_groups_per_flex;
sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
} else {
sbi->s_mb_prefetch = 32;
......@@ -5126,6 +5104,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
ext4_group_first_block_no(sb, group) +
EXT4_C2B(sbi, cluster),
"Block already on to-be-freed list");
kmem_cache_free(ext4_free_data_cachep, new_entry);
return 0;
}
}
......
......@@ -182,10 +182,6 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
return bh;
}
#ifndef assert
#define assert(test) J_ASSERT(test)
#endif
#ifdef DX_DEBUG
#define dxtrace(command) command
#else
......@@ -843,7 +839,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
break;
}
}
assert (at == p - 1);
ASSERT(at == p - 1);
}
at = p - 1;
......@@ -1259,8 +1255,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
struct dx_entry *old = frame->at, *new = old + 1;
int count = dx_get_count(entries);
assert(count < dx_get_limit(entries));
assert(old < entries + count);
ASSERT(count < dx_get_limit(entries));
ASSERT(old < entries + count);
memmove(new + 1, new, (char *)(entries + count) - (char *)(new));
dx_set_hash(new, hash);
dx_set_block(new, block);
......@@ -2959,7 +2955,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* hold i_mutex, or the inode can not be referenced from outside,
* so i_nlink should not be bumped due to race
*/
J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
......
......@@ -111,9 +111,6 @@ static void ext4_finish_bio(struct bio *bio)
unsigned under_io = 0;
unsigned long flags;
if (!page)
continue;
if (fscrypt_is_bounce_page(page)) {
bounce_page = page;
page = fscrypt_pagecache_page(bounce_page);
......@@ -438,7 +435,6 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
int ext4_bio_write_page(struct ext4_io_submit *io,
struct page *page,
int len,
struct writeback_control *wbc,
bool keep_towrite)
{
struct page *bounce_page = NULL;
......@@ -448,6 +444,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
int ret = 0;
int nr_submitted = 0;
int nr_to_submit = 0;
struct writeback_control *wbc = io->io_wbc;
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
......
This diff is collapsed.
......@@ -1927,7 +1927,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
} else {
/* Allocate a buffer where we construct the new block. */
s->base = kzalloc(sb->s_blocksize, GFP_NOFS);
/* assert(header == s->base) */
error = -ENOMEM;
if (s->base == NULL)
goto cleanup;
......
......@@ -1869,9 +1869,7 @@ static int load_superblock(journal_t *journal)
if (jbd2_has_feature_fast_commit(journal)) {
journal->j_fc_last = be32_to_cpu(sb->s_maxlen);
num_fc_blocks = be32_to_cpu(sb->s_num_fc_blks);
if (!num_fc_blocks)
num_fc_blocks = JBD2_MIN_FC_BLOCKS;
num_fc_blocks = jbd2_journal_get_num_fc_blks(sb);
if (journal->j_last - num_fc_blocks >= JBD2_MIN_JOURNAL_BLOCKS)
journal->j_last = journal->j_fc_last - num_fc_blocks;
journal->j_fc_first = journal->j_last + 1;
......@@ -2102,9 +2100,7 @@ jbd2_journal_initialize_fast_commit(journal_t *journal)
journal_superblock_t *sb = journal->j_superblock;
unsigned long long num_fc_blks;
num_fc_blks = be32_to_cpu(sb->s_num_fc_blks);
if (num_fc_blks == 0)
num_fc_blks = JBD2_MIN_FC_BLOCKS;
num_fc_blks = jbd2_journal_get_num_fc_blks(sb);
if (journal->j_last - num_fc_blks < JBD2_MIN_JOURNAL_BLOCKS)
return -ENOSPC;
......
......@@ -68,7 +68,7 @@ extern void *jbd2_alloc(size_t size, gfp_t flags);
extern void jbd2_free(void *ptr, size_t size);
#define JBD2_MIN_JOURNAL_BLOCKS 1024
#define JBD2_MIN_FC_BLOCKS 256
#define JBD2_DEFAULT_FAST_COMMIT_BLOCKS 256
#ifdef __KERNEL__
......@@ -538,6 +538,7 @@ struct transaction_chp_stats_s {
* The transaction keeps track of all of the buffers modified by a
* running transaction, and all of the buffers committed but not yet
* flushed to home for finished transactions.
* (Locking Documentation improved by LockDoc)
*/
/*
......@@ -658,12 +659,12 @@ struct transaction_s
unsigned long t_start;
/*
* When commit was requested
* When commit was requested [j_state_lock]
*/
unsigned long t_requested;
/*
* Checkpointing stats [j_checkpoint_sem]
* Checkpointing stats [j_list_lock]
*/
struct transaction_chp_stats_s t_chp_stats;
......@@ -1691,6 +1692,13 @@ static inline int jbd2_journal_has_csum_v2or3(journal_t *journal)
return journal->j_chksum_driver != NULL;
}
/*
 * Return the number of fast commit blocks recorded in the journal superblock
 * @jsb (big-endian on disk), or JBD2_DEFAULT_FAST_COMMIT_BLOCKS when the
 * recorded value is zero.
 */
static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)
{
	int nblks = be32_to_cpu(jsb->s_num_fc_blks);

	if (nblks == 0)
		return JBD2_DEFAULT_FAST_COMMIT_BLOCKS;

	return nblks;
}
/*
* Return number of free blocks in the log. Must be called under j_state_lock.
*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment