Commit b436b9be authored by Jan Kara's avatar Jan Kara Committed by Theodore Ts'o

ext4: Wait for proper transaction commit on fsync

We cannot rely on buffer dirty bits during fsync because pdflush can come
before fsync is called and clear dirty bits without forcing a transaction
commit. What we do is that we track which transaction has last changed
the inode and which transaction last changed allocation and force it to
disk on fsync.
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 194074ac
...@@ -709,6 +709,13 @@ struct ext4_inode_info { ...@@ -709,6 +709,13 @@ struct ext4_inode_info {
struct list_head i_aio_dio_complete_list; struct list_head i_aio_dio_complete_list;
/* current io_end structure for async DIO write*/ /* current io_end structure for async DIO write*/
ext4_io_end_t *cur_aio_dio; ext4_io_end_t *cur_aio_dio;
/*
* Transactions that contain inode's metadata needed to complete
* fsync and fdatasync, respectively.
*/
tid_t i_sync_tid;
tid_t i_datasync_tid;
}; };
/* /*
......
...@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) ...@@ -249,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
return 0; return 0;
} }
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
struct inode *inode,
int datasync)
{
struct ext4_inode_info *ei = EXT4_I(inode);
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
if (datasync)
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
}
/* super.c */ /* super.c */
int ext4_force_commit(struct super_block *sb); int ext4_force_commit(struct super_block *sb);
......
...@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ...@@ -3058,6 +3058,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
ret = ext4_convert_unwritten_extents_dio(handle, inode, ret = ext4_convert_unwritten_extents_dio(handle, inode,
path); path);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
goto out2; goto out2;
} }
/* buffered IO case */ /* buffered IO case */
...@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ...@@ -3085,6 +3087,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
ret = ext4_ext_convert_to_initialized(handle, inode, ret = ext4_ext_convert_to_initialized(handle, inode,
path, iblock, path, iblock,
max_blocks); max_blocks);
if (ret >= 0)
ext4_update_inode_fsync_trans(handle, inode, 1);
out: out:
if (ret <= 0) { if (ret <= 0) {
err = ret; err = ret;
...@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ...@@ -3323,10 +3327,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
allocated = ext4_ext_get_actual_len(&newex); allocated = ext4_ext_get_actual_len(&newex);
set_buffer_new(bh_result); set_buffer_new(bh_result);
/* Cache only when it is _not_ an uninitialized extent */ /*
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) * Cache the extent and update transaction to commit on fdatasync only
* when it is _not_ an uninitialized extent.
*/
if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
ext4_ext_put_in_cache(inode, iblock, allocated, newblock, ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
EXT4_EXT_CACHE_EXTENT); EXT4_EXT_CACHE_EXTENT);
ext4_update_inode_fsync_trans(handle, inode, 1);
} else
ext4_update_inode_fsync_trans(handle, inode, 0);
out: out:
if (allocated > max_blocks) if (allocated > max_blocks)
allocated = max_blocks; allocated = max_blocks;
......
...@@ -51,25 +51,30 @@ ...@@ -51,25 +51,30 @@
int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
{ {
struct inode *inode = dentry->d_inode; struct inode *inode = dentry->d_inode;
struct ext4_inode_info *ei = EXT4_I(inode);
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
int err, ret = 0; int ret;
tid_t commit_tid;
J_ASSERT(ext4_journal_current_handle() == NULL); J_ASSERT(ext4_journal_current_handle() == NULL);
trace_ext4_sync_file(file, dentry, datasync); trace_ext4_sync_file(file, dentry, datasync);
if (inode->i_sb->s_flags & MS_RDONLY)
return 0;
ret = flush_aio_dio_completed_IO(inode); ret = flush_aio_dio_completed_IO(inode);
if (ret < 0) if (ret < 0)
return ret; return ret;
if (!journal)
return simple_fsync(file, dentry, datasync);
/* /*
* data=writeback: * data=writeback,ordered:
* The caller's filemap_fdatawrite()/wait will sync the data. * The caller's filemap_fdatawrite()/wait will sync the data.
* sync_inode() will sync the metadata * Metadata is in the journal, we wait for proper transaction to
* * commit here.
* data=ordered:
* The caller's filemap_fdatawrite() will write the data and
* sync_inode() will write the inode if it is dirty. Then the caller's
* filemap_fdatawait() will wait on the pages.
* *
* data=journal: * data=journal:
* filemap_fdatawrite won't do anything (the buffers are clean). * filemap_fdatawrite won't do anything (the buffers are clean).
...@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) ...@@ -82,27 +87,10 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ext4_should_journal_data(inode)) if (ext4_should_journal_data(inode))
return ext4_force_commit(inode->i_sb); return ext4_force_commit(inode->i_sb);
if (!journal) commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
ret = sync_mapping_buffers(inode->i_mapping); if (jbd2_log_start_commit(journal, commit_tid))
jbd2_log_wait_commit(journal, commit_tid);
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) else if (journal->j_flags & JBD2_BARRIER)
goto out;
/*
* The VFS has written the file data. If the inode is unaltered
* then we need not start a commit.
*/
if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 0, /* sys_fsync did this */
};
err = sync_inode(inode, &wbc);
if (ret == 0)
ret = err;
}
out:
if (journal && (journal->j_flags & JBD2_BARRIER))
blkdev_issue_flush(inode->i_sb->s_bdev, NULL); blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
return ret; return ret;
} }
...@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ...@@ -983,6 +983,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
goto cleanup; goto cleanup;
set_buffer_new(bh_result); set_buffer_new(bh_result);
ext4_update_inode_fsync_trans(handle, inode, 1);
got_it: got_it:
map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
if (count > blocks_to_boundary) if (count > blocks_to_boundary)
...@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ...@@ -4738,6 +4740,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
struct ext4_inode *raw_inode; struct ext4_inode *raw_inode;
struct ext4_inode_info *ei; struct ext4_inode_info *ei;
struct inode *inode; struct inode *inode;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret; long ret;
int block; int block;
...@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ...@@ -4802,6 +4805,31 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_data[block] = raw_inode->i_block[block]; ei->i_data[block] = raw_inode->i_block[block];
INIT_LIST_HEAD(&ei->i_orphan); INIT_LIST_HEAD(&ei->i_orphan);
/*
* Set transaction id's of transactions that have to be committed
* to finish f[data]sync. We set them to currently running transaction
* as we cannot be sure that the inode or some of its metadata isn't
* part of the transaction - the inode could have been reclaimed and
* now it is reread from disk.
*/
if (journal) {
transaction_t *transaction;
tid_t tid;
spin_lock(&journal->j_state_lock);
if (journal->j_running_transaction)
transaction = journal->j_running_transaction;
else
transaction = journal->j_committing_transaction;
if (transaction)
tid = transaction->t_tid;
else
tid = journal->j_commit_sequence;
spin_unlock(&journal->j_state_lock);
ei->i_sync_tid = tid;
ei->i_datasync_tid = tid;
}
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
...@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle, ...@@ -5056,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
err = rc; err = rc;
ei->i_state &= ~EXT4_STATE_NEW; ei->i_state &= ~EXT4_STATE_NEW;
ext4_update_inode_fsync_trans(handle, inode, 0);
out_brelse: out_brelse:
brelse(bh); brelse(bh);
ext4_std_error(inode->i_sb, err); ext4_std_error(inode->i_sb, err);
......
...@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ...@@ -706,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
spin_lock_init(&(ei->i_block_reservation_lock)); spin_lock_init(&(ei->i_block_reservation_lock));
INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
ei->cur_aio_dio = NULL; ei->cur_aio_dio = NULL;
ei->i_sync_tid = 0;
ei->i_datasync_tid = 0;
return &ei->vfs_inode; return &ei->vfs_inode;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment