Commit b34090e5 authored by Jan Kara's avatar Jan Kara Committed by Theodore Ts'o

jbd2: refine waiting for shadow buffers

Currently when we add a buffer to a transaction, we wait until the
buffer is removed from BJ_Shadow list (so that we prevent any changes
to the buffer that is just written to the journal).  This can take
unnecessarily long as a lot happens between the time the buffer is
submitted to the journal and the time when we remove the buffer from
BJ_Shadow list.  (e.g.  We wait for all data buffers in the
transaction, we issue a cache flush, etc.)  Also this creates a
dependency of do_get_write_access() on transaction commit (namely
waiting for data IO to complete) which we want to avoid when
implementing transaction reservation.

So we modify commit code to set new BH_Shadow flag when temporary
shadowing buffer is created and we clear that flag once IO on that
buffer is complete.  This allows do_get_write_access() to wait only
for BH_Shadow bit and thus removes the dependency on data IO
completion.
Reviewed-by: default avatarZheng Liu <wenqing.lz@taobao.com>
Signed-off-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent e5a120ae
...@@ -30,15 +30,22 @@ ...@@ -30,15 +30,22 @@
#include <trace/events/jbd2.h> #include <trace/events/jbd2.h>
/* /*
* Default IO end handler for temporary BJ_IO buffer_heads. * IO end handler for temporary buffer_heads handling writes to the journal.
*/ */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{ {
struct buffer_head *orig_bh = bh->b_private;
BUFFER_TRACE(bh, ""); BUFFER_TRACE(bh, "");
if (uptodate) if (uptodate)
set_buffer_uptodate(bh); set_buffer_uptodate(bh);
else else
clear_buffer_uptodate(bh); clear_buffer_uptodate(bh);
if (orig_bh) {
clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
smp_mb__after_clear_bit();
wake_up_bit(&orig_bh->b_state, BH_Shadow);
}
unlock_buffer(bh); unlock_buffer(bh);
} }
...@@ -832,6 +839,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -832,6 +839,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
bh = jh2bh(jh); bh = jh2bh(jh);
clear_buffer_jwrite(bh); clear_buffer_jwrite(bh);
J_ASSERT_BH(bh, buffer_jbddirty(bh)); J_ASSERT_BH(bh, buffer_jbddirty(bh));
J_ASSERT_BH(bh, !buffer_shadow(bh));
/* The metadata is now released for reuse, but we need /* The metadata is now released for reuse, but we need
to remember it against this transaction so that when to remember it against this transaction so that when
...@@ -839,14 +847,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) ...@@ -839,14 +847,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
required. */ required. */
JBUFFER_TRACE(jh, "file as BJ_Forget"); JBUFFER_TRACE(jh, "file as BJ_Forget");
jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget); jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
/*
* Wake up any transactions which were waiting for this IO to
* complete. The barrier must be here so that changes by
* jbd2_journal_file_buffer() take effect before wake_up_bit()
* does the waitqueue check.
*/
smp_mb();
wake_up_bit(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "brelse shadowed buffer"); JBUFFER_TRACE(jh, "brelse shadowed buffer");
__brelse(bh); __brelse(bh);
} }
......
...@@ -451,6 +451,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, ...@@ -451,6 +451,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
new_bh->b_size = bh_in->b_size; new_bh->b_size = bh_in->b_size;
new_bh->b_bdev = journal->j_dev; new_bh->b_bdev = journal->j_dev;
new_bh->b_blocknr = blocknr; new_bh->b_blocknr = blocknr;
new_bh->b_private = bh_in;
set_buffer_mapped(new_bh); set_buffer_mapped(new_bh);
set_buffer_dirty(new_bh); set_buffer_dirty(new_bh);
...@@ -465,6 +466,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, ...@@ -465,6 +466,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
spin_lock(&journal->j_list_lock); spin_lock(&journal->j_list_lock);
__jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow); __jbd2_journal_file_buffer(jh_in, transaction, BJ_Shadow);
spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_list_lock);
set_buffer_shadow(bh_in);
jbd_unlock_bh_state(bh_in); jbd_unlock_bh_state(bh_in);
return do_escape | (done_copy_out << 1); return do_escape | (done_copy_out << 1);
......
...@@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh) ...@@ -619,6 +619,12 @@ static void warn_dirty_buffer(struct buffer_head *bh)
bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
} }
static int sleep_on_shadow_bh(void *word)
{
io_schedule();
return 0;
}
/* /*
* If the buffer is already part of the current transaction, then there * If the buffer is already part of the current transaction, then there
* is nothing we need to do. If it is already part of a prior * is nothing we need to do. If it is already part of a prior
...@@ -754,41 +760,29 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, ...@@ -754,41 +760,29 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* journaled. If the primary copy is already going to * journaled. If the primary copy is already going to
* disk then we cannot do copy-out here. */ * disk then we cannot do copy-out here. */
if (jh->b_jlist == BJ_Shadow) { if (buffer_shadow(bh)) {
DEFINE_WAIT_BIT(wait, &bh->b_state, BH_Unshadow);
wait_queue_head_t *wqh;
wqh = bit_waitqueue(&bh->b_state, BH_Unshadow);
JBUFFER_TRACE(jh, "on shadow: sleep"); JBUFFER_TRACE(jh, "on shadow: sleep");
jbd_unlock_bh_state(bh); jbd_unlock_bh_state(bh);
/* commit wakes up all shadow buffers after IO */ wait_on_bit(&bh->b_state, BH_Shadow,
for ( ; ; ) { sleep_on_shadow_bh, TASK_UNINTERRUPTIBLE);
prepare_to_wait(wqh, &wait.wait,
TASK_UNINTERRUPTIBLE);
if (jh->b_jlist != BJ_Shadow)
break;
schedule();
}
finish_wait(wqh, &wait.wait);
goto repeat; goto repeat;
} }
/* Only do the copy if the currently-owning transaction /*
* still needs it. If it is on the Forget list, the * Only do the copy if the currently-owning transaction still
* committing transaction is past that stage. The * needs it. If buffer isn't on BJ_Metadata list, the
* buffer had better remain locked during the kmalloc, * committing transaction is past that stage (here we use the
* but that should be true --- we hold the journal lock * fact that BH_Shadow is set under bh_state lock together with
* still and the buffer is already on the BUF_JOURNAL * refiling to BJ_Shadow list and at this point we know the
* list so won't be flushed. * buffer doesn't have BH_Shadow set).
* *
* Subtle point, though: if this is a get_undo_access, * Subtle point, though: if this is a get_undo_access,
* then we will be relying on the frozen_data to contain * then we will be relying on the frozen_data to contain
* the new value of the committed_data record after the * the new value of the committed_data record after the
* transaction, so we HAVE to force the frozen_data copy * transaction, so we HAVE to force the frozen_data copy
* in that case. */ * in that case.
*/
if (jh->b_jlist != BJ_Forget || force_copy) { if (jh->b_jlist == BJ_Metadata || force_copy) {
JBUFFER_TRACE(jh, "generate frozen data"); JBUFFER_TRACE(jh, "generate frozen data");
if (!frozen_buffer) { if (!frozen_buffer) {
JBUFFER_TRACE(jh, "allocate memory for buffer"); JBUFFER_TRACE(jh, "allocate memory for buffer");
......
...@@ -244,6 +244,31 @@ typedef struct journal_superblock_s ...@@ -244,6 +244,31 @@ typedef struct journal_superblock_s
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/sched.h> #include <linux/sched.h>
enum jbd_state_bits {
BH_JBD /* Has an attached ext3 journal_head */
= BH_PrivateStart,
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
BH_Freed, /* Has been freed (truncated) */
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
BH_JBDPrivateStart, /* First bit available for private use by FS */
};
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
#include <linux/jbd_common.h> #include <linux/jbd_common.h>
#define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT(assert) BUG_ON(!(assert))
......
...@@ -302,6 +302,34 @@ typedef struct journal_superblock_s ...@@ -302,6 +302,34 @@ typedef struct journal_superblock_s
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/sched.h> #include <linux/sched.h>
enum jbd_state_bits {
BH_JBD /* Has an attached ext3 journal_head */
= BH_PrivateStart,
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
BH_Freed, /* Has been freed (truncated) */
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Shadow, /* IO on shadow buffer is running */
BH_Verified, /* Metadata block has been verified ok */
BH_JBDPrivateStart, /* First bit available for private use by FS */
};
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Shadow, shadow)
BUFFER_FNS(Verified, verified)
#include <linux/jbd_common.h> #include <linux/jbd_common.h>
#define J_ASSERT(assert) BUG_ON(!(assert)) #define J_ASSERT(assert) BUG_ON(!(assert))
......
#ifndef _LINUX_JBD_STATE_H #ifndef _LINUX_JBD_STATE_H
#define _LINUX_JBD_STATE_H #define _LINUX_JBD_STATE_H
enum jbd_state_bits {
BH_JBD /* Has an attached ext3 journal_head */
= BH_PrivateStart,
BH_JWrite, /* Being written to log (@@@ DEBUGGING) */
BH_Freed, /* Has been freed (truncated) */
BH_Revoked, /* Has been revoked from the log */
BH_RevokeValid, /* Revoked flag is valid */
BH_JBDDirty, /* Is dirty but journaled */
BH_State, /* Pins most journal_head state */
BH_JournalHead, /* Pins bh->b_private and jh->b_bh */
BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */
BH_Verified, /* Metadata block has been verified ok */
BH_JBDPrivateStart, /* First bit available for private use by FS */
};
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JWrite, jwrite)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
BUFFER_FNS(Revoked, revoked)
TAS_BUFFER_FNS(Revoked, revoked)
BUFFER_FNS(RevokeValid, revokevalid)
TAS_BUFFER_FNS(RevokeValid, revokevalid)
BUFFER_FNS(Freed, freed)
BUFFER_FNS(Verified, verified)
static inline struct buffer_head *jh2bh(struct journal_head *jh) static inline struct buffer_head *jh2bh(struct journal_head *jh)
{ {
return jh->b_bh; return jh->b_bh;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment