Commit 2b38960c authored by Andrew Morton, committed by Linus Torvalds

[PATCH] JBD: ordered-data commit cleanup

For data=ordered, kjournald at commit time has to write out and wait upon a
long list of buffers.  It does this in a rather awkward way with a single
list.  This causes complexity and long lock hold times, and makes the
addition of rescheduling points quite hard.

So what we do instead (based on Chris Mason's suggestion) is to add a new
buffer list (t_locked_list) to the journal.  It contains buffers which have
been placed under I/O.

So as we walk the t_sync_datalist list we move buffers over to t_locked_list
as they are written out.

When t_sync_datalist is empty we may then walk t_locked_list waiting for the
I/O to complete.

As a side-effect this means that we can remove the nasty synchronous wait in
journal_dirty_data which is there to avoid the kjournald livelock which would
otherwise occur when someone is continuously dirtying a buffer.
parent 376fd482
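
In outline, the new scheme is a two-phase loop.  The following is a minimal
illustrative sketch, not the literal kernel code: refcounting, the
j_list_lock vs. bh-state lock-inversion handling and the rescheduling points
are elided, and dirty buffers are submitted one at a time here rather than
batched through wbuf[] as the real code below does:

	/* Phase 1: drain t_sync_datalist, submitting dirty buffers. */
	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		if (buffer_dirty(bh))
			ll_rw_block(WRITE, 1, &bh);	/* now locked, no longer dirty */
		if (buffer_locked(bh)) {
			/* I/O in flight: park the buffer for phase 2 */
			__journal_unfile_buffer(jh);
			__journal_file_buffer(jh, jh->b_transaction, BJ_Locked);
		} else {
			/* already written back: nothing more to do */
			__journal_unfile_buffer(jh);
			jh->b_transaction = NULL;
		}
	}

	/* Phase 2: t_sync_datalist is empty; wait on the parked I/O. */
	while (commit_transaction->t_locked_list) {
		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		wait_on_buffer(bh);	/* sleep until the write completes */
		if (!buffer_uptodate(bh))
			err = -EIO;
		__journal_unfile_buffer(jh);
		jh->b_transaction = NULL;
	}

Because each loop can drop j_list_lock and reschedule between buffers, the
lock is never held across the whole list, and journal_dirty_data can simply
leave a redirtied buffer on BJ_Locked instead of waiting on it.
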
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -78,6 +78,21 @@ static void release_buffer_page(struct buffer_head *bh)
 	__brelse(bh);
 }
 
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * return 0.  j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+	if (!jbd_trylock_bh_state(bh)) {
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		return 0;
+	}
+	return 1;
+}
+
 /*
  * journal_commit_transaction
  *
@@ -88,7 +103,6 @@ void journal_commit_transaction(journal_t *journal)
 {
 	transaction_t *commit_transaction;
 	struct journal_head *jh, *new_jh, *descriptor;
-	struct journal_head *next_jh, *last_jh;
 	struct buffer_head *wbuf[64];
 	int bufs;
 	int flags;
@@ -222,113 +236,110 @@ void journal_commit_transaction(journal_t *journal)
 	err = 0;
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_datalist, so we have to keep looping back to write_out_data
-	 * until we *know* that the list is empty.
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
 	 */
-write_out_data:
+	bufs = 0;
+
 	/*
 	 * Cleanup any flushed data buffers from the data list.  Even in
 	 * abort mode, we want to flush this out as soon as possible.
+	 *
+	 * We take j_list_lock to protect the lists from
+	 * journal_try_to_free_buffers().
 	 */
+write_out_data:
+	cond_resched();
 	spin_lock(&journal->j_list_lock);
-write_out_data_locked:
-	bufs = 0;
-	next_jh = commit_transaction->t_sync_datalist;
-	if (next_jh == NULL)
-		goto sync_datalist_empty;
-	last_jh = next_jh->b_tprev;
 
-	do {
+	while (commit_transaction->t_sync_datalist) {
 		struct buffer_head *bh;
 
-		jh = next_jh;
-		next_jh = jh->b_tnext;
+		jh = commit_transaction->t_sync_datalist;
+		commit_transaction->t_sync_datalist = jh->b_tnext;
 		bh = jh2bh(jh);
-		if (!buffer_locked(bh)) {
+		if (buffer_locked(bh)) {
+			BUFFER_TRACE(bh, "locked");
+			if (!inverted_lock(journal, bh))
+				goto write_out_data;
+			__journal_unfile_buffer(jh);
+			__journal_file_buffer(jh, jh->b_transaction, BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (need_resched()) {
+				spin_unlock(&journal->j_list_lock);
+				goto write_out_data;
+			}
+		} else {
 			if (buffer_dirty(bh)) {
 				BUFFER_TRACE(bh, "start journal writeout");
-				atomic_inc(&bh->b_count);
+				get_bh(bh);
 				wbuf[bufs++] = bh;
+				if (bufs == ARRAY_SIZE(wbuf)) {
+					jbd_debug(2, "submit %d writes\n",
+							bufs);
+					spin_unlock(&journal->j_list_lock);
+					ll_rw_block(WRITE, bufs, wbuf);
+					journal_brelse_array(wbuf, bufs);
+					bufs = 0;
+					goto write_out_data;
+				}
 			} else {
 				BUFFER_TRACE(bh, "writeout complete: unfile");
-				/*
-				 * We have a lock ranking problem..
-				 */
-				if (!jbd_trylock_bh_state(bh)) {
-					spin_unlock(&journal->j_list_lock);
-					schedule();
+				if (!inverted_lock(journal, bh))
 					goto write_out_data;
-				}
 				__journal_unfile_buffer(jh);
 				jh->b_transaction = NULL;
 				jbd_unlock_bh_state(bh);
 				journal_remove_journal_head(bh);
-				__brelse(bh);
-				if (need_resched() && commit_transaction->
-						t_sync_datalist) {
-					commit_transaction->t_sync_datalist =
-								next_jh;
-					if (bufs)
-						break;
+				put_bh(bh);
+				if (need_resched()) {
 					spin_unlock(&journal->j_list_lock);
+					cond_resched();
 					goto write_out_data;
 				}
 			}
 		}
-		if (bufs == ARRAY_SIZE(wbuf)) {
-			/*
-			 * Major speedup: start here on the next scan
-			 */
-			J_ASSERT(commit_transaction->t_sync_datalist != 0);
-			commit_transaction->t_sync_datalist = jh;
-			break;
-		}
-	} while (jh != last_jh);
+	}
 
-	if (bufs || need_resched()) {
-		jbd_debug(2, "submit %d writes\n", bufs);
+	if (bufs) {
 		spin_unlock(&journal->j_list_lock);
-		if (bufs)
-			ll_rw_block(WRITE, bufs, wbuf);
-		cond_resched();
+		ll_rw_block(WRITE, bufs, wbuf);
 		journal_brelse_array(wbuf, bufs);
 		spin_lock(&journal->j_list_lock);
-		goto write_out_data_locked;
 	}
 
 	/*
-	 * Wait for all previously submitted IO on the data list to complete.
+	 * Wait for all previously submitted IO to complete.
 	 */
-	jh = commit_transaction->t_sync_datalist;
-	if (jh == NULL)
-		goto sync_datalist_empty;
-
-	do {
+	while (commit_transaction->t_locked_list) {
 		struct buffer_head *bh;
-		jh = jh->b_tprev;	/* Wait on the last written */
+
+		jh = commit_transaction->t_locked_list->b_tprev;
 		bh = jh2bh(jh);
+		get_bh(bh);
 		if (buffer_locked(bh)) {
-			get_bh(bh);
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
 			if (unlikely(!buffer_uptodate(bh)))
 				err = -EIO;
+			spin_lock(&journal->j_list_lock);
+		}
+		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
-			/* the journal_head may have been removed now */
-			goto write_out_data;
-		} else if (buffer_dirty(bh)) {
-			goto write_out_data_locked;
+			spin_lock(&journal->j_list_lock);
+			continue;
 		}
-	} while (jh != commit_transaction->t_sync_datalist);
-	goto write_out_data_locked;
-
-sync_datalist_empty:
+		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+			__journal_unfile_buffer(jh);
+			jh->b_transaction = NULL;
+			jbd_unlock_bh_state(bh);
+			journal_remove_journal_head(bh);
+			put_bh(bh);
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+		put_bh(bh);
+		if (need_resched()) {
+			spin_unlock(&journal->j_list_lock);
+			cond_resched();
+			spin_lock(&journal->j_list_lock);
+		}
+	}
 	spin_unlock(&journal->j_list_lock);
 
 	journal_write_revoke_records(journal, commit_transaction);
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1010,7 +1010,8 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	 * the write() data.
 	 */
 	if (jh->b_jlist != BJ_None &&
-			jh->b_jlist != BJ_SyncData) {
+			jh->b_jlist != BJ_SyncData &&
+			jh->b_jlist != BJ_Locked) {
 		JBUFFER_TRACE(jh, "Not stealing");
 		goto no_journal;
 	}
@@ -1048,7 +1049,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 	 * committing transaction, so might still be left on that
 	 * transaction's metadata lists.
 	 */
-	if (jh->b_jlist != BJ_SyncData) {
+	if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
 		JBUFFER_TRACE(jh, "not on correct data list: unfile");
 		J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
 		__journal_unfile_buffer(jh);
@@ -1539,6 +1540,9 @@ void __journal_unfile_buffer(struct journal_head *jh)
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1576,7 +1580,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 	spin_lock(&journal->j_list_lock);
 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
-		if (jh->b_jlist == BJ_SyncData) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
 			/* A written-back ordered data buffer */
 			JBUFFER_TRACE(jh, "release data");
 			__journal_unfile_buffer(jh);
@@ -1985,6 +1989,9 @@ void __journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_add_buffer(list, jh);

--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -486,6 +486,12 @@ struct transaction_s
 	 */
 	struct journal_head *t_reserved_list;
 
+	/*
+	 * Doubly-linked circular list of all buffers under writeout during
+	 * commit [j_list_lock]
+	 */
+	struct journal_head *t_locked_list;
+
 	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
@@ -1079,7 +1085,8 @@ static inline int jbd_space_needed(journal_t *journal)
 #define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
 #define BJ_LogCtl	6	/* Buffer contains log descriptors */
 #define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Types	8
+#define BJ_Locked	8	/* Locked for I/O during commit */
+#define BJ_Types	9
 
 extern int jbd_blocks_per_page(struct inode *inode);