Commit 18ce3751 authored by Jens Axboe's avatar Jens Axboe

Properly notify block layer of sync writes

fsync_buffers_list() and sync_dirty_buffer() both issue async writes and
then immediately wait on them. Conceptually, that makes them sync writes
and we should treat them as such so that the IO schedulers can handle
them appropriately.

This patch fixes a write starvation issue that Lin Ming reported, where
xx is stuck for more than 2 minutes because of a large number of
synchronous IO in the system:

INFO: task kjournald:20558 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
message.
kjournald     D ffff810010820978  6712 20558      2
ffff81022ddb1d10 0000000000000046 ffff81022e7baa10 ffffffff803ba6f2
ffff81022ecd0000 ffff8101e6dc9160 ffff81022ecd0348 000000008048b6cb
0000000000000086 ffff81022c4e8d30 0000000000000000 ffffffff80247537
Call Trace:
[<ffffffff803ba6f2>] kobject_get+0x12/0x17
[<ffffffff80247537>] getnstimeofday+0x2f/0x83
[<ffffffff8029c1ac>] sync_buffer+0x0/0x3f
[<ffffffff8066d195>] io_schedule+0x5d/0x9f
[<ffffffff8029c1e7>] sync_buffer+0x3b/0x3f
[<ffffffff8066d3f0>] __wait_on_bit+0x40/0x6f
[<ffffffff8029c1ac>] sync_buffer+0x0/0x3f
[<ffffffff8066d48b>] out_of_line_wait_on_bit+0x6c/0x78
[<ffffffff80243909>] wake_bit_function+0x0/0x23
[<ffffffff8029e3ad>] sync_dirty_buffer+0x98/0xcb
[<ffffffff8030056b>] journal_commit_transaction+0x97d/0xcb6
[<ffffffff8023a676>] lock_timer_base+0x26/0x4b
[<ffffffff8030300a>] kjournald+0xc1/0x1fb
[<ffffffff802438db>] autoremove_wake_function+0x0/0x2e
[<ffffffff80302f49>] kjournald+0x0/0x1fb
[<ffffffff802437bb>] kthread+0x47/0x74
[<ffffffff8022de51>] schedule_tail+0x28/0x5d
[<ffffffff8020cac8>] child_rip+0xa/0x12
[<ffffffff80243774>] kthread+0x0/0x74
[<ffffffff8020cabe>] child_rip+0x0/0x12

Lin Ming confirms that this patch fixes the issue. I've run tests with
it for the past week and no ill effects have been observed, so I'm
proposing it for inclusion into 2.6.26.
Signed-off-by: default avatarJens Axboe <jens.axboe@oracle.com>
parent d585d0b9
...@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) ...@@ -821,7 +821,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* contents - it is a noop if I/O is still in * contents - it is a noop if I/O is still in
* flight on potentially older contents. * flight on potentially older contents.
*/ */
ll_rw_block(SWRITE, 1, &bh); ll_rw_block(SWRITE_SYNC, 1, &bh);
brelse(bh); brelse(bh);
spin_lock(lock); spin_lock(lock);
} }
...@@ -2940,15 +2940,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) ...@@ -2940,15 +2940,18 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
for (i = 0; i < nr; i++) { for (i = 0; i < nr; i++) {
struct buffer_head *bh = bhs[i]; struct buffer_head *bh = bhs[i];
if (rw == SWRITE) if (rw == SWRITE || rw == SWRITE_SYNC)
lock_buffer(bh); lock_buffer(bh);
else if (test_set_buffer_locked(bh)) else if (test_set_buffer_locked(bh))
continue; continue;
if (rw == WRITE || rw == SWRITE) { if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
if (test_clear_buffer_dirty(bh)) { if (test_clear_buffer_dirty(bh)) {
bh->b_end_io = end_buffer_write_sync; bh->b_end_io = end_buffer_write_sync;
get_bh(bh); get_bh(bh);
if (rw == SWRITE_SYNC)
submit_bh(WRITE_SYNC, bh);
else
submit_bh(WRITE, bh); submit_bh(WRITE, bh);
continue; continue;
} }
...@@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh) ...@@ -2978,7 +2981,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
if (test_clear_buffer_dirty(bh)) { if (test_clear_buffer_dirty(bh)) {
get_bh(bh); get_bh(bh);
bh->b_end_io = end_buffer_write_sync; bh->b_end_io = end_buffer_write_sync;
ret = submit_bh(WRITE, bh); ret = submit_bh(WRITE_SYNC, bh);
wait_on_buffer(bh); wait_on_buffer(bh);
if (buffer_eopnotsupp(bh)) { if (buffer_eopnotsupp(bh)) {
clear_buffer_eopnotsupp(bh); clear_buffer_eopnotsupp(bh);
......
...@@ -83,6 +83,7 @@ extern int dir_notify_enable; ...@@ -83,6 +83,7 @@ extern int dir_notify_enable;
#define READ_SYNC (READ | (1 << BIO_RW_SYNC)) #define READ_SYNC (READ | (1 << BIO_RW_SYNC))
#define READ_META (READ | (1 << BIO_RW_META)) #define READ_META (READ | (1 << BIO_RW_META))
#define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC)) #define WRITE_SYNC (WRITE | (1 << BIO_RW_SYNC))
#define SWRITE_SYNC (SWRITE | (1 << BIO_RW_SYNC))
#define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) #define WRITE_BARRIER ((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
#define SEL_IN 1 #define SEL_IN 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment