Commit ccc5ff94 authored by Linus Torvalds

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable

* git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-unstable:
  Btrfs: fix btrfs fallocate oops and deadlock
  Btrfs: use the right node in reada_for_balance
  Btrfs: fix oops on page->mapping->host during writepage
  Btrfs: add a priority queue to the async thread helpers
  Btrfs: use WRITE_SYNC for synchronous writes
parents c19c6c32 546888da
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#define WORK_QUEUED_BIT 0 #define WORK_QUEUED_BIT 0
#define WORK_DONE_BIT 1 #define WORK_DONE_BIT 1
#define WORK_ORDER_DONE_BIT 2 #define WORK_ORDER_DONE_BIT 2
#define WORK_HIGH_PRIO_BIT 3
/* /*
* container for the kthread task pointer and the list of pending work * container for the kthread task pointer and the list of pending work
...@@ -36,6 +37,7 @@ struct btrfs_worker_thread { ...@@ -36,6 +37,7 @@ struct btrfs_worker_thread {
/* list of struct btrfs_work that are waiting for service */ /* list of struct btrfs_work that are waiting for service */
struct list_head pending; struct list_head pending;
struct list_head prio_pending;
/* list of worker threads from struct btrfs_workers */ /* list of worker threads from struct btrfs_workers */
struct list_head worker_list; struct list_head worker_list;
...@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers, ...@@ -103,10 +105,16 @@ static noinline int run_ordered_completions(struct btrfs_workers *workers,
spin_lock_irqsave(&workers->lock, flags); spin_lock_irqsave(&workers->lock, flags);
while (!list_empty(&workers->order_list)) { while (1) {
if (!list_empty(&workers->prio_order_list)) {
work = list_entry(workers->prio_order_list.next,
struct btrfs_work, order_list);
} else if (!list_empty(&workers->order_list)) {
work = list_entry(workers->order_list.next, work = list_entry(workers->order_list.next,
struct btrfs_work, order_list); struct btrfs_work, order_list);
} else {
break;
}
if (!test_bit(WORK_DONE_BIT, &work->flags)) if (!test_bit(WORK_DONE_BIT, &work->flags))
break; break;
...@@ -143,8 +151,14 @@ static int worker_loop(void *arg) ...@@ -143,8 +151,14 @@ static int worker_loop(void *arg)
do { do {
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
again_locked: again_locked:
while (!list_empty(&worker->pending)) { while (1) {
if (!list_empty(&worker->prio_pending))
cur = worker->prio_pending.next;
else if (!list_empty(&worker->pending))
cur = worker->pending.next; cur = worker->pending.next;
else
break;
work = list_entry(cur, struct btrfs_work, list); work = list_entry(cur, struct btrfs_work, list);
list_del(&work->list); list_del(&work->list);
clear_bit(WORK_QUEUED_BIT, &work->flags); clear_bit(WORK_QUEUED_BIT, &work->flags);
...@@ -163,7 +177,6 @@ static int worker_loop(void *arg) ...@@ -163,7 +177,6 @@ static int worker_loop(void *arg)
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
check_idle_worker(worker); check_idle_worker(worker);
} }
if (freezing(current)) { if (freezing(current)) {
worker->working = 0; worker->working = 0;
...@@ -178,7 +191,8 @@ static int worker_loop(void *arg) ...@@ -178,7 +191,8 @@ static int worker_loop(void *arg)
* jump_in? * jump_in?
*/ */
smp_mb(); smp_mb();
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
continue; continue;
/* /*
...@@ -191,7 +205,8 @@ static int worker_loop(void *arg) ...@@ -191,7 +205,8 @@ static int worker_loop(void *arg)
*/ */
schedule_timeout(1); schedule_timeout(1);
smp_mb(); smp_mb();
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
continue; continue;
if (kthread_should_stop()) if (kthread_should_stop())
...@@ -200,7 +215,8 @@ static int worker_loop(void *arg) ...@@ -200,7 +215,8 @@ static int worker_loop(void *arg)
/* still no more work?, sleep for real */ /* still no more work?, sleep for real */
spin_lock_irq(&worker->lock); spin_lock_irq(&worker->lock);
set_current_state(TASK_INTERRUPTIBLE); set_current_state(TASK_INTERRUPTIBLE);
if (!list_empty(&worker->pending)) if (!list_empty(&worker->pending) ||
!list_empty(&worker->prio_pending))
goto again_locked; goto again_locked;
/* /*
...@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) ...@@ -248,6 +264,7 @@ void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max)
INIT_LIST_HEAD(&workers->worker_list); INIT_LIST_HEAD(&workers->worker_list);
INIT_LIST_HEAD(&workers->idle_list); INIT_LIST_HEAD(&workers->idle_list);
INIT_LIST_HEAD(&workers->order_list); INIT_LIST_HEAD(&workers->order_list);
INIT_LIST_HEAD(&workers->prio_order_list);
spin_lock_init(&workers->lock); spin_lock_init(&workers->lock);
workers->max_workers = max; workers->max_workers = max;
workers->idle_thresh = 32; workers->idle_thresh = 32;
...@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) ...@@ -273,6 +290,7 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
} }
INIT_LIST_HEAD(&worker->pending); INIT_LIST_HEAD(&worker->pending);
INIT_LIST_HEAD(&worker->prio_pending);
INIT_LIST_HEAD(&worker->worker_list); INIT_LIST_HEAD(&worker->worker_list);
spin_lock_init(&worker->lock); spin_lock_init(&worker->lock);
atomic_set(&worker->num_pending, 0); atomic_set(&worker->num_pending, 0);
...@@ -396,6 +414,9 @@ int btrfs_requeue_work(struct btrfs_work *work) ...@@ -396,6 +414,9 @@ int btrfs_requeue_work(struct btrfs_work *work)
goto out; goto out;
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending); list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending); atomic_inc(&worker->num_pending);
...@@ -422,6 +443,11 @@ int btrfs_requeue_work(struct btrfs_work *work) ...@@ -422,6 +443,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
return 0; return 0;
} }
/*
 * mark a work item as high priority by setting WORK_HIGH_PRIO_BIT in
 * work->flags.
 *
 * This only sets the flag; it does not queue or move the work item.
 * btrfs_queue_worker() and btrfs_requeue_work() test the bit when they
 * insert the item, and use it to pick the prio_pending / prio_order_list
 * lists instead of the normal ones, so call this before queueing.
 */
void btrfs_set_work_high_prio(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
/* /*
* places a struct btrfs_work into the pending queue of one of the kthreads * places a struct btrfs_work into the pending queue of one of the kthreads
*/ */
...@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) ...@@ -438,7 +464,12 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
worker = find_worker(workers); worker = find_worker(workers);
if (workers->ordered) { if (workers->ordered) {
spin_lock_irqsave(&workers->lock, flags); spin_lock_irqsave(&workers->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags)) {
list_add_tail(&work->order_list,
&workers->prio_order_list);
} else {
list_add_tail(&work->order_list, &workers->order_list); list_add_tail(&work->order_list, &workers->order_list);
}
spin_unlock_irqrestore(&workers->lock, flags); spin_unlock_irqrestore(&workers->lock, flags);
} else { } else {
INIT_LIST_HEAD(&work->order_list); INIT_LIST_HEAD(&work->order_list);
...@@ -446,6 +477,9 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) ...@@ -446,6 +477,9 @@ int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work)
spin_lock_irqsave(&worker->lock, flags); spin_lock_irqsave(&worker->lock, flags);
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
list_add_tail(&work->list, &worker->prio_pending);
else
list_add_tail(&work->list, &worker->pending); list_add_tail(&work->list, &worker->pending);
atomic_inc(&worker->num_pending); atomic_inc(&worker->num_pending);
check_busy_worker(worker); check_busy_worker(worker);
......
...@@ -85,6 +85,7 @@ struct btrfs_workers { ...@@ -85,6 +85,7 @@ struct btrfs_workers {
* of work items waiting for completion * of work items waiting for completion
*/ */
struct list_head order_list; struct list_head order_list;
struct list_head prio_order_list;
/* lock for finding the next worker thread to queue on */ /* lock for finding the next worker thread to queue on */
spinlock_t lock; spinlock_t lock;
...@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); ...@@ -98,4 +99,5 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers);
int btrfs_stop_workers(struct btrfs_workers *workers); int btrfs_stop_workers(struct btrfs_workers *workers);
void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max);
int btrfs_requeue_work(struct btrfs_work *work); int btrfs_requeue_work(struct btrfs_work *work);
void btrfs_set_work_high_prio(struct btrfs_work *work);
#endif #endif
...@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root, ...@@ -1325,12 +1325,12 @@ static noinline int reada_for_balance(struct btrfs_root *root,
int ret = 0; int ret = 0;
int blocksize; int blocksize;
parent = path->nodes[level - 1]; parent = path->nodes[level + 1];
if (!parent) if (!parent)
return 0; return 0;
nritems = btrfs_header_nritems(parent); nritems = btrfs_header_nritems(parent);
slot = path->slots[level]; slot = path->slots[level + 1];
blocksize = btrfs_level_size(root, level); blocksize = btrfs_level_size(root, level);
if (slot > 0) { if (slot > 0) {
...@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, ...@@ -1341,7 +1341,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
block1 = 0; block1 = 0;
free_extent_buffer(eb); free_extent_buffer(eb);
} }
if (slot < nritems) { if (slot + 1 < nritems) {
block2 = btrfs_node_blockptr(parent, slot + 1); block2 = btrfs_node_blockptr(parent, slot + 1);
gen = btrfs_node_ptr_generation(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1);
eb = btrfs_find_tree_block(root, block2, blocksize); eb = btrfs_find_tree_block(root, block2, blocksize);
...@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root, ...@@ -1351,7 +1351,11 @@ static noinline int reada_for_balance(struct btrfs_root *root,
} }
if (block1 || block2) { if (block1 || block2) {
ret = -EAGAIN; ret = -EAGAIN;
/* release the whole path */
btrfs_release_path(root, path); btrfs_release_path(root, path);
/* read the blocks */
if (block1) if (block1)
readahead_tree_block(root, block1, blocksize, 0); readahead_tree_block(root, block1, blocksize, 0);
if (block2) if (block2)
...@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root, ...@@ -1361,7 +1365,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
eb = read_tree_block(root, block1, blocksize, 0); eb = read_tree_block(root, block1, blocksize, 0);
free_extent_buffer(eb); free_extent_buffer(eb);
} }
if (block1) { if (block2) {
eb = read_tree_block(root, block2, blocksize, 0); eb = read_tree_block(root, block2, blocksize, 0);
free_extent_buffer(eb); free_extent_buffer(eb);
} }
...@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans, ...@@ -1481,12 +1485,15 @@ read_block_for_search(struct btrfs_trans_handle *trans,
* of the btree by dropping locks before * of the btree by dropping locks before
* we read. * we read.
*/ */
btrfs_release_path(NULL, p); btrfs_unlock_up_safe(p, level + 1);
btrfs_set_path_blocking(p);
if (tmp) if (tmp)
free_extent_buffer(tmp); free_extent_buffer(tmp);
if (p->reada) if (p->reada)
reada_for_search(root, p, level, slot, key->objectid); reada_for_search(root, p, level, slot, key->objectid);
btrfs_release_path(NULL, p);
tmp = read_tree_block(root, blocknr, blocksize, gen); tmp = read_tree_block(root, blocknr, blocksize, gen);
if (tmp) if (tmp)
free_extent_buffer(tmp); free_extent_buffer(tmp);
......
...@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, ...@@ -579,6 +579,10 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
async->bio_flags = bio_flags; async->bio_flags = bio_flags;
atomic_inc(&fs_info->nr_async_submits); atomic_inc(&fs_info->nr_async_submits);
if (rw & (1 << BIO_RW_SYNCIO))
btrfs_set_work_high_prio(&async->work);
btrfs_queue_worker(&fs_info->workers, &async->work); btrfs_queue_worker(&fs_info->workers, &async->work);
#if 0 #if 0
int limit = btrfs_async_submit_limit(fs_info); int limit = btrfs_async_submit_limit(fs_info);
...@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, ...@@ -656,6 +660,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
mirror_num, 0); mirror_num, 0);
} }
/* /*
* kthread helpers are used to submit writes so that checksumming * kthread helpers are used to submit writes so that checksumming
* can happen in parallel across all CPUs * can happen in parallel across all CPUs
...@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device, ...@@ -2095,10 +2100,10 @@ static int write_dev_supers(struct btrfs_device *device,
device->barriers = 0; device->barriers = 0;
get_bh(bh); get_bh(bh);
lock_buffer(bh); lock_buffer(bh);
ret = submit_bh(WRITE, bh); ret = submit_bh(WRITE_SYNC, bh);
} }
} else { } else {
ret = submit_bh(WRITE, bh); ret = submit_bh(WRITE_SYNC, bh);
} }
if (!ret && wait) { if (!ret && wait) {
......
...@@ -50,7 +50,10 @@ struct extent_page_data { ...@@ -50,7 +50,10 @@ struct extent_page_data {
/* tells writepage not to lock the state bits for this range /* tells writepage not to lock the state bits for this range
* it still does the unlocking * it still does the unlocking
*/ */
int extent_locked; unsigned int extent_locked:1;
/* tells the submit_bio code to use a WRITE_SYNC */
unsigned int sync_io:1;
}; };
int __init extent_io_init(void) int __init extent_io_init(void)
...@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, ...@@ -2101,6 +2104,16 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
return ret; return ret;
} }
/*
 * account for nr_written pages in the writeback control and, when
 * appropriate, advance the mapping's writeback index past them.
 *
 * wbc->nr_to_write is always reduced by nr_written.  The mapping's
 * writeback_index is set to page->index + nr_written only when the
 * writeback is range_cyclic, or when there is still budget left
 * (nr_to_write > 0) and the range covers the whole file
 * (range_start == 0 and range_end == LLONG_MAX).
 *
 * This dereferences page->mapping; the callers in this file invoke it
 * before unlocking the page.
 */
static noinline void update_nr_written(struct page *page,
				       struct writeback_control *wbc,
				       unsigned long nr_written)
{
	wbc->nr_to_write -= nr_written;

	if (!wbc->range_cyclic) {
		/* non-cyclic: only a whole-file pass with budget left moves the index */
		if (wbc->nr_to_write <= 0)
			return;
		if (wbc->range_start != 0 || wbc->range_end != LLONG_MAX)
			return;
	}

	page->mapping->writeback_index = page->index + nr_written;
}
/* /*
* the writepage semantics are similar to regular writepage. extent * the writepage semantics are similar to regular writepage. extent
* records are inserted to lock ranges in the tree, and as dirty areas * records are inserted to lock ranges in the tree, and as dirty areas
...@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2136,8 +2149,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
u64 delalloc_end; u64 delalloc_end;
int page_started; int page_started;
int compressed; int compressed;
int write_flags;
unsigned long nr_written = 0; unsigned long nr_written = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
write_flags = WRITE_SYNC_PLUG;
else
write_flags = WRITE;
WARN_ON(!PageLocked(page)); WARN_ON(!PageLocked(page));
pg_offset = i_size & (PAGE_CACHE_SIZE - 1); pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
if (page->index > end_index || if (page->index > end_index ||
...@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2164,6 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
delalloc_end = 0; delalloc_end = 0;
page_started = 0; page_started = 0;
if (!epd->extent_locked) { if (!epd->extent_locked) {
/*
* make sure the wbc mapping index is at least updated
* to this page.
*/
update_nr_written(page, wbc, 0);
while (delalloc_end < page_end) { while (delalloc_end < page_end) {
nr_delalloc = find_lock_delalloc_range(inode, tree, nr_delalloc = find_lock_delalloc_range(inode, tree,
page, page,
...@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2185,7 +2210,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
*/ */
if (page_started) { if (page_started) {
ret = 0; ret = 0;
goto update_nr_written; /*
* we've unlocked the page, so we can't update
* the mapping's writeback index, just update
* nr_to_write.
*/
wbc->nr_to_write -= nr_written;
goto done_unlocked;
} }
} }
lock_extent(tree, start, page_end, GFP_NOFS); lock_extent(tree, start, page_end, GFP_NOFS);
...@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2198,13 +2229,18 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
if (ret == -EAGAIN) { if (ret == -EAGAIN) {
unlock_extent(tree, start, page_end, GFP_NOFS); unlock_extent(tree, start, page_end, GFP_NOFS);
redirty_page_for_writepage(wbc, page); redirty_page_for_writepage(wbc, page);
update_nr_written(page, wbc, nr_written);
unlock_page(page); unlock_page(page);
ret = 0; ret = 0;
goto update_nr_written; goto done_unlocked;
} }
} }
nr_written++; /*
* we don't want to touch the inode after unlocking the page,
* so we update the mapping writeback index now
*/
update_nr_written(page, wbc, nr_written + 1);
end = page_end; end = page_end;
if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
...@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2314,9 +2350,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
(unsigned long long)end); (unsigned long long)end);
} }
ret = submit_extent_page(WRITE, tree, page, sector, ret = submit_extent_page(write_flags, tree, page,
iosize, pg_offset, bdev, sector, iosize, pg_offset,
&epd->bio, max_nr, bdev, &epd->bio, max_nr,
end_bio_extent_writepage, end_bio_extent_writepage,
0, 0, 0); 0, 0, 0);
if (ret) if (ret)
...@@ -2336,11 +2372,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ...@@ -2336,11 +2372,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
unlock_extent(tree, unlock_start, page_end, GFP_NOFS); unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
unlock_page(page); unlock_page(page);
update_nr_written: done_unlocked:
wbc->nr_to_write -= nr_written;
if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
page->mapping->writeback_index = page->index + nr_written;
return 0; return 0;
} }
...@@ -2460,15 +2493,23 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, ...@@ -2460,15 +2493,23 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
return ret; return ret;
} }
static noinline void flush_write_bio(void *data) static void flush_epd_write_bio(struct extent_page_data *epd)
{ {
struct extent_page_data *epd = data;
if (epd->bio) { if (epd->bio) {
if (epd->sync_io)
submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
else
submit_one_bio(WRITE, epd->bio, 0, 0); submit_one_bio(WRITE, epd->bio, 0, 0);
epd->bio = NULL; epd->bio = NULL;
} }
} }
static noinline void flush_write_bio(void *data)
{
struct extent_page_data *epd = data;
flush_epd_write_bio(epd);
}
int extent_write_full_page(struct extent_io_tree *tree, struct page *page, int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
get_extent_t *get_extent, get_extent_t *get_extent,
struct writeback_control *wbc) struct writeback_control *wbc)
...@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, ...@@ -2480,23 +2521,22 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
.tree = tree, .tree = tree,
.get_extent = get_extent, .get_extent = get_extent,
.extent_locked = 0, .extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
}; };
struct writeback_control wbc_writepages = { struct writeback_control wbc_writepages = {
.bdi = wbc->bdi, .bdi = wbc->bdi,
.sync_mode = WB_SYNC_NONE, .sync_mode = wbc->sync_mode,
.older_than_this = NULL, .older_than_this = NULL,
.nr_to_write = 64, .nr_to_write = 64,
.range_start = page_offset(page) + PAGE_CACHE_SIZE, .range_start = page_offset(page) + PAGE_CACHE_SIZE,
.range_end = (loff_t)-1, .range_end = (loff_t)-1,
}; };
ret = __extent_writepage(page, wbc, &epd); ret = __extent_writepage(page, wbc, &epd);
extent_write_cache_pages(tree, mapping, &wbc_writepages, extent_write_cache_pages(tree, mapping, &wbc_writepages,
__extent_writepage, &epd, flush_write_bio); __extent_writepage, &epd, flush_write_bio);
if (epd.bio) flush_epd_write_bio(&epd);
submit_one_bio(WRITE, epd.bio, 0, 0);
return ret; return ret;
} }
...@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, ...@@ -2515,6 +2555,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
.tree = tree, .tree = tree,
.get_extent = get_extent, .get_extent = get_extent,
.extent_locked = 1, .extent_locked = 1,
.sync_io = mode == WB_SYNC_ALL,
}; };
struct writeback_control wbc_writepages = { struct writeback_control wbc_writepages = {
.bdi = inode->i_mapping->backing_dev_info, .bdi = inode->i_mapping->backing_dev_info,
...@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, ...@@ -2540,8 +2581,7 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
start += PAGE_CACHE_SIZE; start += PAGE_CACHE_SIZE;
} }
if (epd.bio) flush_epd_write_bio(&epd);
submit_one_bio(WRITE, epd.bio, 0, 0);
return ret; return ret;
} }
...@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree, ...@@ -2556,13 +2596,13 @@ int extent_writepages(struct extent_io_tree *tree,
.tree = tree, .tree = tree,
.get_extent = get_extent, .get_extent = get_extent,
.extent_locked = 0, .extent_locked = 0,
.sync_io = wbc->sync_mode == WB_SYNC_ALL,
}; };
ret = extent_write_cache_pages(tree, mapping, wbc, ret = extent_write_cache_pages(tree, mapping, wbc,
__extent_writepage, &epd, __extent_writepage, &epd,
flush_write_bio); flush_write_bio);
if (epd.bio) flush_epd_write_bio(&epd);
submit_one_bio(WRITE, epd.bio, 0, 0);
return ret; return ret;
} }
......
...@@ -830,7 +830,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ...@@ -830,7 +830,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path, del_slot, del_nr); ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
BUG_ON(ret); BUG_ON(ret);
goto done; goto release;
} else if (split == start) { } else if (split == start) {
if (locked_end < extent_end) { if (locked_end < extent_end) {
ret = try_lock_extent(&BTRFS_I(inode)->io_tree, ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
...@@ -926,6 +926,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, ...@@ -926,6 +926,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
} }
done: done:
btrfs_mark_buffer_dirty(leaf); btrfs_mark_buffer_dirty(leaf);
release:
btrfs_release_path(root, path); btrfs_release_path(root, path);
if (split_end && split == start) { if (split_end && split == start) {
split = end; split = end;
...@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ...@@ -1131,7 +1133,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
if (will_write) { if (will_write) {
btrfs_fdatawrite_range(inode->i_mapping, pos, btrfs_fdatawrite_range(inode->i_mapping, pos,
pos + write_bytes - 1, pos + write_bytes - 1,
WB_SYNC_NONE); WB_SYNC_ALL);
} else { } else {
balance_dirty_pages_ratelimited_nr(inode->i_mapping, balance_dirty_pages_ratelimited_nr(inode->i_mapping,
num_pages); num_pages);
......
...@@ -4970,10 +4970,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, ...@@ -4970,10 +4970,10 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
return err; return err;
} }
static int prealloc_file_range(struct inode *inode, u64 start, u64 end, static int prealloc_file_range(struct btrfs_trans_handle *trans,
struct inode *inode, u64 start, u64 end,
u64 alloc_hint, int mode) u64 alloc_hint, int mode)
{ {
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins; struct btrfs_key ins;
u64 alloc_size; u64 alloc_size;
...@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ...@@ -4981,10 +4981,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
u64 num_bytes = end - start; u64 num_bytes = end - start;
int ret = 0; int ret = 0;
trans = btrfs_join_transaction(root, 1);
BUG_ON(!trans);
btrfs_set_trans_block_group(trans, inode);
while (num_bytes > 0) { while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent); alloc_size = min(num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_extent(trans, root, alloc_size, ret = btrfs_reserve_extent(trans, root, alloc_size,
...@@ -5015,7 +5011,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end, ...@@ -5015,7 +5011,6 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
BUG_ON(ret); BUG_ON(ret);
} }
btrfs_end_transaction(trans, root);
return ret; return ret;
} }
...@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode, ...@@ -5029,11 +5024,18 @@ static long btrfs_fallocate(struct inode *inode, int mode,
u64 alloc_hint = 0; u64 alloc_hint = 0;
u64 mask = BTRFS_I(inode)->root->sectorsize - 1; u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
struct extent_map *em; struct extent_map *em;
struct btrfs_trans_handle *trans;
int ret; int ret;
alloc_start = offset & ~mask; alloc_start = offset & ~mask;
alloc_end = (offset + len + mask) & ~mask; alloc_end = (offset + len + mask) & ~mask;
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
*/
btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
if (alloc_start > inode->i_size) { if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, alloc_start); ret = btrfs_cont_expand(inode, alloc_start);
...@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode, ...@@ -5043,6 +5045,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
while (1) { while (1) {
struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *ordered;
trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
if (!trans) {
ret = -EIO;
goto out;
}
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, lock_extent(&BTRFS_I(inode)->io_tree, alloc_start,
alloc_end - 1, GFP_NOFS); alloc_end - 1, GFP_NOFS);
ordered = btrfs_lookup_first_ordered_extent(inode, ordered = btrfs_lookup_first_ordered_extent(inode,
...@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, ...@@ -5053,6 +5065,12 @@ static long btrfs_fallocate(struct inode *inode, int mode,
btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered);
unlock_extent(&BTRFS_I(inode)->io_tree, unlock_extent(&BTRFS_I(inode)->io_tree,
alloc_start, alloc_end - 1, GFP_NOFS); alloc_start, alloc_end - 1, GFP_NOFS);
btrfs_end_transaction(trans, BTRFS_I(inode)->root);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
btrfs_wait_ordered_range(inode, alloc_start, btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start); alloc_end - alloc_start);
} else { } else {
...@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, ...@@ -5070,7 +5088,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
last_byte = min(extent_map_end(em), alloc_end); last_byte = min(extent_map_end(em), alloc_end);
last_byte = (last_byte + mask) & ~mask; last_byte = (last_byte + mask) & ~mask;
if (em->block_start == EXTENT_MAP_HOLE) { if (em->block_start == EXTENT_MAP_HOLE) {
ret = prealloc_file_range(inode, cur_offset, ret = prealloc_file_range(trans, inode, cur_offset,
last_byte, alloc_hint, mode); last_byte, alloc_hint, mode);
if (ret < 0) { if (ret < 0) {
free_extent_map(em); free_extent_map(em);
...@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, ...@@ -5089,6 +5107,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
} }
unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1,
GFP_NOFS); GFP_NOFS);
btrfs_end_transaction(trans, BTRFS_I(inode)->root);
out: out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
return ret; return ret;
......
...@@ -489,7 +489,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) ...@@ -489,7 +489,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
/* start IO across the range first to instantiate any delalloc /* start IO across the range first to instantiate any delalloc
* extents * extents
*/ */
btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL);
/* The compression code will leave pages locked but return from /* The compression code will leave pages locked but return from
* writepage without setting the page writeback. Starting again * writepage without setting the page writeback. Starting again
......
...@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) ...@@ -125,6 +125,20 @@ static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
return NULL; return NULL;
} }
/*
 * put the bio chain running from head to tail back at the front of
 * pending_bios.
 *
 * head always becomes the new list head.  If the list already has a
 * tail, the chain is linked in front of the previous head by pointing
 * tail->bi_next at it; otherwise tail becomes the list's tail.
 */
static void requeue_list(struct btrfs_pending_bios *pending_bios,
			 struct bio *head, struct bio *tail)
{
	struct bio *prev_head = pending_bios->head;

	pending_bios->head = head;

	if (!pending_bios->tail) {
		/* no tail recorded on the list: the requeued chain supplies it */
		pending_bios->tail = tail;
		return;
	}

	/* existing entries stay queued behind the requeued chain */
	tail->bi_next = prev_head;
}
/* /*
* we try to collect pending bios for a device so we don't get a large * we try to collect pending bios for a device so we don't get a large
* number of procs sending bios down to the same device. This greatly * number of procs sending bios down to the same device. This greatly
...@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -141,10 +155,12 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
struct bio *pending; struct bio *pending;
struct backing_dev_info *bdi; struct backing_dev_info *bdi;
struct btrfs_fs_info *fs_info; struct btrfs_fs_info *fs_info;
struct btrfs_pending_bios *pending_bios;
struct bio *tail; struct bio *tail;
struct bio *cur; struct bio *cur;
int again = 0; int again = 0;
unsigned long num_run = 0; unsigned long num_run;
unsigned long num_sync_run;
unsigned long limit; unsigned long limit;
unsigned long last_waited = 0; unsigned long last_waited = 0;
...@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -153,20 +169,30 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
limit = btrfs_async_submit_limit(fs_info); limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3; limit = limit * 2 / 3;
/* we want to make sure that every time we switch from the sync
* list to the normal list, we unplug
*/
num_sync_run = 0;
loop: loop:
spin_lock(&device->io_lock); spin_lock(&device->io_lock);
num_run = 0;
loop_lock: loop_lock:
/* take all the bios off the list at once and process them /* take all the bios off the list at once and process them
* later on (without the lock held). But, remember the * later on (without the lock held). But, remember the
* tail and other pointers so the bios can be properly reinserted * tail and other pointers so the bios can be properly reinserted
* into the list if we hit congestion * into the list if we hit congestion
*/ */
pending = device->pending_bios; if (device->pending_sync_bios.head)
tail = device->pending_bio_tail; pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
pending = pending_bios->head;
tail = pending_bios->tail;
WARN_ON(pending && !tail); WARN_ON(pending && !tail);
device->pending_bios = NULL;
device->pending_bio_tail = NULL;
/* /*
* if pending was null this time around, no bios need processing * if pending was null this time around, no bios need processing
...@@ -176,16 +202,41 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -176,16 +202,41 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
* device->running_pending is used to synchronize with the * device->running_pending is used to synchronize with the
* schedule_bio code. * schedule_bio code.
*/ */
if (pending) { if (device->pending_sync_bios.head == NULL &&
again = 1; device->pending_bios.head == NULL) {
device->running_pending = 1;
} else {
again = 0; again = 0;
device->running_pending = 0; device->running_pending = 0;
} else {
again = 1;
device->running_pending = 1;
} }
pending_bios->head = NULL;
pending_bios->tail = NULL;
spin_unlock(&device->io_lock); spin_unlock(&device->io_lock);
/*
* if we're doing the regular priority list, make sure we unplug
* for any high prio bios we've sent down
*/
if (pending_bios == &device->pending_bios && num_sync_run > 0) {
num_sync_run = 0;
blk_run_backing_dev(bdi, NULL);
}
while (pending) { while (pending) {
rmb();
if (pending_bios != &device->pending_sync_bios &&
device->pending_sync_bios.head &&
num_run > 16) {
cond_resched();
spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
goto loop_lock;
}
cur = pending; cur = pending;
pending = pending->bi_next; pending = pending->bi_next;
cur->bi_next = NULL; cur->bi_next = NULL;
...@@ -196,10 +247,18 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -196,10 +247,18 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
wake_up(&fs_info->async_submit_wait); wake_up(&fs_info->async_submit_wait);
BUG_ON(atomic_read(&cur->bi_cnt) == 0); BUG_ON(atomic_read(&cur->bi_cnt) == 0);
bio_get(cur);
submit_bio(cur->bi_rw, cur); submit_bio(cur->bi_rw, cur);
bio_put(cur);
num_run++; num_run++;
if (bio_sync(cur))
num_sync_run++;
if (need_resched()) {
if (num_sync_run) {
blk_run_backing_dev(bdi, NULL);
num_sync_run = 0;
}
cond_resched();
}
/* /*
* we made progress, there is more work to do and the bdi * we made progress, there is more work to do and the bdi
...@@ -208,7 +267,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -208,7 +267,6 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
*/ */
if (pending && bdi_write_congested(bdi) && num_run > 16 && if (pending && bdi_write_congested(bdi) && num_run > 16 &&
fs_info->fs_devices->open_devices > 1) { fs_info->fs_devices->open_devices > 1) {
struct bio *old_head;
struct io_context *ioc; struct io_context *ioc;
ioc = current->io_context; ioc = current->io_context;
...@@ -233,17 +291,17 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -233,17 +291,17 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
* against it before looping * against it before looping
*/ */
last_waited = ioc->last_waited; last_waited = ioc->last_waited;
if (need_resched()) {
if (num_sync_run) {
blk_run_backing_dev(bdi, NULL);
num_sync_run = 0;
}
cond_resched();
}
continue; continue;
} }
spin_lock(&device->io_lock); spin_lock(&device->io_lock);
requeue_list(pending_bios, pending, tail);
old_head = device->pending_bios;
device->pending_bios = pending;
if (device->pending_bio_tail)
tail->bi_next = old_head;
else
device->pending_bio_tail = tail;
device->running_pending = 1; device->running_pending = 1;
spin_unlock(&device->io_lock); spin_unlock(&device->io_lock);
...@@ -251,11 +309,18 @@ static noinline int run_scheduled_bios(struct btrfs_device *device) ...@@ -251,11 +309,18 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
goto done; goto done;
} }
} }
if (num_sync_run) {
num_sync_run = 0;
blk_run_backing_dev(bdi, NULL);
}
cond_resched();
if (again) if (again)
goto loop; goto loop;
spin_lock(&device->io_lock); spin_lock(&device->io_lock);
if (device->pending_bios) if (device->pending_bios.head || device->pending_sync_bios.head)
goto loop_lock; goto loop_lock;
spin_unlock(&device->io_lock); spin_unlock(&device->io_lock);
...@@ -2497,7 +2562,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, ...@@ -2497,7 +2562,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
max_errors = 1; max_errors = 1;
} }
} }
if (multi_ret && rw == WRITE && if (multi_ret && (rw & (1 << BIO_RW)) &&
stripes_allocated < stripes_required) { stripes_allocated < stripes_required) {
stripes_allocated = map->num_stripes; stripes_allocated = map->num_stripes;
free_extent_map(em); free_extent_map(em);
...@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root, ...@@ -2762,6 +2827,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
int rw, struct bio *bio) int rw, struct bio *bio)
{ {
int should_queue = 1; int should_queue = 1;
struct btrfs_pending_bios *pending_bios;
/* don't bother with additional async steps for reads, right now */ /* don't bother with additional async steps for reads, right now */
if (!(rw & (1 << BIO_RW))) { if (!(rw & (1 << BIO_RW))) {
...@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root, ...@@ -2783,13 +2849,17 @@ static noinline int schedule_bio(struct btrfs_root *root,
bio->bi_rw |= rw; bio->bi_rw |= rw;
spin_lock(&device->io_lock); spin_lock(&device->io_lock);
if (bio_sync(bio))
pending_bios = &device->pending_sync_bios;
else
pending_bios = &device->pending_bios;
if (device->pending_bio_tail) if (pending_bios->tail)
device->pending_bio_tail->bi_next = bio; pending_bios->tail->bi_next = bio;
device->pending_bio_tail = bio; pending_bios->tail = bio;
if (!device->pending_bios) if (!pending_bios->head)
device->pending_bios = bio; pending_bios->head = bio;
if (device->running_pending) if (device->running_pending)
should_queue = 0; should_queue = 0;
......
...@@ -23,13 +23,22 @@ ...@@ -23,13 +23,22 @@
#include "async-thread.h" #include "async-thread.h"
struct buffer_head; struct buffer_head;
/*
 * A queue of bios waiting to be submitted to a device, kept as a singly
 * linked chain through bio->bi_next.  New bios are appended at the tail
 * and the chain is consumed from the head.  Both pointers are NULL when
 * the queue is empty.
 */
struct btrfs_pending_bios {
/* first bio in the chain, NULL when nothing is queued */
struct bio *head;
/* last bio in the chain; newly queued bios are linked after it */
struct bio *tail;
};
struct btrfs_device { struct btrfs_device {
struct list_head dev_list; struct list_head dev_list;
struct list_head dev_alloc_list; struct list_head dev_alloc_list;
struct btrfs_fs_devices *fs_devices; struct btrfs_fs_devices *fs_devices;
struct btrfs_root *dev_root; struct btrfs_root *dev_root;
struct bio *pending_bios;
struct bio *pending_bio_tail; /* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* WRITE_SYNC bios */
struct btrfs_pending_bios pending_sync_bios;
int running_pending; int running_pending;
u64 generation; u64 generation;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment