Commit e288e931 authored by Linus Torvalds

Merge branch 'bcache' (bcache fixes from Kent Overstreet)

Merge bcache fixes from Kent Overstreet:
 "There's fixes for _three_ different data corruption bugs, all of which
  were found by users hitting them in the wild.

  The first one isn't bcache specific - in 3.11 bcache was switched to
  the bio_copy_data in fs/bio.c, and that's when the bug in that code
  was discovered, but it's also used by raid1 and pktcdvd.  (That was my
  code too, so the bug's doubly embarrassing given that it was or
  should've been just a cut and paste from bcache code.  Dunno what
  happened there).

  Most of these (all the non data corruption bugs, actually) were ready
  before the merge window and have been sitting in Jens' tree, but I
  don't know what's been up with him lately..."

* emailed patches from Kent Overstreet <kmo@daterainc.com>:
  bcache: Fix flushes in writeback mode
  bcache: Fix for handling overlapping extents when reading in a btree node
  bcache: Fix a shrinker deadlock
  bcache: Fix a dumb CPU spinning bug in writeback
  bcache: Fix a flush/fua performance bug
  bcache: Fix a writeback performance regression
  bcache: Correct printf()-style format length modifier
  bcache: Fix for when no journal entries are found
  bcache: Strip endline when writing the label through sysfs
  bcache: Fix a dumb journal discard bug
  block: Fix bio_copy_data()
parents db6aaf4d c0f04d88
...@@ -498,7 +498,7 @@ struct cached_dev { ...@@ -498,7 +498,7 @@ struct cached_dev {
*/ */
atomic_t has_dirty; atomic_t has_dirty;
struct ratelimit writeback_rate; struct bch_ratelimit writeback_rate;
struct delayed_work writeback_rate_update; struct delayed_work writeback_rate_update;
/* /*
...@@ -507,10 +507,9 @@ struct cached_dev { ...@@ -507,10 +507,9 @@ struct cached_dev {
*/ */
sector_t last_read; sector_t last_read;
/* Number of writeback bios in flight */ /* Limit number of writeback bios in flight */
atomic_t in_flight; struct semaphore in_flight;
struct closure_with_timer writeback; struct closure_with_timer writeback;
struct closure_waitlist writeback_wait;
struct keybuf writeback_keys; struct keybuf writeback_keys;
......
...@@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) ...@@ -926,28 +926,45 @@ struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search)
/* Mergesort */ /* Mergesort */
/*
 * Step iterator set @i forward by one key (via bkey_next()). If that
 * reaches i->end, overwrite @i with the last entry of iter->data and
 * decrement iter->used, so the exhausted set no longer occupies a slot
 * in the iterator.
 */
static void sort_key_next(struct btree_iter *iter,
struct btree_iter_set *i)
{
i->k = bkey_next(i->k);
/* Set exhausted: replace it with the last set and shrink the count. */
if (i->k == i->end)
*i = iter->data[--iter->used];
}
static void btree_sort_fixup(struct btree_iter *iter) static void btree_sort_fixup(struct btree_iter *iter)
{ {
while (iter->used > 1) { while (iter->used > 1) {
struct btree_iter_set *top = iter->data, *i = top + 1; struct btree_iter_set *top = iter->data, *i = top + 1;
struct bkey *k;
if (iter->used > 2 && if (iter->used > 2 &&
btree_iter_cmp(i[0], i[1])) btree_iter_cmp(i[0], i[1]))
i++; i++;
for (k = i->k; if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0;
k = bkey_next(k))
if (top->k > i->k)
__bch_cut_front(top->k, k);
else if (KEY_SIZE(k))
bch_cut_back(&START_KEY(k), top->k);
if (top->k < i->k || k == i->k)
break; break;
if (!KEY_SIZE(i->k)) {
sort_key_next(iter, i);
heap_sift(iter, i - top, btree_iter_cmp); heap_sift(iter, i - top, btree_iter_cmp);
continue;
}
if (top->k > i->k) {
if (bkey_cmp(top->k, i->k) >= 0)
sort_key_next(iter, i);
else
bch_cut_front(top->k, i->k);
heap_sift(iter, i - top, btree_iter_cmp);
} else {
/* can't happen because of comparison func */
BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));
bch_cut_back(&START_KEY(i->k), top->k);
}
} }
} }
......
...@@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b) ...@@ -255,7 +255,7 @@ void bch_btree_node_read(struct btree *b)
return; return;
err: err:
bch_cache_set_error(b->c, "io error reading bucket %lu", bch_cache_set_error(b->c, "io error reading bucket %zu",
PTR_BUCKET_NR(b->c, &b->key, 0)); PTR_BUCKET_NR(b->c, &b->key, 0));
} }
...@@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink, ...@@ -612,7 +612,7 @@ static unsigned long bch_mca_scan(struct shrinker *shrink,
return SHRINK_STOP; return SHRINK_STOP;
/* Return -1 if we can't do anything right now */ /* Return -1 if we can't do anything right now */
if (sc->gfp_mask & __GFP_WAIT) if (sc->gfp_mask & __GFP_IO)
mutex_lock(&c->bucket_lock); mutex_lock(&c->bucket_lock);
else if (!mutex_trylock(&c->bucket_lock)) else if (!mutex_trylock(&c->bucket_lock))
return -1; return -1;
......
...@@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, ...@@ -153,7 +153,8 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); bitmap_zero(bitmap, SB_JOURNAL_BUCKETS);
pr_debug("%u journal buckets", ca->sb.njournal_buckets); pr_debug("%u journal buckets", ca->sb.njournal_buckets);
/* Read journal buckets ordered by golden ratio hash to quickly /*
* Read journal buckets ordered by golden ratio hash to quickly
* find a sequence of buckets with valid journal entries * find a sequence of buckets with valid journal entries
*/ */
for (i = 0; i < ca->sb.njournal_buckets; i++) { for (i = 0; i < ca->sb.njournal_buckets; i++) {
...@@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, ...@@ -166,18 +167,20 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
goto bsearch; goto bsearch;
} }
/* If that fails, check all the buckets we haven't checked /*
* If that fails, check all the buckets we haven't checked
* already * already
*/ */
pr_debug("falling back to linear search"); pr_debug("falling back to linear search");
for (l = 0; l < ca->sb.njournal_buckets; l++) { for (l = find_first_zero_bit(bitmap, ca->sb.njournal_buckets);
if (test_bit(l, bitmap)) l < ca->sb.njournal_buckets;
continue; l = find_next_zero_bit(bitmap, ca->sb.njournal_buckets, l + 1))
if (read_bucket(l)) if (read_bucket(l))
goto bsearch; goto bsearch;
}
if (list_empty(list))
continue;
bsearch: bsearch:
/* Binary search */ /* Binary search */
m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
...@@ -197,10 +200,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, ...@@ -197,10 +200,12 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
r = m; r = m;
} }
/* Read buckets in reverse order until we stop finding more /*
* Read buckets in reverse order until we stop finding more
* journal entries * journal entries
*/ */
pr_debug("finishing up"); pr_debug("finishing up: m %u njournal_buckets %u",
m, ca->sb.njournal_buckets);
l = m; l = m;
while (1) { while (1) {
...@@ -228,6 +233,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list, ...@@ -228,6 +233,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list,
} }
} }
if (!list_empty(list))
c->journal.seq = list_entry(list->prev, c->journal.seq = list_entry(list->prev,
struct journal_replay, struct journal_replay,
list)->j.seq; list)->j.seq;
...@@ -428,7 +434,7 @@ static void do_journal_discard(struct cache *ca) ...@@ -428,7 +434,7 @@ static void do_journal_discard(struct cache *ca)
return; return;
} }
switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { switch (atomic_read(&ja->discard_in_flight)) {
case DISCARD_IN_FLIGHT: case DISCARD_IN_FLIGHT:
return; return;
...@@ -689,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl) ...@@ -689,6 +695,7 @@ void bch_journal_meta(struct cache_set *c, struct closure *cl)
if (cl) if (cl)
BUG_ON(!closure_wait(&w->wait, cl)); BUG_ON(!closure_wait(&w->wait, cl));
closure_flush(&c->journal.io);
__journal_try_write(c, true); __journal_try_write(c, true);
} }
} }
......
...@@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s) ...@@ -997,14 +997,17 @@ static void request_write(struct cached_dev *dc, struct search *s)
} else { } else {
bch_writeback_add(dc); bch_writeback_add(dc);
if (s->op.flush_journal) { if (bio->bi_rw & REQ_FLUSH) {
/* Also need to send a flush to the backing device */ /* Also need to send a flush to the backing device */
s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, struct bio *flush = bio_alloc_bioset(0, GFP_NOIO,
dc->disk.bio_split); dc->disk.bio_split);
bio->bi_size = 0; flush->bi_rw = WRITE_FLUSH;
bio->bi_vcnt = 0; flush->bi_bdev = bio->bi_bdev;
closure_bio_submit(bio, cl, s->d); flush->bi_end_io = request_endio;
flush->bi_private = cl;
closure_bio_submit(flush, cl, s->d);
} else { } else {
s->op.cache_bio = bio; s->op.cache_bio = bio;
} }
......
...@@ -223,8 +223,13 @@ STORE(__cached_dev) ...@@ -223,8 +223,13 @@ STORE(__cached_dev)
} }
if (attr == &sysfs_label) { if (attr == &sysfs_label) {
/* note: endlines are preserved */ if (size > SB_LABEL_SIZE)
memcpy(dc->sb.label, buf, SB_LABEL_SIZE); return -EINVAL;
memcpy(dc->sb.label, buf, size);
if (size < SB_LABEL_SIZE)
dc->sb.label[size] = '\0';
if (size && dc->sb.label[size - 1] == '\n')
dc->sb.label[size - 1] = '\0';
bch_write_bdev_super(dc, NULL); bch_write_bdev_super(dc, NULL);
if (dc->disk.c) { if (dc->disk.c) {
memcpy(dc->disk.c->uuids[dc->disk.id].label, memcpy(dc->disk.c->uuids[dc->disk.id].label,
......
...@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time) ...@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
stats->last = now ?: 1; stats->last = now ?: 1;
} }
unsigned bch_next_delay(struct ratelimit *d, uint64_t done) /**
* bch_next_delay() - increment @d by the amount of work done, and return how
* long to delay until the next time to do some work.
*
* @d - the struct bch_ratelimit to update
* @done - the amount of work done, in arbitrary units
*
* Returns the amount of time to delay by, in jiffies
*/
uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{ {
uint64_t now = local_clock(); uint64_t now = local_clock();
......
...@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units) ...@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
(ewma) >> factor; \ (ewma) >> factor; \
}) })
struct ratelimit { struct bch_ratelimit {
/* Next time we want to do some work, in nanoseconds */
uint64_t next; uint64_t next;
/*
* Rate at which we want to do work, in units per nanosecond
* The units here correspond to the units passed to bch_next_delay()
*/
unsigned rate; unsigned rate;
}; };
static inline void ratelimit_reset(struct ratelimit *d) static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{ {
d->next = local_clock(); d->next = local_clock();
} }
unsigned bch_next_delay(struct ratelimit *d, uint64_t done); uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);
#define __DIV_SAFE(n, d, zero) \ #define __DIV_SAFE(n, d, zero) \
({ \ ({ \
......
...@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work) ...@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{ {
uint64_t ret;
if (atomic_read(&dc->disk.detaching) || if (atomic_read(&dc->disk.detaching) ||
!dc->writeback_percent) !dc->writeback_percent)
return 0; return 0;
return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL); ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
return min_t(uint64_t, ret, HZ);
} }
/* Background writeback */ /* Background writeback */
...@@ -208,7 +212,7 @@ static void refill_dirty(struct closure *cl) ...@@ -208,7 +212,7 @@ static void refill_dirty(struct closure *cl)
up_write(&dc->writeback_lock); up_write(&dc->writeback_lock);
ratelimit_reset(&dc->writeback_rate); bch_ratelimit_reset(&dc->writeback_rate);
/* Punt to workqueue only so we don't recurse and blow the stack */ /* Punt to workqueue only so we don't recurse and blow the stack */
continue_at(cl, read_dirty, dirty_wq); continue_at(cl, read_dirty, dirty_wq);
...@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl) ...@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
} }
bch_keybuf_del(&dc->writeback_keys, w); bch_keybuf_del(&dc->writeback_keys, w);
atomic_dec_bug(&dc->in_flight); up(&dc->in_flight);
closure_wake_up(&dc->writeback_wait);
closure_return_with_destructor(cl, dirty_io_destructor); closure_return_with_destructor(cl, dirty_io_destructor);
} }
...@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl) ...@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl, &io->dc->disk); closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty_finish, dirty_wq); continue_at(cl, write_dirty_finish, system_wq);
} }
static void read_dirty_endio(struct bio *bio, int error) static void read_dirty_endio(struct bio *bio, int error)
...@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl) ...@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)
closure_bio_submit(&io->bio, cl, &io->dc->disk); closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty, dirty_wq); continue_at(cl, write_dirty, system_wq);
} }
static void read_dirty(struct closure *cl) static void read_dirty(struct closure *cl)
...@@ -394,12 +396,8 @@ static void read_dirty(struct closure *cl) ...@@ -394,12 +396,8 @@ static void read_dirty(struct closure *cl)
if (delay > 0 && if (delay > 0 &&
(KEY_START(&w->key) != dc->last_read || (KEY_START(&w->key) != dc->last_read ||
jiffies_to_msecs(delay) > 50)) { jiffies_to_msecs(delay) > 50))
w->private = NULL; delay = schedule_timeout_uninterruptible(delay);
closure_delay(&dc->writeback, delay);
continue_at(cl, read_dirty, dirty_wq);
}
dc->last_read = KEY_OFFSET(&w->key); dc->last_read = KEY_OFFSET(&w->key);
...@@ -424,15 +422,10 @@ static void read_dirty(struct closure *cl) ...@@ -424,15 +422,10 @@ static void read_dirty(struct closure *cl)
trace_bcache_writeback(&w->key); trace_bcache_writeback(&w->key);
closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); down(&dc->in_flight);
closure_call(&io->cl, read_dirty_submit, NULL, cl);
delay = writeback_delay(dc, KEY_SIZE(&w->key)); delay = writeback_delay(dc, KEY_SIZE(&w->key));
atomic_inc(&dc->in_flight);
if (!closure_wait_event(&dc->writeback_wait, cl,
atomic_read(&dc->in_flight) < 64))
continue_at(cl, read_dirty, dirty_wq);
} }
if (0) { if (0) {
...@@ -442,7 +435,11 @@ static void read_dirty(struct closure *cl) ...@@ -442,7 +435,11 @@ static void read_dirty(struct closure *cl)
bch_keybuf_del(&dc->writeback_keys, w); bch_keybuf_del(&dc->writeback_keys, w);
} }
refill_dirty(cl); /*
* Wait for outstanding writeback IOs to finish (and keybuf slots to be
* freed) before refilling again
*/
continue_at(cl, refill_dirty, dirty_wq);
} }
/* Init */ /* Init */
...@@ -484,6 +481,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc) ...@@ -484,6 +481,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)
void bch_cached_dev_writeback_init(struct cached_dev *dc) void bch_cached_dev_writeback_init(struct cached_dev *dc)
{ {
sema_init(&dc->in_flight, 64);
closure_init_unlocked(&dc->writeback); closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock); init_rwsem(&dc->writeback_lock);
...@@ -513,7 +511,7 @@ void bch_writeback_exit(void) ...@@ -513,7 +511,7 @@ void bch_writeback_exit(void)
int __init bch_writeback_init(void) int __init bch_writeback_init(void)
{ {
dirty_wq = create_singlethread_workqueue("bcache_writeback"); dirty_wq = create_workqueue("bcache_writeback");
if (!dirty_wq) if (!dirty_wq)
return -ENOMEM; return -ENOMEM;
......
...@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src) ...@@ -917,8 +917,8 @@ void bio_copy_data(struct bio *dst, struct bio *src)
src_p = kmap_atomic(src_bv->bv_page); src_p = kmap_atomic(src_bv->bv_page);
dst_p = kmap_atomic(dst_bv->bv_page); dst_p = kmap_atomic(dst_bv->bv_page);
memcpy(dst_p + dst_bv->bv_offset, memcpy(dst_p + dst_offset,
src_p + src_bv->bv_offset, src_p + src_offset,
bytes); bytes);
kunmap_atomic(dst_p); kunmap_atomic(dst_p);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment