Commit d0e3d023 authored by Jens Axboe's avatar Jens Axboe

Merge branch 'bcache-for-3.11' of git://evilpiepirate.org/~kent/linux-bcache into for-3.11/drivers

parents 5f0e5afa 8e51e414
......@@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't
have to manually attach:
make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
bcache-tools now ships udev rules, and bcache devices are known to the kernel
immediately. Without udev, you can manually register devices like this:
echo /dev/sdb > /sys/fs/bcache/register
echo /dev/sdc > /sys/fs/bcache/register
To register your bcache devices automatically, you could add something like
this to an init script:
Registering the backing device makes the bcache device show up in /dev; you can
now format it and use it as normal. But the first time using a new bcache
device, it'll be running in passthrough mode until you attach it to a cache.
See the section on attaching.
echo /dev/sd* > /sys/fs/bcache/register_quiet
The devices show up as:
It'll look for bcache superblocks and ignore everything that doesn't have one.
/dev/bcache<N>
Registering the backing device makes the bcache show up in /dev; you can now
format it and use it as normal. But the first time using a new bcache device,
it'll be running in passthrough mode until you attach it to a cache. See the
section on attaching.
As well as (with udev):
The devices show up at /dev/bcacheN, and can be controlled via sysfs from
/sys/block/bcacheN/bcache:
/dev/bcache/by-uuid/<uuid>
/dev/bcache/by-label/<label>
To get started:
mkfs.ext4 /dev/bcache0
mount /dev/bcache0 /mnt
You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache .
Cache devices are managed as sets; multiple caches per set isn't supported yet
but will allow for mirroring of metadata and dirty data in the future. Your new
cache set shows up as /sys/fs/bcache/<UUID>
......@@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing
device to a cache set is done thusly, with the UUID of the cache set in
/sys/fs/bcache:
echo <UUID> > /sys/block/bcache0/bcache/attach
echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
This only has to be done once. The next time you reboot, just reregister all
your bcache devices. If a backing device has data in a cache somewhere, the
/dev/bcache# device won't be created until the cache shows up - particularly
/dev/bcache<N> device won't be created until the cache shows up - particularly
important if you have writeback caching turned on.
If you're booting up and your cache device is gone and never coming back, you
......@@ -181,7 +185,7 @@ want for getting the best possible numbers when benchmarking.
In practice this isn't an issue because as soon as a write comes along it'll
cause the btree node to be split, and you need almost no write traffic for
this to not show up enough to be noticable (especially since bcache's btree
this to not show up enough to be noticeable (especially since bcache's btree
nodes are huge and index large regions of the device). But when you're
benchmarking, if you're trying to warm the cache by reading a bunch of data
and there's no other traffic - that can be a problem.
......@@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking.
SYSFS - BACKING DEVICE:
Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
(if attached) /sys/fs/bcache/<cset-uuid>/bdev*
attach
Echo the UUID of a cache set to this file to enable caching.
......@@ -222,7 +229,7 @@ running
it's in passthrough mode or caching).
sequential_cutoff
A sequential IO will bypass the cache once it passes this threshhold; the
A sequential IO will bypass the cache once it passes this threshold; the
most recent 128 IOs are tracked so sequential IO can be detected even when
it isn't all done at once.
......@@ -296,10 +303,12 @@ cache_miss_collisions
since the synchronization for cache misses was rewritten)
cache_readaheads
Count of times readahead occured.
Count of times readahead occurred.
SYSFS - CACHE SET:
Available at /sys/fs/bcache/<cset-uuid>
average_key_size
Average data per key in the btree.
......@@ -362,7 +371,7 @@ unregister
SYSFS - CACHE SET INTERNAL:
This directory also exposes timings for a number of internal operations, with
separate files for average duration, average frequency, last occurence and max
separate files for average duration, average frequency, last occurrence and max
duration: garbage collection, btree read, btree node sorts and btree splits.
active_journal_entries
......@@ -390,6 +399,8 @@ trigger_gc
SYSFS - CACHE DEVICE:
Available at /sys/block/<cdev>/bcache
block_size
Minimum granularity of writes - should match hardware sector size.
......@@ -417,7 +428,7 @@ freelist_percent
space.
io_errors
Number of errors that have occured, decayed by io_error_halflife.
Number of errors that have occurred, decayed by io_error_halflife.
metadata_written
Sum of all non data writes (btree writes and all other metadata).
......
......@@ -1621,7 +1621,7 @@ S: Maintained
F: drivers/net/hamradio/baycom*
BCACHE (BLOCK LAYER CACHE)
M: Kent Overstreet <koverstreet@google.com>
M: Kent Overstreet <kmo@daterainc.com>
L: linux-bcache@vger.kernel.org
W: http://bcache.evilpiepirate.org
S: Maintained:
......
......@@ -63,7 +63,9 @@
#include "bcache.h"
#include "btree.h"
#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>
#define MAX_IN_FLIGHT_DISCARDS 8U
......@@ -151,7 +153,7 @@ static void discard_finish(struct work_struct *w)
mutex_unlock(&ca->set->bucket_lock);
closure_wake_up(&ca->set->bucket_wait);
wake_up(&ca->set->alloc_wait);
wake_up_process(ca->alloc_thread);
closure_put(&ca->set->cl);
}
......@@ -350,38 +352,31 @@ static void invalidate_buckets(struct cache *ca)
break;
}
pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
fifo_used(&ca->free), ca->free.size,
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->unused), ca->unused.size);
trace_bcache_alloc_invalidate(ca);
}
#define allocator_wait(ca, cond) \
do { \
DEFINE_WAIT(__wait); \
\
while (1) { \
prepare_to_wait(&ca->set->alloc_wait, \
&__wait, TASK_INTERRUPTIBLE); \
set_current_state(TASK_INTERRUPTIBLE); \
if (cond) \
break; \
\
mutex_unlock(&(ca)->set->bucket_lock); \
if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \
finish_wait(&ca->set->alloc_wait, &__wait); \
closure_return(cl); \
closure_put(&ca->set->cl); \
return 0; \
} \
\
schedule(); \
mutex_lock(&(ca)->set->bucket_lock); \
} \
\
finish_wait(&ca->set->alloc_wait, &__wait); \
__set_current_state(TASK_RUNNING); \
} while (0)
void bch_allocator_thread(struct closure *cl)
static int bch_allocator_thread(void *arg)
{
struct cache *ca = container_of(cl, struct cache, alloc);
struct cache *ca = arg;
mutex_lock(&ca->set->bucket_lock);
......@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
{
long r = -1;
again:
wake_up(&ca->set->alloc_wait);
wake_up_process(ca->alloc_thread);
if (fifo_used(&ca->free) > ca->watermark[watermark] &&
fifo_pop(&ca->free, r)) {
......@@ -476,9 +471,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
return r;
}
pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
fifo_used(&ca->free_inc), fifo_used(&ca->unused));
trace_bcache_alloc_fail(ca);
if (cl) {
closure_wait(&ca->set->bucket_wait, cl);
......@@ -552,6 +545,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
/* Init */
int bch_cache_allocator_start(struct cache *ca)
{
ca->alloc_thread = kthread_create(bch_allocator_thread,
ca, "bcache_allocator");
if (IS_ERR(ca->alloc_thread))
return PTR_ERR(ca->alloc_thread);
closure_get(&ca->set->cl);
wake_up_process(ca->alloc_thread);
return 0;
}
void bch_cache_allocator_exit(struct cache *ca)
{
struct discard *d;
......
......@@ -178,7 +178,6 @@
#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
......@@ -388,8 +387,6 @@ struct keybuf_key {
typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
struct keybuf {
keybuf_pred_fn *key_predicate;
struct bkey last_scanned;
spinlock_t lock;
......@@ -438,8 +435,10 @@ struct bcache_device {
/* If nonzero, we're detaching/unregistering from cache set */
atomic_t detaching;
atomic_long_t sectors_dirty;
unsigned long sectors_dirty_gc;
uint64_t nr_stripes;
unsigned stripe_size_bits;
atomic_t *stripe_sectors_dirty;
unsigned long sectors_dirty_last;
long sectors_dirty_derivative;
......@@ -531,6 +530,7 @@ struct cached_dev {
unsigned sequential_merge:1;
unsigned verify:1;
unsigned partial_stripes_expensive:1;
unsigned writeback_metadata:1;
unsigned writeback_running:1;
unsigned char writeback_percent;
......@@ -565,8 +565,7 @@ struct cache {
unsigned watermark[WATERMARK_MAX];
struct closure alloc;
struct workqueue_struct *alloc_workqueue;
struct task_struct *alloc_thread;
struct closure prio;
struct prio_set *disk_buckets;
......@@ -703,9 +702,6 @@ struct cache_set {
/* For the btree cache */
struct shrinker shrink;
/* For the allocator itself */
wait_queue_head_t alloc_wait;
/* For the btree cache and anything allocation related */
struct mutex bucket_lock;
......@@ -823,10 +819,9 @@ struct cache_set {
/*
* A btree node on disk could have too many bsets for an iterator to fit
* on the stack - this is a single element mempool for btree_read_work()
* on the stack - have to dynamically allocate them
*/
struct mutex fill_lock;
struct btree_iter *fill_iter;
mempool_t *fill_iter;
/*
* btree_sort() is a merge sort and requires temporary space - single
......@@ -834,6 +829,7 @@ struct cache_set {
*/
struct mutex sort_lock;
struct bset *sort;
unsigned sort_crit_factor;
/* List of buckets we're currently writing data to */
struct list_head data_buckets;
......@@ -906,8 +902,6 @@ static inline unsigned local_clock_us(void)
return local_clock() >> 10;
}
#define MAX_BSETS 4U
#define BTREE_PRIO USHRT_MAX
#define INITIAL_PRIO 32768
......@@ -1112,23 +1106,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}
/* Blktrace macros */
#define blktrace_msg(c, fmt, ...) \
do { \
struct request_queue *q = bdev_get_queue(c->bdev); \
if (q) \
blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \
} while (0)
#define blktrace_msg_all(s, fmt, ...) \
do { \
struct cache *_c; \
unsigned i; \
for_each_cache(_c, (s), i) \
blktrace_msg(_c, fmt, ##__VA_ARGS__); \
} while (0)
static inline void cached_dev_put(struct cached_dev *dc)
{
if (atomic_dec_and_test(&dc->count))
......@@ -1173,10 +1150,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
static struct kobj_attribute ksysfs_##n = \
__ATTR(n, S_IWUSR|S_IRUSR, show, store)
/* Forward declarations */
static inline void wake_up_allocators(struct cache_set *c)
{
struct cache *ca;
unsigned i;
for_each_cache(ca, c, i)
wake_up_process(ca->alloc_thread);
}
void bch_writeback_queue(struct cached_dev *);
void bch_writeback_add(struct cached_dev *, unsigned);
/* Forward declarations */
void bch_count_io_errors(struct cache *, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
......@@ -1193,7 +1176,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
uint8_t bch_inc_gen(struct cache *, struct bucket *);
void bch_rescale_priorities(struct cache_set *, int);
bool bch_bucket_add_unused(struct cache *, struct bucket *);
void bch_allocator_thread(struct closure *);
long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
void bch_bucket_free(struct cache_set *, struct bkey *);
......@@ -1241,9 +1223,9 @@ void bch_cache_set_stop(struct cache_set *);
struct cache_set *bch_cache_set_alloc(struct cache_sb *);
void bch_btree_cache_free(struct cache_set *);
int bch_btree_cache_alloc(struct cache_set *);
void bch_cached_dev_writeback_init(struct cached_dev *);
void bch_moving_init_cache_set(struct cache_set *);
int bch_cache_allocator_start(struct cache *ca);
void bch_cache_allocator_exit(struct cache *ca);
int bch_cache_allocator_init(struct cache *ca);
......
......@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
{
unsigned i;
char buf[80];
if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
goto bad;
......@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
return false;
bad:
cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
bch_bkey_to_text(buf, sizeof(buf), k);
cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
return true;
}
......@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
#ifdef CONFIG_BCACHE_EDEBUG
bug:
mutex_unlock(&b->c->bucket_lock);
btree_bug(b,
{
char buf[80];
bch_bkey_to_text(buf, sizeof(buf), k);
btree_bug(b,
"inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
}
return true;
#endif
}
......@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
new->sets->size = 0;
}
#define SORT_CRIT (4096 / sizeof(uint64_t))
void bch_btree_sort_lazy(struct btree *b)
{
if (b->nsets) {
unsigned i, j, keys = 0, total;
unsigned crit = SORT_CRIT;
int i;
for (i = 0; i <= b->nsets; i++)
keys += b->sets[i].data->keys;
total = keys;
/* Don't sort if nothing to do */
if (!b->nsets)
goto out;
for (j = 0; j < b->nsets; j++) {
if (keys * 2 < total ||
keys < 1000) {
bch_btree_sort_partial(b, j);
return;
}
/* If not a leaf node, always sort */
if (b->level) {
bch_btree_sort(b);
return;
}
keys -= b->sets[j].data->keys;
}
for (i = b->nsets - 1; i >= 0; --i) {
crit *= b->c->sort_crit_factor;
/* Must sort if b->nsets == 3 or we'll overflow */
if (b->nsets >= (MAX_BSETS - 1) - b->level) {
bch_btree_sort(b);
if (b->sets[i].data->keys < crit) {
bch_btree_sort_partial(b, i);
return;
}
}
/* Sort if we'd overflow */
if (b->nsets + 1 == MAX_BSETS) {
bch_btree_sort(b);
return;
}
out:
bset_build_written_tree(b);
}
......
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H
#include <linux/slab.h>
/*
* BKEYS:
*
......@@ -142,6 +144,8 @@
/* Btree key comparison/iteration */
#define MAX_BSETS 4U
struct btree_iter {
size_t size, used;
struct btree_iter_set {
......
This diff is collapsed.
......@@ -102,7 +102,6 @@
#include "debug.h"
struct btree_write {
struct closure *owner;
atomic_t *journal;
/* If btree_split() frees a btree node, it writes a new pointer to that
......@@ -142,16 +141,12 @@ struct btree {
*/
struct bset_tree sets[MAX_BSETS];
/* Used to refcount bio splits, also protects b->bio */
/* For outstanding btree writes, used as a lock - protects write_idx */
struct closure_with_waitlist io;
/* Gets transferred to w->prio_blocked - see the comment there */
int prio_blocked;
struct list_head list;
struct delayed_work work;
uint64_t io_start_time;
struct btree_write writes[2];
struct bio *bio;
};
......@@ -164,13 +159,11 @@ static inline void set_btree_node_ ## flag(struct btree *b) \
{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
enum btree_flags {
BTREE_NODE_read_done,
BTREE_NODE_io_error,
BTREE_NODE_dirty,
BTREE_NODE_write_idx,
};
BTREE_FLAG(read_done);
BTREE_FLAG(io_error);
BTREE_FLAG(dirty);
BTREE_FLAG(write_idx);
......@@ -278,6 +271,13 @@ struct btree_op {
BKEY_PADDED(replace);
};
enum {
BTREE_INSERT_STATUS_INSERT,
BTREE_INSERT_STATUS_BACK_MERGE,
BTREE_INSERT_STATUS_OVERWROTE,
BTREE_INSERT_STATUS_FRONT_MERGE,
};
void bch_btree_op_init_stack(struct btree_op *);
static inline void rw_lock(bool w, struct btree *b, int level)
......@@ -293,9 +293,7 @@ static inline void rw_unlock(bool w, struct btree *b)
#ifdef CONFIG_BCACHE_EDEBUG
unsigned i;
if (w &&
b->key.ptr[0] &&
btree_node_read_done(b))
if (w && b->key.ptr[0])
for (i = 0; i <= b->nsets; i++)
bch_check_key_order(b, b->sets[i].data);
#endif
......@@ -370,9 +368,8 @@ static inline bool should_split(struct btree *b)
> btree_blocks(b));
}
void bch_btree_read_done(struct closure *);
void bch_btree_read(struct btree *);
void bch_btree_write(struct btree *b, bool now, struct btree_op *op);
void bch_btree_node_read(struct btree *);
void bch_btree_node_write(struct btree *, struct closure *);
void bch_cannibalize_unlock(struct cache_set *, struct closure *);
void bch_btree_set_root(struct btree *);
......@@ -380,7 +377,6 @@ struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *);
struct btree *bch_btree_node_get(struct cache_set *, struct bkey *,
int, struct btree_op *);
bool bch_btree_insert_keys(struct btree *, struct btree_op *);
bool bch_btree_insert_check_key(struct btree *, struct btree_op *,
struct bio *);
int bch_btree_insert(struct btree_op *, struct cache_set *);
......@@ -393,13 +389,14 @@ void bch_moving_gc(struct closure *);
int bch_btree_check(struct cache_set *, struct btree_op *);
uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *);
void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *);
void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *);
void bch_keybuf_init(struct keybuf *);
void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *,
keybuf_pred_fn *);
bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
struct bkey *);
void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
struct keybuf_key *bch_keybuf_next(struct keybuf *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
struct keybuf *, struct bkey *);
struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
struct bkey *, keybuf_pred_fn *);
#endif
......@@ -47,11 +47,10 @@ const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
return "";
}
struct keyprint_hack bch_pkey(const struct bkey *k)
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
unsigned i = 0;
struct keyprint_hack r;
char *out = r.s, *end = r.s + KEYHACK_SIZE;
char *out = buf, *end = buf + size;
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
......@@ -75,16 +74,14 @@ struct keyprint_hack bch_pkey(const struct bkey *k)
if (KEY_CSUM(k))
p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
return r;
return out - buf;
}
struct keyprint_hack bch_pbtree(const struct btree *b)
int bch_btree_to_text(char *buf, size_t size, const struct btree *b)
{
struct keyprint_hack r;
snprintf(r.s, 40, "%zu level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0),
b->level, b->c->root ? b->c->root->level : -1);
return r;
return scnprintf(buf, size, "%zu level %i/%i",
PTR_BUCKET_NR(b->c, &b->key, 0),
b->level, b->c->root ? b->c->root->level : -1);
}
#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG)
......@@ -100,10 +97,12 @@ static void dump_bset(struct btree *b, struct bset *i)
{
struct bkey *k;
unsigned j;
char buf[80];
for (k = i->start; k < end(i); k = bkey_next(k)) {
bch_bkey_to_text(buf, sizeof(buf), k);
printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b),
(uint64_t *) k - i->d, i->keys, pkey(k));
(uint64_t *) k - i->d, i->keys, buf);
for (j = 0; j < KEY_PTRS(k); j++) {
size_t n = PTR_BUCKET_NR(b->c, k, j);
......@@ -144,7 +143,7 @@ void bch_btree_verify(struct btree *b, struct bset *new)
v->written = 0;
v->level = b->level;
bch_btree_read(v);
bch_btree_node_read(v);
closure_wait_event(&v->io.wait, &cl,
atomic_read(&b->io.cl.remaining) == -1);
......@@ -200,7 +199,7 @@ void bch_data_verify(struct search *s)
if (!check)
return;
if (bch_bio_alloc_pages(check, GFP_NOIO))
if (bio_alloc_pages(check, GFP_NOIO))
goto out_put;
check->bi_rw = READ_SYNC;
......@@ -252,6 +251,7 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
va_list args)
{
unsigned i;
char buf[80];
console_lock();
......@@ -262,7 +262,8 @@ static void vdump_bucket_and_panic(struct btree *b, const char *fmt,
console_unlock();
panic("at %s\n", pbtree(b));
bch_btree_to_text(buf, sizeof(buf), b);
panic("at %s\n", buf);
}
void bch_check_key_order_msg(struct btree *b, struct bset *i,
......@@ -337,6 +338,7 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
{
struct dump_iterator *i = file->private_data;
ssize_t ret = 0;
char kbuf[80];
while (size) {
struct keybuf_key *w;
......@@ -355,11 +357,12 @@ static ssize_t bch_dump_read(struct file *file, char __user *buf,
if (i->bytes)
break;
w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY);
w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY, dump_pred);
if (!w)
break;
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key));
bch_bkey_to_text(kbuf, sizeof(kbuf), &w->key);
i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", kbuf);
bch_keybuf_del(&i->keys, w);
}
......@@ -377,7 +380,7 @@ static int bch_dump_open(struct inode *inode, struct file *file)
file->private_data = i;
i->c = c;
bch_keybuf_init(&i->keys, dump_pred);
bch_keybuf_init(&i->keys);
i->keys.last_scanned = KEY(0, 0, 0);
return 0;
......@@ -409,142 +412,6 @@ void bch_debug_init_cache_set(struct cache_set *c)
#endif
/* Fuzz tester has rotted: */
#if 0
static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a,
const char *buffer, size_t size)
{
void dump(struct btree *b)
{
struct bset *i;
for (i = b->sets[0].data;
index(i, b) < btree_blocks(b) &&
i->seq == b->sets[0].data->seq;
i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c))
dump_bset(b, i);
}
struct cache_sb *sb;
struct cache_set *c;
struct btree *all[3], *b, *fill, *orig;
int j;
struct btree_op op;
bch_btree_op_init_stack(&op);
sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL);
if (!sb)
return -ENOMEM;
sb->bucket_size = 128;
sb->block_size = 4;
c = bch_cache_set_alloc(sb);
if (!c)
return -ENOMEM;
for (j = 0; j < 3; j++) {
BUG_ON(list_empty(&c->btree_cache));
all[j] = list_first_entry(&c->btree_cache, struct btree, list);
list_del_init(&all[j]->list);
all[j]->key = KEY(0, 0, c->sb.bucket_size);
bkey_copy_key(&all[j]->key, &MAX_KEY);
}
b = all[0];
fill = all[1];
orig = all[2];
while (1) {
for (j = 0; j < 3; j++)
all[j]->written = all[j]->nsets = 0;
bch_bset_init_next(b);
while (1) {
struct bset *i = write_block(b);
struct bkey *k = op.keys.top;
unsigned rand;
bkey_init(k);
rand = get_random_int();
op.type = rand & 1
? BTREE_INSERT
: BTREE_REPLACE;
rand >>= 1;
SET_KEY_SIZE(k, bucket_remainder(c, rand));
rand >>= c->bucket_bits;
rand &= 1024 * 512 - 1;
rand += c->sb.bucket_size;
SET_KEY_OFFSET(k, rand);
#if 0
SET_KEY_PTRS(k, 1);
#endif
bch_keylist_push(&op.keys);
bch_btree_insert_keys(b, &op);
if (should_split(b) ||
set_blocks(i, b->c) !=
__set_blocks(i, i->keys + 15, b->c)) {
i->csum = csum_set(i);
memcpy(write_block(fill),
i, set_bytes(i));
b->written += set_blocks(i, b->c);
fill->written = b->written;
if (b->written == btree_blocks(b))
break;
bch_btree_sort_lazy(b);
bch_bset_init_next(b);
}
}
memcpy(orig->sets[0].data,
fill->sets[0].data,
btree_bytes(c));
bch_btree_sort(b);
fill->written = 0;
bch_btree_read_done(&fill->io.cl);
if (b->sets[0].data->keys != fill->sets[0].data->keys ||
memcmp(b->sets[0].data->start,
fill->sets[0].data->start,
b->sets[0].data->keys * sizeof(uint64_t))) {
struct bset *i = b->sets[0].data;
struct bkey *k, *l;
for (k = i->start,
l = fill->sets[0].data->start;
k < end(i);
k = bkey_next(k), l = bkey_next(l))
if (bkey_cmp(k, l) ||
KEY_SIZE(k) != KEY_SIZE(l))
pr_err("key %zi differs: %s != %s",
(uint64_t *) k - i->d,
pkey(k), pkey(l));
for (j = 0; j < 3; j++) {
pr_err("**** Set %i ****", j);
dump(all[j]);
}
panic("\n");
}
pr_info("fuzz complete: %i keys", b->sets[0].data->keys);
}
}
kobj_attribute_write(fuzz, btree_fuzz);
#endif
void bch_debug_exit(void)
{
if (!IS_ERR_OR_NULL(debug))
......@@ -554,11 +421,6 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
int ret = 0;
#if 0
ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr);
if (ret)
return ret;
#endif
debug = debugfs_create_dir("bcache", NULL);
return ret;
......
......@@ -3,15 +3,8 @@
/* Btree/bkey debug printing */
#define KEYHACK_SIZE 80
struct keyprint_hack {
char s[KEYHACK_SIZE];
};
struct keyprint_hack bch_pkey(const struct bkey *k);
struct keyprint_hack bch_pbtree(const struct btree *b);
#define pkey(k) (&bch_pkey(k).s[0])
#define pbtree(b) (&bch_pbtree(b).s[0])
int bch_bkey_to_text(char *buf, size_t size, const struct bkey *k);
int bch_btree_to_text(char *buf, size_t size, const struct btree *b);
#ifdef CONFIG_BCACHE_EDEBUG
......
......@@ -9,6 +9,8 @@
#include "bset.h"
#include "debug.h"
#include <linux/blkdev.h>
static void bch_bi_idx_hack_endio(struct bio *bio, int error)
{
struct bio *p = bio->bi_private;
......@@ -66,13 +68,6 @@ static void bch_generic_make_request_hack(struct bio *bio)
* The newly allocated bio will point to @bio's bi_io_vec, if the split was on a
* bvec boundry; it is the caller's responsibility to ensure that @bio is not
* freed before the split.
*
* If bch_bio_split() is running under generic_make_request(), it's not safe to
* allocate more than one bio from the same bio set. Therefore, if it is running
* under generic_make_request() it masks out __GFP_WAIT when doing the
* allocation. The caller must check for failure if there's any possibility of
* it being called from under generic_make_request(); it is then the caller's
* responsibility to retry from a safe context (by e.g. punting to workqueue).
*/
struct bio *bch_bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
......@@ -83,20 +78,13 @@ struct bio *bch_bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
/*
* If we're being called from underneath generic_make_request() and we
* already allocated any bios from this bio set, we risk deadlock if we
* use the mempool. So instead, we possibly fail and let the caller punt
* to workqueue or somesuch and retry in a safe context.
*/
if (current->bio_list)
gfp &= ~__GFP_WAIT;
if (sectors >= bio_sectors(bio))
return bio;
if (bio->bi_rw & REQ_DISCARD) {
ret = bio_alloc_bioset(gfp, 1, bs);
if (!ret)
return NULL;
idx = 0;
goto out;
}
......@@ -160,17 +148,18 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned max_segments = min_t(unsigned, BIO_MAX_PAGES,
queue_max_segments(q));
struct bio_vec *bv, *end = bio_iovec(bio) +
min_t(int, bio_segments(bio), max_segments);
if (bio->bi_rw & REQ_DISCARD)
return min(ret, q->limits.max_discard_sectors);
if (bio_segments(bio) > max_segments ||
q->merge_bvec_fn) {
struct bio_vec *bv;
int i, seg = 0;
ret = 0;
for (bv = bio_iovec(bio); bv < end; bv++) {
bio_for_each_segment(bv, bio, i) {
struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_sector,
......@@ -178,10 +167,14 @@ static unsigned bch_bio_max_sectors(struct bio *bio)
.bi_rw = bio->bi_rw,
};
if (seg == max_segments)
break;
if (q->merge_bvec_fn &&
q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len)
break;
seg++;
ret += bv->bv_len >> 9;
}
}
......@@ -218,30 +211,10 @@ static void bch_bio_submit_split_endio(struct bio *bio, int error)
closure_put(cl);
}
static void __bch_bio_submit_split(struct closure *cl)
{
struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl);
struct bio *bio = s->bio, *n;
do {
n = bch_bio_split(bio, bch_bio_max_sectors(bio),
GFP_NOIO, s->p->bio_split);
if (!n)
continue_at(cl, __bch_bio_submit_split, system_wq);
n->bi_end_io = bch_bio_submit_split_endio;
n->bi_private = cl;
closure_get(cl);
bch_generic_make_request_hack(n);
} while (n != bio);
continue_at(cl, bch_bio_submit_split_done, NULL);
}
void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
{
struct bio_split_hook *s;
struct bio *n;
if (!bio_has_data(bio) && !(bio->bi_rw & REQ_DISCARD))
goto submit;
......@@ -250,6 +223,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
goto submit;
s = mempool_alloc(p->bio_split_hook, GFP_NOIO);
closure_init(&s->cl, NULL);
s->bio = bio;
s->p = p;
......@@ -257,8 +231,18 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
s->bi_private = bio->bi_private;
bio_get(bio);
closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL);
return;
do {
n = bch_bio_split(bio, bch_bio_max_sectors(bio),
GFP_NOIO, s->p->bio_split);
n->bi_end_io = bch_bio_submit_split_endio;
n->bi_private = &s->cl;
closure_get(&s->cl);
bch_generic_make_request_hack(n);
} while (n != bio);
continue_at(&s->cl, bch_bio_submit_split_done, NULL);
submit:
bch_generic_make_request_hack(bio);
}
......
......@@ -9,6 +9,8 @@
#include "debug.h"
#include "request.h"
#include <trace/events/bcache.h>
/*
* Journal replay/recovery:
*
......@@ -300,7 +302,8 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list,
for (k = i->j.start;
k < end(&i->j);
k = bkey_next(k)) {
pr_debug("%s", pkey(k));
trace_bcache_journal_replay_key(k);
bkey_copy(op->keys.top, k);
bch_keylist_push(&op->keys);
......@@ -384,7 +387,7 @@ static void btree_flush_write(struct cache_set *c)
return;
found:
if (btree_node_dirty(best))
bch_btree_write(best, true, NULL);
bch_btree_node_write(best, NULL);
rw_unlock(true, best);
}
......@@ -617,7 +620,7 @@ static void journal_write_unlocked(struct closure *cl)
bio_reset(bio);
bio->bi_sector = PTR_OFFSET(k, i);
bio->bi_bdev = ca->bdev;
bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH;
bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
bio->bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
......@@ -712,7 +715,8 @@ void bch_journal(struct closure *cl)
spin_lock(&c->journal.lock);
if (journal_full(&c->journal)) {
/* XXX: tracepoint */
trace_bcache_journal_full(c);
closure_wait(&c->journal.wait, cl);
journal_reclaim(c);
......@@ -728,13 +732,15 @@ void bch_journal(struct closure *cl)
if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
b > c->journal.blocks_free) {
/* XXX: If we were inserting so many keys that they won't fit in
trace_bcache_journal_entry_full(c);
/*
* XXX: If we were inserting so many keys that they won't fit in
* an _empty_ journal write, we'll deadlock. For now, handle
* this in bch_keylist_realloc() - but something to think about.
*/
BUG_ON(!w->data->keys);
/* XXX: tracepoint */
BUG_ON(!closure_wait(&w->wait, cl));
closure_flush(&c->journal.io);
......
......@@ -9,6 +9,8 @@
#include "debug.h"
#include "request.h"
#include <trace/events/bcache.h>
struct moving_io {
struct keybuf_key *w;
struct search s;
......@@ -44,14 +46,14 @@ static void write_moving_finish(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, s.cl);
struct bio *bio = &io->bio.bio;
struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt);
struct bio_vec *bv;
int i;
while (bv-- != bio->bi_io_vec)
bio_for_each_segment_all(bv, bio, i)
__free_page(bv->bv_page);
pr_debug("%s %s", io->s.op.insert_collision
? "collision moving" : "moved",
pkey(&io->w->key));
if (io->s.op.insert_collision)
trace_bcache_gc_copy_collision(&io->w->key);
bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w);
......@@ -94,8 +96,6 @@ static void write_moving(struct closure *cl)
struct moving_io *io = container_of(s, struct moving_io, s);
if (!s->error) {
trace_bcache_write_moving(&io->bio.bio);
moving_init(io);
io->bio.bio.bi_sector = KEY_START(&io->w->key);
......@@ -122,7 +122,6 @@ static void read_moving_submit(struct closure *cl)
struct moving_io *io = container_of(s, struct moving_io, s);
struct bio *bio = &io->bio.bio;
trace_bcache_read_moving(bio);
bch_submit_bbio(bio, s->op.c, &io->w->key, 0);
continue_at(cl, write_moving, bch_gc_wq);
......@@ -138,7 +137,8 @@ static void read_moving(struct closure *cl)
/* XXX: if we error, background writeback could stall indefinitely */
while (!test_bit(CACHE_SET_STOPPING, &c->flags)) {
w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY);
w = bch_keybuf_next_rescan(c, &c->moving_gc_keys,
&MAX_KEY, moving_pred);
if (!w)
break;
......@@ -159,10 +159,10 @@ static void read_moving(struct closure *cl)
bio->bi_rw = READ;
bio->bi_end_io = read_moving_endio;
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
if (bio_alloc_pages(bio, GFP_KERNEL))
goto err;
pr_debug("%s", pkey(&w->key));
trace_bcache_gc_copy(&w->key);
closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl);
......@@ -250,5 +250,5 @@ void bch_moving_gc(struct closure *cl)
void bch_moving_init_cache_set(struct cache_set *c)
{
bch_keybuf_init(&c->moving_gc_keys, moving_pred);
bch_keybuf_init(&c->moving_gc_keys);
}
This diff is collapsed.
......@@ -30,7 +30,7 @@ struct search {
};
void bch_cache_read_endio(struct bio *, int);
int bch_get_congested(struct cache_set *);
unsigned bch_get_congested(struct cache_set *);
void bch_insert_data(struct closure *cl);
void bch_btree_insert_async(struct closure *);
void bch_cache_read_endio(struct bio *, int);
......
This diff is collapsed.
......@@ -9,7 +9,9 @@
#include "sysfs.h"
#include "btree.h"
#include "request.h"
#include "writeback.h"
#include <linux/blkdev.h>
#include <linux/sort.h>
static const char * const cache_replacement_policies[] = {
......@@ -79,6 +81,9 @@ rw_attribute(writeback_rate_p_term_inverse);
rw_attribute(writeback_rate_d_smooth);
read_attribute(writeback_rate_debug);
read_attribute(stripe_size);
read_attribute(partial_stripes_expensive);
rw_attribute(synchronous);
rw_attribute(journal_delay_ms);
rw_attribute(discard);
......@@ -127,7 +132,7 @@ SHOW(__bch_cached_dev)
char derivative[20];
char target[20];
bch_hprint(dirty,
atomic_long_read(&dc->disk.sectors_dirty) << 9);
bcache_dev_sectors_dirty(&dc->disk) << 9);
bch_hprint(derivative, dc->writeback_rate_derivative << 9);
bch_hprint(target, dc->writeback_rate_target << 9);
......@@ -143,7 +148,10 @@ SHOW(__bch_cached_dev)
}
sysfs_hprint(dirty_data,
atomic_long_read(&dc->disk.sectors_dirty) << 9);
bcache_dev_sectors_dirty(&dc->disk) << 9);
sysfs_hprint(stripe_size, (1 << dc->disk.stripe_size_bits) << 9);
var_printf(partial_stripes_expensive, "%u");
var_printf(sequential_merge, "%i");
var_hprint(sequential_cutoff);
......@@ -170,6 +178,7 @@ STORE(__cached_dev)
disk.kobj);
unsigned v = size;
struct cache_set *c;
struct kobj_uevent_env *env;
#define d_strtoul(var) sysfs_strtoul(var, dc->var)
#define d_strtoi_h(var) sysfs_hatoi(var, dc->var)
......@@ -214,6 +223,7 @@ STORE(__cached_dev)
}
if (attr == &sysfs_label) {
/* note: endlines are preserved */
memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
bch_write_bdev_super(dc, NULL);
if (dc->disk.c) {
......@@ -221,6 +231,13 @@ STORE(__cached_dev)
buf, SB_LABEL_SIZE);
bch_uuid_write(dc->disk.c);
}
env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
add_uevent_var(env, "DRIVER=bcache");
add_uevent_var(env, "CACHED_UUID=%pU", dc->sb.uuid),
add_uevent_var(env, "CACHED_LABEL=%s", buf);
kobject_uevent_env(
&disk_to_dev(dc->disk.disk)->kobj, KOBJ_CHANGE, env->envp);
kfree(env);
}
if (attr == &sysfs_attach) {
......@@ -284,6 +301,8 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_rate_d_smooth,
&sysfs_writeback_rate_debug,
&sysfs_dirty_data,
&sysfs_stripe_size,
&sysfs_partial_stripes_expensive,
&sysfs_sequential_cutoff,
&sysfs_sequential_merge,
&sysfs_clear_stats,
......@@ -665,12 +684,10 @@ SHOW(__bch_cache)
int cmp(const void *l, const void *r)
{ return *((uint16_t *) r) - *((uint16_t *) l); }
/* Number of quantiles we compute */
const unsigned nq = 31;
size_t n = ca->sb.nbuckets, i, unused, btree;
uint64_t sum = 0;
uint16_t q[nq], *p, *cached;
/* Compute 31 quantiles */
uint16_t q[31], *p, *cached;
ssize_t ret;
cached = p = vmalloc(ca->sb.nbuckets * sizeof(uint16_t));
......@@ -703,26 +720,29 @@ SHOW(__bch_cache)
if (n)
do_div(sum, n);
for (i = 0; i < nq; i++)
q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)];
for (i = 0; i < ARRAY_SIZE(q); i++)
q[i] = INITIAL_PRIO - cached[n * (i + 1) /
(ARRAY_SIZE(q) + 1)];
vfree(p);
ret = snprintf(buf, PAGE_SIZE,
"Unused: %zu%%\n"
"Metadata: %zu%%\n"
"Average: %llu\n"
"Sectors per Q: %zu\n"
"Quantiles: [",
unused * 100 / (size_t) ca->sb.nbuckets,
btree * 100 / (size_t) ca->sb.nbuckets, sum,
n * ca->sb.bucket_size / (nq + 1));
for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++)
ret += snprintf(buf + ret, PAGE_SIZE - ret,
i < nq - 1 ? "%u " : "%u]\n", q[i]);
buf[PAGE_SIZE - 1] = '\0';
ret = scnprintf(buf, PAGE_SIZE,
"Unused: %zu%%\n"
"Metadata: %zu%%\n"
"Average: %llu\n"
"Sectors per Q: %zu\n"
"Quantiles: [",
unused * 100 / (size_t) ca->sb.nbuckets,
btree * 100 / (size_t) ca->sb.nbuckets, sum,
n * ca->sb.bucket_size / (ARRAY_SIZE(q) + 1));
for (i = 0; i < ARRAY_SIZE(q); i++)
ret += scnprintf(buf + ret, PAGE_SIZE - ret,
"%u ", q[i]);
ret--;
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "]\n");
return ret;
}
......
......@@ -2,6 +2,7 @@
#include "btree.h"
#include "request.h"
#include <linux/blktrace_api.h>
#include <linux/module.h>
#define CREATE_TRACE_POINTS
......@@ -9,18 +10,44 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_sequential);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_bypass_congested);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_replay_key);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_entry_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_cache_cannibalize);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_alloc_fail);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_free);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_gc_coalesce);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_copy_collision);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_insert_key);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_node_compact);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_set_root);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_invalidate);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_alloc_fail);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback);
EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback_collision);
......@@ -228,23 +228,6 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp)
{
int i;
struct bio_vec *bv;
bio_for_each_segment(bv, bio, i) {
bv->bv_page = alloc_page(gfp);
if (!bv->bv_page) {
while (bv-- != bio->bi_io_vec + bio->bi_idx)
__free_page(bv->bv_page);
return -ENOMEM;
}
}
return 0;
}
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)
......
......@@ -15,8 +15,6 @@
struct closure;
#include <trace/events/bcache.h>
#ifdef CONFIG_BCACHE_EDEBUG
#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0)
......@@ -566,12 +564,8 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
return x;
}
#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio))
void bch_bio_map(struct bio *bio, void *base);
int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
return bdev->bd_inode->i_size >> 9;
......
......@@ -9,6 +9,9 @@
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"
#include <trace/events/bcache.h>
static struct workqueue_struct *dirty_wq;
......@@ -36,7 +39,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
int change = 0;
int64_t error;
int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t derivative = dirty - dc->disk.sectors_dirty_last;
dc->disk.sectors_dirty_last = dirty;
......@@ -105,6 +108,31 @@ static bool dirty_pred(struct keybuf *buf, struct bkey *k)
return KEY_DIRTY(k);
}
static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
{
uint64_t stripe;
unsigned nr_sectors = KEY_SIZE(k);
struct cached_dev *dc = container_of(buf, struct cached_dev,
writeback_keys);
unsigned stripe_size = 1 << dc->disk.stripe_size_bits;
if (!KEY_DIRTY(k))
return false;
stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
while (1) {
if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
stripe_size)
return false;
if (nr_sectors <= stripe_size)
return true;
nr_sectors -= stripe_size;
stripe++;
}
}
static void dirty_init(struct keybuf_key *w)
{
struct dirty_io *io = w->private;
......@@ -149,7 +177,22 @@ static void refill_dirty(struct closure *cl)
searched_from_start = true;
}
bch_refill_keybuf(dc->disk.c, buf, &end);
if (dc->partial_stripes_expensive) {
uint64_t i;
for (i = 0; i < dc->disk.nr_stripes; i++)
if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
1 << dc->disk.stripe_size_bits)
goto full_stripes;
goto normal_refill;
full_stripes:
bch_refill_keybuf(dc->disk.c, buf, &end,
dirty_full_stripe_pred);
} else {
normal_refill:
bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
}
if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
/* Searched the entire btree - delay awhile */
......@@ -181,10 +224,8 @@ void bch_writeback_queue(struct cached_dev *dc)
}
}
void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
void bch_writeback_add(struct cached_dev *dc)
{
atomic_long_add(sectors, &dc->disk.sectors_dirty);
if (!atomic_read(&dc->has_dirty) &&
!atomic_xchg(&dc->has_dirty, 1)) {
atomic_inc(&dc->count);
......@@ -203,6 +244,34 @@ void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
}
}
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
uint64_t offset, int nr_sectors)
{
struct bcache_device *d = c->devices[inode];
unsigned stripe_size, stripe_offset;
uint64_t stripe;
if (!d)
return;
stripe_size = 1 << d->stripe_size_bits;
stripe = offset >> d->stripe_size_bits;
stripe_offset = offset & (stripe_size - 1);
while (nr_sectors) {
int s = min_t(unsigned, abs(nr_sectors),
stripe_size - stripe_offset);
if (nr_sectors < 0)
s = -s;
atomic_add(s, d->stripe_sectors_dirty + stripe);
nr_sectors -= s;
stripe_offset = 0;
stripe++;
}
}
/* Background writeback - IO loop */
static void dirty_io_destructor(struct closure *cl)
......@@ -216,9 +285,10 @@ static void write_dirty_finish(struct closure *cl)
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
struct cached_dev *dc = io->dc;
struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);
struct bio_vec *bv;
int i;
while (bv-- != io->bio.bi_io_vec)
bio_for_each_segment_all(bv, &io->bio, i)
__free_page(bv->bv_page);
/* This is kind of a dumb way of signalling errors. */
......@@ -236,10 +306,12 @@ static void write_dirty_finish(struct closure *cl)
for (i = 0; i < KEY_PTRS(&w->key); i++)
atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);
pr_debug("clearing %s", pkey(&w->key));
bch_btree_insert(&op, dc->disk.c);
closure_sync(&op.cl);
if (op.insert_collision)
trace_bcache_writeback_collision(&w->key);
atomic_long_inc(op.insert_collision
? &dc->disk.c->writeback_keys_failed
: &dc->disk.c->writeback_keys_done);
......@@ -275,7 +347,6 @@ static void write_dirty(struct closure *cl)
io->bio.bi_bdev = io->dc->bdev;
io->bio.bi_end_io = dirty_endio;
trace_bcache_write_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty_finish, dirty_wq);
......@@ -296,7 +367,6 @@ static void read_dirty_submit(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
trace_bcache_read_dirty(&io->bio);
closure_bio_submit(&io->bio, cl, &io->dc->disk);
continue_at(cl, write_dirty, dirty_wq);
......@@ -349,10 +419,10 @@ static void read_dirty(struct closure *cl)
io->bio.bi_rw = READ;
io->bio.bi_end_io = read_dirty_endio;
if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
if (bio_alloc_pages(&io->bio, GFP_KERNEL))
goto err_free;
pr_debug("%s", pkey(&w->key));
trace_bcache_writeback(&w->key);
closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
......@@ -375,12 +445,49 @@ static void read_dirty(struct closure *cl)
refill_dirty(cl);
}
/* Init */
static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
struct cached_dev *dc)
{
struct bkey *k;
struct btree_iter iter;
bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
if (!b->level) {
if (KEY_INODE(k) > dc->disk.id)
break;
if (KEY_DIRTY(k))
bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
KEY_START(k),
KEY_SIZE(k));
} else {
btree(sectors_dirty_init, k, b, op, dc);
if (KEY_INODE(k) > dc->disk.id)
break;
cond_resched();
}
return 0;
}
void bch_sectors_dirty_init(struct cached_dev *dc)
{
struct btree_op op;
bch_btree_op_init_stack(&op);
btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
closure_init_unlocked(&dc->writeback);
init_rwsem(&dc->writeback_lock);
bch_keybuf_init(&dc->writeback_keys, dirty_pred);
bch_keybuf_init(&dc->writeback_keys);
dc->writeback_metadata = true;
dc->writeback_running = true;
......
#ifndef _BCACHE_WRITEBACK_H
#define _BCACHE_WRITEBACK_H
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
for (i = 0; i < d->nr_stripes; i++)
ret += atomic_read(d->stripe_sectors_dirty + i);
return ret;
}
static inline bool bcache_dev_stripe_dirty(struct bcache_device *d,
uint64_t offset,
unsigned nr_sectors)
{
uint64_t stripe = offset >> d->stripe_size_bits;
while (1) {
if (atomic_read(d->stripe_sectors_dirty + stripe))
return true;
if (nr_sectors <= 1 << d->stripe_size_bits)
return false;
nr_sectors -= 1 << d->stripe_size_bits;
stripe++;
}
}
static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
unsigned cache_mode, bool would_skip)
{
unsigned in_use = dc->disk.c->gc_stats.in_use;
if (cache_mode != CACHE_MODE_WRITEBACK ||
atomic_read(&dc->disk.detaching) ||
in_use > CUTOFF_WRITEBACK_SYNC)
return false;
if (dc->partial_stripes_expensive &&
bcache_dev_stripe_dirty(&dc->disk, bio->bi_sector,
bio_sectors(bio)))
return true;
if (would_skip)
return false;
return bio->bi_rw & REQ_SYNC ||
in_use <= CUTOFF_WRITEBACK;
}
void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
void bch_writeback_queue(struct cached_dev *);
void bch_writeback_add(struct cached_dev *);
void bch_sectors_dirty_init(struct cached_dev *dc);
void bch_cached_dev_writeback_init(struct cached_dev *);
#endif
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment