Commit f25d8215 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Kill allocator threads & freelists

Now that we have new persistent data structures for the allocator, this
patch converts the allocator to use them.

Now, foreground bucket allocation uses the freespace btree to find
buckets to allocate, instead of popping buckets off the freelist.

The background allocator threads are no longer needed and are deleted,
as well as the allocator freelists. Now we only need background tasks
for invalidating buckets containing cached data (when we are low on
empty buckets), and for issuing discards.
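
As a rough sketch, the new foreground path (simplified from
bch2_bucket_alloc_freelist() added by this patch; transaction restart
handling, wraparound of the allocation cursor and the skipped-bucket
counters are omitted) looks like:

    for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
                                 POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
            if (k.k->p.inode != ca->dev_idx)
                    break;

            /* each freespace key covers a contiguous range of free buckets */
            for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
                 alloc_cursor < k.k->p.offset;
                 alloc_cursor++) {
                    /* checks the alloc key, open buckets, journal state, ... */
                    ob = try_alloc_bucket(trans, ca, reserve, alloc_cursor, ...);
                    if (ob)
                            break;
            }
            if (ob)
                    break;
    }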
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent c6b2826c
@@ -27,13 +27,6 @@
#include <linux/sched/task.h>
#include <linux/sort.h>
const char * const bch2_allocator_states[] = {
#define x(n) #n,
ALLOC_THREAD_STATES()
#undef x
NULL
};
/* Persistent alloc info: */
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -431,7 +424,6 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
g->_mark.gen = a.gen;
g->io_time[READ] = a.io_time[READ];
g->io_time[WRITE] = a.io_time[WRITE];
g->oldest_gen = !gc ? a.oldest_gen : a.gen;
g->gen_valid = 1;
if (!gc ||
@@ -553,7 +545,6 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
}
if (old_a.data_type && !new_a->data_type &&
@@ -698,493 +689,6 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
return ret;
}
/* Background allocator thread: */
/*
* Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
* (marking them as invalidated on disk), then optionally issues discard
* commands to the newly free buckets, then puts them on the various freelists.
*/
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bucket *g)
{
return g->mark.gen - g->oldest_gen;
}
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
u8 gc_gen;
if (!is_available_bucket(m))
return false;
if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
test_bit(b, ca->buckets_nouse))
return false;
if (ca->new_fs_bucket_idx) {
/*
* Device or filesystem is still being initialized, and we
* haven't fully marked superblocks & journal:
*/
if (is_superblock_bucket(ca, b))
return false;
if (b < ca->new_fs_bucket_idx)
return false;
}
gc_gen = bucket_gc_gen(bucket(ca, b));
ca->inc_gen_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX / 2;
ca->inc_gen_really_needs_gc += gc_gen >= BUCKET_GC_GEN_MAX;
return gc_gen < BUCKET_GC_GEN_MAX;
}
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
*/
static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
u64 now, u64 last_seq_ondisk)
{
unsigned used = m.cached_sectors;
if (used) {
/*
* Prefer to keep buckets that have been read more recently, and
* buckets that have more data in them:
*/
u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
return -last_read_scaled;
} else {
/*
* Prefer to use buckets with smaller gc_gen so that we don't
* have to walk the btree and recalculate oldest_gen - but shift
* off the low bits so that buckets will still have equal sort
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
return bucket_gc_gen(g) >> 4;
}
}
static inline int bucket_alloc_cmp(alloc_heap *h,
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
return cmp_int(l.key, r.key) ?:
cmp_int(r.nr, l.nr) ?:
cmp_int(l.bucket, r.bucket);
}
static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
const struct alloc_heap_entry *l = _l, *r = _r;
return cmp_int(l->bucket, r->bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
ca->alloc_heap.used = 0;
now = atomic64_read(&c->io_clock[READ].now);
last_seq_ondisk = c->journal.flushed_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
* by read priority and repeatedly replacing the maximum element until
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket *g = &buckets->b[b];
struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
cond_resched();
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
if (!m.data_type &&
bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
last_seq_ondisk,
ca->dev_idx, b)) {
ca->buckets_waiting_on_journal++;
continue;
}
if (e.nr && e.bucket + e.nr == b && e.key == key) {
e.nr++;
} else {
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e,
-bucket_alloc_cmp, NULL);
e = (struct alloc_heap_entry) {
.bucket = b,
.nr = 1,
.key = key,
};
}
}
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e,
-bucket_alloc_cmp, NULL);
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
nr -= ca->alloc_heap.data[0].nr;
heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
}
up_read(&ca->bucket_lock);
}
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
size_t i, nr = 0;
ca->inc_gen_needs_gc = 0;
ca->inc_gen_really_needs_gc = 0;
ca->buckets_waiting_on_journal = 0;
find_reclaimable_buckets_lru(c, ca);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
for (i = 0; i < ca->alloc_heap.used; i++)
nr += ca->alloc_heap.data[i].nr;
return nr;
}
static int bucket_invalidate_btree(struct btree_trans *trans,
struct bch_dev *ca, u64 b,
struct bkey_i_alloc_v4 *a)
{
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
POS(ca->dev_idx, b),
BTREE_ITER_CACHED|
BTREE_ITER_INTENT);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret)
goto err;
bkey_alloc_v4_init(&a->k_i);
a->k.p = iter.pos;
bch2_alloc_to_v4(k, &a->v);
a->v.gen++;
a->v.data_type = 0;
a->v.dirty_sectors = 0;
a->v.cached_sectors = 0;
a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now);
ret = bch2_trans_update(trans, &iter, &a->k_i,
BTREE_TRIGGER_BUCKET_INVALIDATE|
BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 *journal_seq, unsigned flags)
{
struct bkey_i_alloc_v4 a;
size_t b;
u64 commit_seq = 0;
int ret = 0;
/*
* If the read-only path is trying to shut down, we can't be generating
* new btree updates:
*/
if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
return 1;
BUG_ON(!ca->alloc_heap.used ||
!ca->alloc_heap.data[0].nr);
b = ca->alloc_heap.data[0].bucket;
/* first, put on free_inc and mark as owned by allocator: */
percpu_down_read(&c->mark_lock);
bch2_mark_alloc_bucket(c, ca, b, true);
spin_lock(&c->freelist_lock);
verify_not_on_freelist(c, ca, b);
BUG_ON(!fifo_push(&ca->free_inc, b));
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
ret = bch2_trans_do(c, NULL, &commit_seq,
BTREE_INSERT_NOCHECK_RW|
BTREE_INSERT_NOFAIL|
flags,
bucket_invalidate_btree(&trans, ca, b, &a));
if (!ret) {
/* remove from alloc_heap: */
struct alloc_heap_entry e, *top = ca->alloc_heap.data;
top->bucket++;
top->nr--;
if (!top->nr)
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
/*
* If we invalidating cached data then we need to wait on the
* journal commit:
*/
if (a.v.data_type)
*journal_seq = max(*journal_seq, commit_seq);
/*
* We already waiting on u.alloc_seq when we filtered out
* buckets that need journal commit:
*/
BUG_ON(*journal_seq > a.v.journal_seq);
} else {
size_t b2;
/* remove from free_inc: */
percpu_down_read(&c->mark_lock);
spin_lock(&c->freelist_lock);
bch2_mark_alloc_bucket(c, ca, b, false);
BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
BUG_ON(b != b2);
spin_unlock(&c->freelist_lock);
percpu_up_read(&c->mark_lock);
}
return ret < 0 ? ret : 0;
}
/*
* Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
*/
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
u64 journal_seq = 0;
int ret = 0;
/* Only use nowait if we've already invalidated at least one bucket: */
while (!ret &&
!fifo_full(&ca->free_inc) &&
ca->alloc_heap.used) {
if (kthread_should_stop()) {
ret = 1;
break;
}
ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
(!fifo_empty(&ca->free_inc)
? BTREE_INSERT_NOWAIT : 0));
/*
* We only want to batch up invalidates when they're going to
* require flushing the journal:
*/
if (!journal_seq)
break;
}
/* If we used NOWAIT, don't return the error: */
if (!fifo_empty(&ca->free_inc))
ret = 0;
if (ret < 0)
bch_err(ca, "error invalidating buckets: %i", ret);
if (ret)
return ret;
if (journal_seq)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret) {
bch_err(ca, "journal error: %i", ret);
return ret;
}
return 0;
}
static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
{
if (ca->allocator_state != new_state) {
ca->allocator_state = new_state;
closure_wake_up(&ca->fs->freelist_wait);
}
}
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
unsigned i;
int ret = 0;
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
/*
* Don't strand buckets on the copygc freelist until
* after recovery is finished:
*/
if (i == RESERVE_movinggc &&
!test_bit(BCH_FS_STARTED, &c->flags))
continue;
if (fifo_push(&ca->free[i], b)) {
fifo_pop(&ca->free_inc, b);
ret = 1;
break;
}
}
spin_unlock(&c->freelist_lock);
ca->allocator_state = ret
? ALLOCATOR_running
: ALLOCATOR_blocked_full;
closure_wake_up(&c->freelist_wait);
return ret;
}
static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
if (!c->opts.nochanges &&
ca->mi.discard &&
bdev_max_discard_sectors(ca->disk_sb.bdev))
blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
ca->mi.bucket_size, GFP_NOFS);
}
static bool allocator_thread_running(struct bch_dev *ca)
{
unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
? ALLOCATOR_running
: ALLOCATOR_stopped;
alloc_thread_set_state(ca, state);
return state == ALLOCATOR_running;
}
static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
{
s64 available = dev_buckets_reclaimable(ca) -
(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
bool ret = available > 0;
alloc_thread_set_state(ca, ret
? ALLOCATOR_running
: ALLOCATOR_blocked);
return ret;
}
/**
* bch_allocator_thread - move buckets from free_inc to reserves
*
* The free_inc FIFO is populated by find_reclaimable_buckets(), and
* the reserves are depleted by bucket allocation. When we run out
* of free_inc, try to invalidate some buckets and write out
* prios and gens.
*/
static int bch2_allocator_thread(void *arg)
{
struct bch_dev *ca = arg;
struct bch_fs *c = ca->fs;
unsigned long gc_count = c->gc_count;
size_t nr;
int ret;
set_freezable();
while (1) {
ret = kthread_wait_freezable(allocator_thread_running(ca));
if (ret)
goto stop;
while (!ca->alloc_heap.used) {
cond_resched();
ret = kthread_wait_freezable(buckets_available(ca, gc_count));
if (ret)
goto stop;
gc_count = c->gc_count;
nr = find_reclaimable_buckets(c, ca);
if (!nr && ca->buckets_waiting_on_journal) {
ret = bch2_journal_flush(&c->journal);
if (ret)
goto stop;
} else if (nr < (ca->mi.nbuckets >> 6) &&
ca->buckets_waiting_on_journal >= nr / 2) {
bch2_journal_flush_async(&c->journal, NULL);
}
if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
ca->inc_gen_really_needs_gc) &&
c->gc_thread) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
}
trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
ca->inc_gen_really_needs_gc);
}
ret = bch2_invalidate_buckets(c, ca);
if (ret)
goto stop;
while (!fifo_empty(&ca->free_inc)) {
u64 b = fifo_peek(&ca->free_inc);
discard_one_bucket(c, ca, b);
ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
if (ret)
goto stop;
}
}
stop:
alloc_thread_set_state(ca, ALLOCATOR_stopped);
return 0;
}
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
@@ -1193,7 +697,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
unsigned i, j;
unsigned i;
lockdep_assert_held(&c->state_lock);
@@ -1224,8 +728,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
* allocations for foreground writes must wait -
* not -ENOSPC calculations.
*/
for (j = 0; j < RESERVE_none; j++)
dev_reserve += ca->free[j].size;
dev_reserve += ca->nr_btree_reserve * 2;
dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
dev_reserve += 1; /* btree write point */
dev_reserve += 1; /* copygc write point */
@@ -1281,8 +786,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
unsigned i;
BUG_ON(ca->alloc_thread);
/* First, remove device from allocation groups: */
for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1356,61 +859,6 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
set_bit(ca->dev_idx, c->rw_devs[i].d);
}
void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
if (ca->alloc_thread)
closure_wait_event(&c->freelist_wait,
ca->allocator_state != ALLOCATOR_running);
}
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
struct task_struct *p;
p = rcu_dereference_protected(ca->alloc_thread, 1);
ca->alloc_thread = NULL;
/*
* We need an rcu barrier between setting ca->alloc_thread = NULL and
* the thread shutting down to avoid bch2_wake_allocator() racing:
*
* XXX: it would be better to have the rcu barrier be asynchronous
* instead of blocking us here
*/
synchronize_rcu();
if (p) {
kthread_stop(p);
put_task_struct(p);
}
}
/* start allocator thread: */
int bch2_dev_allocator_start(struct bch_dev *ca)
{
struct task_struct *p;
/*
* allocator thread already started?
*/
if (ca->alloc_thread)
return 0;
p = kthread_create(bch2_allocator_thread, ca,
"bch-alloc/%s", ca->name);
if (IS_ERR(p)) {
bch_err(ca->fs, "error creating allocator thread: %li",
PTR_ERR(p));
return PTR_ERR(p);
}
get_task_struct(p);
rcu_assign_pointer(ca->alloc_thread, p);
wake_up_process(p);
return 0;
}
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
......
@@ -8,8 +8,6 @@
#include "debug.h"
#include "super.h"
extern const char * const bch2_allocator_states[];
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@@ -117,42 +115,11 @@ int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
struct bkey_i *, unsigned);
int bch2_fs_freespace_init(struct bch_fs *);
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
struct task_struct *p;
rcu_read_lock();
p = rcu_dereference(ca->alloc_thread);
if (p)
wake_up_process(p);
rcu_read_unlock();
}
static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
size_t bucket)
{
if (bch2_expensive_debug_checks) {
size_t iter;
long i;
unsigned j;
for (j = 0; j < RESERVE_NR; j++)
fifo_for_each_entry(i, &ca->free[j], iter)
BUG_ON(i == bucket);
fifo_for_each_entry(i, &ca->free_inc, iter)
BUG_ON(i == bucket);
}
}
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
void bch2_fs_allocator_background_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
@@ -14,13 +14,18 @@
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "btree_gc.h"
#include "buckets.h"
#include "buckets_waiting_for_journal.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
#include "io.h"
#include "journal.h"
#include "trace.h"
#include <linux/math64.h>
@@ -50,6 +55,17 @@ const char * const bch2_alloc_reserves[] = {
* reference _after_ doing the index update that makes its allocation reachable.
*/
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i;
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
ca->alloc_cursor = 0;
rcu_read_unlock();
}
static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob)
{
open_bucket_idx_t idx = ob - c->open_buckets;
@@ -85,7 +101,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
percpu_down_read(&c->mark_lock);
spin_lock(&ob->lock);
bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
ob->valid = false;
ob->data_type = 0;
@@ -185,39 +200,35 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
}
}
/**
* bch_bucket_alloc - allocate a single bucket from a specific device
*
* Returns index of bucket on success, 0 on failure
* */
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum alloc_reserve reserve,
bool may_alloc_partial,
struct closure *cl)
static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
u64 bucket,
enum alloc_reserve reserve,
struct bch_alloc_v4 *a,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct closure *cl)
{
struct open_bucket *ob;
long b = 0;
spin_lock(&c->freelist_lock);
if (may_alloc_partial) {
int i;
for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
ob = c->open_buckets + ca->open_buckets_partial[i];
if (reserve <= ob->alloc_reserve) {
array_remove_item(ca->open_buckets_partial,
ca->open_buckets_partial_nr,
i);
ob->on_partial_list = false;
ob->alloc_reserve = reserve;
spin_unlock(&c->freelist_lock);
return ob;
}
}
}
if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) {
(*skipped_nouse)++;
return NULL;
}
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
(*skipped_open)++;
return NULL;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) {
(*skipped_need_journal_commit)++;
return NULL;
}
spin_lock(&c->freelist_lock);
if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
if (cl)
closure_wait(&c->open_buckets_wait, cl);
@@ -226,36 +237,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
c->blocked_allocate_open_bucket = local_clock();
spin_unlock(&c->freelist_lock);
trace_open_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
return ERR_PTR(-OPEN_BUCKETS_EMPTY);
}
if (likely(fifo_pop(&ca->free[RESERVE_none], b)))
goto out;
switch (reserve) {
case RESERVE_btree_movinggc:
case RESERVE_movinggc:
if (fifo_pop(&ca->free[RESERVE_movinggc], b))
goto out;
break;
default:
break;
}
if (cl)
closure_wait(&c->freelist_wait, cl);
if (!c->blocked_allocate)
c->blocked_allocate = local_clock();
spin_unlock(&c->freelist_lock);
trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve]);
return ERR_PTR(-FREELIST_EMPTY);
out:
verify_not_on_freelist(c, ca, b);
/* Recheck under lock: */
if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) {
spin_unlock(&c->freelist_lock);
(*skipped_open)++;
return NULL;
}
ob = bch2_open_bucket_alloc(c);
spin_lock(&ob->lock);
@@ -264,8 +255,8 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
ob->sectors_free = ca->mi.bucket_size;
ob->alloc_reserve = reserve;
ob->dev = ca->dev_idx;
ob->gen = *bucket_gen(ca, b);
ob->bucket = b;
ob->gen = a->gen;
ob->bucket = bucket;
spin_unlock(&ob->lock);
ca->nr_open_buckets++;
@@ -286,10 +277,326 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
}
spin_unlock(&c->freelist_lock);
return ob;
}
static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
enum alloc_reserve reserve, u64 free_entry,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct bkey_s_c freespace_k,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
struct bkey_s_c k;
struct open_bucket *ob;
struct bch_alloc_v4 a;
u64 b = free_entry & ~(~0ULL << 56);
unsigned genbits = free_entry >> 56;
struct printbuf buf = PRINTBUF;
int ret;
if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) {
pr_buf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n"
" freespace key ",
ca->mi.first_bucket, ca->mi.nbuckets);
bch2_bkey_val_to_text(&buf, c, freespace_k);
bch2_trans_inconsistent(trans, "%s", buf.buf);
ob = ERR_PTR(-EIO);
goto err;
}
bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret) {
ob = ERR_PTR(ret);
goto err;
}
bch2_alloc_to_v4(k, &a);
if (genbits != (alloc_freespace_genbits(a) >> 56)) {
pr_buf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
" freespace key ",
genbits, alloc_freespace_genbits(a) >> 56);
bch2_bkey_val_to_text(&buf, c, freespace_k);
pr_buf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, k);
bch2_trans_inconsistent(trans, "%s", buf.buf);
ob = ERR_PTR(-EIO);
goto err;
}
if (a.data_type != BUCKET_free) {
pr_buf(&buf, "non free bucket in freespace btree\n"
" freespace key ");
bch2_bkey_val_to_text(&buf, c, freespace_k);
pr_buf(&buf, "\n ");
bch2_bkey_val_to_text(&buf, c, k);
bch2_trans_inconsistent(trans, "%s", buf.buf);
ob = ERR_PTR(-EIO);
goto err;
}
ob = __try_alloc_bucket(c, ca, b, reserve, &a,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl);
if (!ob)
iter.path->preserve = false;
err:
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
return ob;
}
static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
enum alloc_reserve reserve)
{
struct open_bucket *ob;
int i;
spin_lock(&c->freelist_lock);
for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
ob = c->open_buckets + ca->open_buckets_partial[i];
if (reserve <= ob->alloc_reserve) {
array_remove_item(ca->open_buckets_partial,
ca->open_buckets_partial_nr,
i);
ob->on_partial_list = false;
ob->alloc_reserve = reserve;
spin_unlock(&c->freelist_lock);
return ob;
}
}
spin_unlock(&c->freelist_lock);
return NULL;
}
/*
* This path is for before the freespace btree is initialized:
*
* If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
* journal buckets - journal buckets will be < ca->new_fs_bucket_idx
*/
static noinline struct open_bucket *
bch2_bucket_alloc_early(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
u64 *buckets_seen,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct closure *cl)
{
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx);
u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor));
int ret;
again:
for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor),
BTREE_ITER_SLOTS, k, ret) {
struct bch_alloc_v4 a;
if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
break;
if (ca->new_fs_bucket_idx &&
is_superblock_bucket(ca, k.k->p.offset))
continue;
bch2_alloc_to_v4(k, &a);
if (bucket_state(a) != BUCKET_free)
continue;
(*buckets_seen)++;
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, reserve, &a,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl);
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
ca->alloc_cursor = alloc_cursor;
if (!ob && alloc_cursor > alloc_start) {
alloc_cursor = alloc_start;
goto again;
}
return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
}
static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
u64 *buckets_seen,
u64 *skipped_open,
u64 *skipped_need_journal_commit,
u64 *skipped_nouse,
struct closure *cl)
{
struct btree_iter iter;
struct bkey_s_c k;
struct open_bucket *ob = NULL;
u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor));
u64 alloc_cursor = alloc_start;
int ret;
BUG_ON(ca->new_fs_bucket_idx);
again:
for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace,
POS(ca->dev_idx, alloc_cursor), 0, k, ret) {
if (k.k->p.inode != ca->dev_idx)
break;
for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k));
alloc_cursor < k.k->p.offset;
alloc_cursor++) {
if (btree_trans_too_many_iters(trans)) {
ob = ERR_PTR(-EINTR);
break;
}
(*buckets_seen)++;
ob = try_alloc_bucket(trans, ca, reserve,
alloc_cursor,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
k, cl);
if (ob) {
iter.path->preserve = false;
break;
}
}
if (ob)
break;
}
bch2_trans_iter_exit(trans, &iter);
ca->alloc_cursor = alloc_cursor;
if (!ob && ret)
ob = ERR_PTR(ret);
if (!ob && alloc_start > ca->mi.first_bucket) {
alloc_cursor = alloc_start = ca->mi.first_bucket;
goto again;
}
return ob;
}
/**
* bch_bucket_alloc - allocate a single bucket from a specific device
*
* Returns index of bucket on success, 0 on failure
* */
static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
struct bch_dev *ca,
enum alloc_reserve reserve,
bool may_alloc_partial,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct open_bucket *ob = NULL;
u64 avail = dev_buckets_available(ca, reserve);
u64 buckets_seen = 0;
u64 skipped_open = 0;
u64 skipped_need_journal_commit = 0;
u64 skipped_nouse = 0;
if (may_alloc_partial) {
ob = try_alloc_partial_bucket(c, ca, reserve);
if (ob)
return ob;
}
again:
if (!avail) {
if (cl) {
closure_wait(&c->freelist_wait, cl);
/* recheck after putting ourself on waitlist */
avail = dev_buckets_available(ca, reserve);
if (avail) {
closure_wake_up(&c->freelist_wait);
goto again;
}
}
if (!c->blocked_allocate)
c->blocked_allocate = local_clock();
ob = ERR_PTR(-FREELIST_EMPTY);
goto err;
}
ob = likely(ca->mi.freespace_initialized)
? bch2_bucket_alloc_freelist(trans, ca, reserve,
&buckets_seen,
&skipped_open,
&skipped_need_journal_commit,
&skipped_nouse,
cl)
: bch2_bucket_alloc_early(trans, ca, reserve,
&buckets_seen,
&skipped_open,
&skipped_need_journal_commit,
&skipped_nouse,
cl);
if (skipped_need_journal_commit * 2 > avail)
bch2_journal_flush_async(&c->journal, NULL);
err:
if (!ob)
ob = ERR_PTR(-FREELIST_EMPTY);
if (!IS_ERR(ob)) {
trace_bucket_alloc(ca, bch2_alloc_reserves[reserve], avail,
buckets_seen,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl == NULL, PTR_ERR_OR_ZERO(ob));
} else {
trace_bucket_alloc_fail(ca, bch2_alloc_reserves[reserve], avail,
buckets_seen,
skipped_open,
skipped_need_journal_commit,
skipped_nouse,
cl == NULL, PTR_ERR_OR_ZERO(ob));
atomic_long_inc(&c->bucket_alloc_fail);
}
return ob;
}
bch2_wake_allocator(ca);
trace_bucket_alloc(ca, bch2_alloc_reserves[reserve]);
struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
enum alloc_reserve reserve,
bool may_alloc_partial,
struct closure *cl)
{
struct open_bucket *ob;
bch2_trans_do(c, NULL, NULL, 0,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, reserve,
may_alloc_partial, cl)));
return ob;
}
@@ -320,7 +627,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
struct dev_stripe_state *stripe)
{
u64 *v = stripe->next_alloc + ca->dev_idx;
u64 free_space = dev_buckets_available(ca);
u64 free_space = dev_buckets_available(ca, RESERVE_none);
u64 free_space_inv = free_space
? div64_u64(1ULL << 48, free_space)
: 1ULL << 48;
@@ -358,7 +665,7 @@ static void add_new_bucket(struct bch_fs *c,
ob_push(c, ptrs, ob);
}
int bch2_bucket_alloc_set(struct bch_fs *c,
static int bch2_bucket_alloc_set_trans(struct btree_trans *trans,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
@@ -369,10 +676,12 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
unsigned flags,
struct closure *cl)
{
struct bch_fs *c = trans->c;
struct dev_alloc_list devs_sorted =
bch2_dev_alloc_list(c, stripe, devs_may_alloc);
unsigned dev;
struct bch_dev *ca;
int ret = -INSUFFICIENT_DEVICES;
int ret = 0;
unsigned i;
BUG_ON(*nr_effective >= nr_replicas);
@@ -380,35 +689,68 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
for (i = 0; i < devs_sorted.nr; i++) {
struct open_bucket *ob;
ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
dev = devs_sorted.devs[i];
rcu_read_lock();
ca = rcu_dereference(c->devs[dev]);
if (ca)
percpu_ref_get(&ca->ref);
rcu_read_unlock();
if (!ca)
continue;
if (!ca->mi.durability && *have_cache)
if (!ca->mi.durability && *have_cache) {
percpu_ref_put(&ca->ref);
continue;
}
ob = bch2_bucket_alloc(c, ca, reserve,
ob = bch2_bucket_alloc_trans(trans, ca, reserve,
flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
if (IS_ERR(ob)) {
ret = PTR_ERR(ob);
if (cl)
return ret;
if (!IS_ERR(ob))
bch2_dev_stripe_increment(ca, stripe);
percpu_ref_put(&ca->ref);
ret = PTR_ERR_OR_ZERO(ob);
if (ret) {
if (ret == -EINTR || cl)
break;
continue;
}
add_new_bucket(c, ptrs, devs_may_alloc,
nr_effective, have_cache, flags, ob);
bch2_dev_stripe_increment(ca, stripe);
if (*nr_effective >= nr_replicas)
return 0;
break;
}
if (*nr_effective >= nr_replicas)
ret = 0;
else if (!ret)
ret = -INSUFFICIENT_DEVICES;
return ret;
}
int bch2_bucket_alloc_set(struct bch_fs *c,
struct open_buckets *ptrs,
struct dev_stripe_state *stripe,
struct bch_devs_mask *devs_may_alloc,
unsigned nr_replicas,
unsigned *nr_effective,
bool *have_cache,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl)
{
return bch2_trans_do(c, NULL, NULL, 0,
bch2_bucket_alloc_set_trans(&trans, ptrs, stripe,
devs_may_alloc, nr_replicas,
nr_effective, have_cache, reserve,
flags, cl));
}
/* Allocate from stripes: */
/*
@@ -513,7 +855,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
wp->ptrs = ptrs_skip;
}
static int open_bucket_add_buckets(struct bch_fs *c,
static int open_bucket_add_buckets(struct btree_trans *trans,
struct open_buckets *ptrs,
struct write_point *wp,
struct bch_devs_list *devs_have,
@@ -526,6 +868,7 @@ static int open_bucket_add_buckets(struct bch_fs *c,
unsigned flags,
struct closure *_cl)
{
struct bch_fs *c = trans->c;
struct bch_devs_mask devs;
struct open_bucket *ob;
struct closure *cl = NULL;
@@ -557,7 +900,8 @@ static int open_bucket_add_buckets(struct bch_fs *c,
target, erasure_code,
nr_replicas, nr_effective,
have_cache, flags, _cl);
if (ret == -FREELIST_EMPTY ||
if (ret == -EINTR ||
ret == -FREELIST_EMPTY ||
ret == -OPEN_BUCKETS_EMPTY)
return ret;
if (*nr_effective >= nr_replicas)
@@ -571,25 +915,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
if (*nr_effective >= nr_replicas)
return 0;
percpu_down_read(&c->mark_lock);
rcu_read_lock();
retry_blocking:
/*
* Try nonblocking first, so that if one device is full we'll try from
* other devices:
*/
ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs,
nr_replicas, nr_effective, have_cache,
reserve, flags, cl);
if (ret && ret != -INSUFFICIENT_DEVICES && !cl && _cl) {
if (ret &&
ret != -EINTR &&
ret != -INSUFFICIENT_DEVICES &&
!cl && _cl) {
cl = _cl;
goto retry_blocking;
}
rcu_read_unlock();
percpu_up_read(&c->mark_lock);
return ret;
}
@@ -703,15 +1044,25 @@ static bool try_decrease_writepoints(struct bch_fs *c,
return true;
}
static struct write_point *writepoint_find(struct bch_fs *c,
static void bch2_trans_mutex_lock(struct btree_trans *trans,
struct mutex *lock)
{
if (!mutex_trylock(lock)) {
bch2_trans_unlock(trans);
mutex_lock(lock);
}
}
static struct write_point *writepoint_find(struct btree_trans *trans,
unsigned long write_point)
{
struct bch_fs *c = trans->c;
struct write_point *wp, *oldest;
struct hlist_head *head;
if (!(write_point & 1UL)) {
wp = (struct write_point *) write_point;
mutex_lock(&wp->lock);
bch2_trans_mutex_lock(trans, &wp->lock);
return wp;
}
@@ -720,7 +1071,7 @@ static struct write_point *writepoint_find(struct bch_fs *c,
wp = __writepoint_find(head, write_point);
if (wp) {
lock_wp:
mutex_lock(&wp->lock);
bch2_trans_mutex_lock(trans, &wp->lock);
if (wp->write_point == write_point)
goto out;
mutex_unlock(&wp->lock);
@@ -733,8 +1084,8 @@ static struct write_point *writepoint_find(struct bch_fs *c,
if (!oldest || time_before64(wp->last_used, oldest->last_used))
oldest = wp;
mutex_lock(&oldest->lock);
mutex_lock(&c->write_points_hash_lock);
bch2_trans_mutex_lock(trans, &oldest->lock);
bch2_trans_mutex_lock(trans, &c->write_points_hash_lock);
if (oldest >= c->write_points + c->write_points_nr ||
try_increase_writepoints(c)) {
mutex_unlock(&c->write_points_hash_lock);
@@ -762,7 +1113,7 @@ static struct write_point *writepoint_find(struct bch_fs *c,
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
int bch2_alloc_sectors_start(struct bch_fs *c,
int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
@@ -774,6 +1125,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
struct closure *cl,
struct write_point **wp_ret)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
struct open_bucket *ob;
struct open_buckets ptrs;
@@ -793,7 +1145,7 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
write_points_nr = c->write_points_nr;
have_cache = false;
*wp_ret = wp = writepoint_find(c, write_point.v);
*wp_ret = wp = writepoint_find(trans, write_point.v);
if (wp->data_type == BCH_DATA_user)
ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
@@ -803,21 +1155,21 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
have_cache = true;
if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, cl);
} else {
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
target, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
ob_flags, NULL);
if (!ret)
if (!ret || ret == -EINTR)
goto alloc_done;
ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
0, erasure_code,
nr_replicas, &nr_effective,
&have_cache, reserve,
@@ -871,10 +1223,33 @@ int bch2_alloc_sectors_start(struct bch_fs *c,
case -INSUFFICIENT_DEVICES:
return -EROFS;
default:
BUG();
return ret;
}
}
int bch2_alloc_sectors_start(struct bch_fs *c,
unsigned target,
unsigned erasure_code,
struct write_point_specifier write_point,
struct bch_devs_list *devs_have,
unsigned nr_replicas,
unsigned nr_replicas_required,
enum alloc_reserve reserve,
unsigned flags,
struct closure *cl,
struct write_point **wp_ret)
{
return bch2_trans_do(c, NULL, NULL, 0,
bch2_alloc_sectors_start_trans(&trans, target,
erasure_code,
write_point,
devs_have,
nr_replicas,
nr_replicas_required,
reserve,
flags, cl, wp_ret));
}
struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob)
{
struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev);
......
@@ -14,6 +14,8 @@ struct bch_devs_List;
extern const char * const bch2_alloc_reserves[];
void bch2_reset_alloc_cursors(struct bch_fs *);
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@@ -136,6 +138,15 @@ int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
unsigned, unsigned *, bool *, enum alloc_reserve,
unsigned, struct closure *);
int bch2_alloc_sectors_start_trans(struct btree_trans *,
unsigned, unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *,
struct write_point **);
int bch2_alloc_sectors_start(struct bch_fs *,
unsigned, unsigned,
struct write_point_specifier,
......
@@ -10,18 +10,6 @@
struct ec_bucket_buf;
#define ALLOC_THREAD_STATES() \
x(stopped) \
x(running) \
x(blocked) \
x(blocked_full)
enum allocator_states {
#define x(n) ALLOCATOR_##n,
ALLOC_THREAD_STATES()
#undef x
};
#define BCH_ALLOC_RESERVES() \
x(btree_movinggc) \
x(btree) \
@@ -32,11 +20,8 @@ enum alloc_reserve {
#define x(name) RESERVE_##name,
BCH_ALLOC_RESERVES()
#undef x
RESERVE_NR
};
typedef FIFO(long) alloc_fifo;
#define OPEN_BUCKETS_COUNT 1024
#define WRITE_POINT_HASH_NR 32
@@ -127,12 +112,4 @@ struct write_point_specifier {
unsigned long v;
};
struct alloc_heap_entry {
size_t bucket;
size_t nr;
unsigned long key;
};
typedef HEAP(struct alloc_heap_entry) alloc_heap;
#endif /* _BCACHEFS_ALLOC_TYPES_H */
@@ -462,34 +462,18 @@ struct bch_dev {
/* Allocator: */
u64 new_fs_bucket_idx;
struct task_struct __rcu *alloc_thread;
u64 alloc_cursor;
/*
* free: Buckets that are ready to be used
*
* free_inc: Incoming buckets - these are buckets that currently have
* cached data in them, and we can't reuse them until after we write
* their new gen to disk. After prio_write() finishes writing the new
* gens/prios, they'll be moved to the free list (and possibly discarded
* in the process)
*/
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
unsigned nr_open_buckets;
unsigned nr_btree_reserve;
open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
open_bucket_idx_t open_buckets_partial_nr;
size_t fifo_last_bucket;
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
size_t buckets_waiting_on_journal;
enum allocator_states allocator_state;
alloc_heap alloc_heap;
atomic64_t rebalance_work;
struct journal_device journal;
@@ -511,8 +495,6 @@ struct bch_dev {
enum {
/* startup: */
BCH_FS_ALLOC_CLEAN,
BCH_FS_ALLOCATOR_RUNNING,
BCH_FS_ALLOCATOR_STOPPING,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_INITIAL_GC_UNFIXED,
BCH_FS_TOPOLOGY_REPAIR_DONE,
@@ -914,6 +896,7 @@ mempool_t bio_bounce_pages;
atomic_long_t read_realloc_races;
atomic_long_t extent_migrate_done;
atomic_long_t extent_migrate_raced;
atomic_long_t bucket_alloc_fail;
unsigned btree_gc_periodic:1;
unsigned copy_gc_enabled:1;
......
@@ -1684,9 +1684,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
*/
int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
{
struct bch_dev *ca;
u64 start_time = local_clock();
unsigned i, iter = 0;
unsigned iter = 0;
int ret;
lockdep_assert_held(&c->state_lock);
@@ -1787,13 +1786,6 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
trace_gc_end(c);
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
/*
* Wake up allocator in case it was waiting for buckets
* because of not being able to inc gens
*/
for_each_member_device(ca, c, i)
bch2_wake_allocator(ca);
/*
* At startup, allocations can happen directly instead of via the
* allocator thread - issue wakeup in case they blocked on gc_lock:
......
@@ -178,12 +178,13 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
six_unlock_intent(&b->c.lock);
}
static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
struct disk_reservation *res,
struct closure *cl,
bool interior_node,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct write_point *wp;
struct btree *b;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
@@ -214,7 +215,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
ret = bch2_alloc_sectors_start(c,
ret = bch2_alloc_sectors_start_trans(trans,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
@@ -414,7 +415,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
mutex_unlock(&c->btree_reserve_cache_lock);
}
static int bch2_btree_reserve_get(struct btree_update *as,
static int bch2_btree_reserve_get(struct btree_trans *trans,
struct btree_update *as,
unsigned nr_nodes[2],
unsigned flags,
struct closure *cl)
@@ -441,7 +443,7 @@ static int bch2_btree_reserve_get(struct btree_update *as,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
b = __bch2_btree_node_alloc(c, &as->disk_res,
b = __bch2_btree_node_alloc(trans, &as->disk_res,
flags & BTREE_INSERT_NOWAIT ? NULL : cl,
interior, flags);
if (IS_ERR(b)) {
@@ -1066,8 +1068,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
if (ret)
goto err;
ret = bch2_btree_reserve_get(as, nr_nodes, flags, NULL);
ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
if (ret) {
if (ret == -EAGAIN ||
ret == -ENOMEM) {
struct closure cl;
closure_init_stack(&cl);
@@ -1075,7 +1078,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
bch2_trans_unlock(trans);
do {
ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
closure_sync(&cl);
} while (ret == -EAGAIN);
}
......
@@ -296,11 +296,6 @@ static inline int bucket_sectors_fragmented(struct bch_dev *ca,
: 0;
}
static inline int is_stripe_data_bucket(struct bucket_mark m)
{
return m.stripe && m.data_type != BCH_DATA_parity;
}
static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
return m.cached_sectors && !m.dirty_sectors
@@ -350,9 +345,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
bch2_wake_allocator(ca);
}
static inline int __update_replicas(struct bch_fs *c,
@@ -488,19 +480,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
update_replicas_list(trans, &r.e, sectors);
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
size_t b, bool owned_by_allocator)
{
struct bucket *g = bucket(ca, b);
struct bucket_mark old, new;
old = bucket_cmpxchg(g, new, ({
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(owned_by_allocator == old.owned_by_allocator);
}
int bch2_mark_alloc(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -560,6 +539,10 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
}
if (!new_a.data_type &&
(!new_a.journal_seq || new_a.journal_seq < c->journal.flushed_seq_ondisk))
closure_wake_up(&c->freelist_wait);
if (bucket_state(new_a) == BUCKET_need_gc_gens) {
atomic_inc(&c->kick_gc);
wake_up_process(c->gc_thread);
@@ -583,7 +566,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
g->io_time[READ] = new_a.io_time[READ];
g->io_time[WRITE] = new_a.io_time[WRITE];
g->oldest_gen = new_a.oldest_gen;
g->gen_valid = 1;
g->stripe = new_a.stripe;
g->stripe_redundancy = new_a.stripe_redundancy;
@@ -1861,8 +1843,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
a->v.data_type = type;
a->v.dirty_sectors = sectors;
ret = bch2_trans_update(trans, &iter, &a->k_i,
BTREE_UPDATE_NO_KEY_CACHE_COHERENCY);
ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
if (ret)
goto out;
out:
@@ -2048,24 +2029,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
struct bucket_array *buckets = NULL, *old_buckets = NULL;
struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
unsigned long *buckets_nouse = NULL;
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
alloc_heap alloc_heap;
size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / btree_sectors(c));
/* XXX: these should be tunable */
size_t reserve_none = max_t(size_t, 1, nbuckets >> 9);
size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 6);
size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12),
btree_reserve * 2);
bool resize = ca->buckets[0] != NULL; bool resize = ca->buckets[0] != NULL;
int ret = -ENOMEM; int ret = -ENOMEM;
unsigned i;
memset(&free, 0, sizeof(free));
memset(&free_inc, 0, sizeof(free_inc));
memset(&alloc_heap, 0, sizeof(alloc_heap));
if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + if (!(buckets = kvpmalloc(sizeof(struct bucket_array) +
nbuckets * sizeof(struct bucket), nbuckets * sizeof(struct bucket),
...@@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ...@@ -2075,12 +2040,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
(c->opts.buckets_nouse && (c->opts.buckets_nouse &&
!(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) *
sizeof(unsigned long), sizeof(unsigned long),
GFP_KERNEL|__GFP_ZERO))) || GFP_KERNEL|__GFP_ZERO))))
!init_fifo(&free[RESERVE_movinggc],
copygc_reserve, GFP_KERNEL) ||
!init_fifo(&free[RESERVE_none], reserve_none, GFP_KERNEL) ||
!init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) ||
!init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
goto err; goto err;
buckets->first_bucket = ca->mi.first_bucket; buckets->first_bucket = ca->mi.first_bucket;
...@@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ...@@ -2126,18 +2086,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
up_write(&c->gc_lock); up_write(&c->gc_lock);
} }
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
fifo_move(&free[i], &ca->free[i]);
swap(ca->free[i], free[i]);
}
fifo_move(&free_inc, &ca->free_inc);
swap(ca->free_inc, free_inc);
spin_unlock(&c->freelist_lock);
/* with gc lock held, alloc_heap can't be in use: */
swap(ca->alloc_heap, alloc_heap);
nbuckets = ca->mi.nbuckets; nbuckets = ca->mi.nbuckets;
if (resize) if (resize)
...@@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ...@@ -2145,10 +2093,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
ret = 0; ret = 0;
err: err:
free_heap(&alloc_heap);
free_fifo(&free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&free[i]);
kvpfree(buckets_nouse, kvpfree(buckets_nouse,
BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
if (bucket_gens) if (bucket_gens)
...@@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) ...@@ -2163,10 +2107,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
{ {
unsigned i; unsigned i;
free_heap(&ca->alloc_heap);
free_fifo(&ca->free_inc);
for (i = 0; i < RESERVE_NR; i++)
free_fifo(&ca->free[i]);
kvpfree(ca->buckets_nouse, kvpfree(ca->buckets_nouse,
BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
......
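With bch2_mark_alloc_bucket() and the allocator wakeups gone, the trigger path above becomes the only place a bucket transitions to "allocatable": it must hold no live data, and any journal entry that still references it must already be on disk. A minimal sketch of that condition, using only the two fields visible in the hunk; the in-tree check is written inline in bch2_mark_alloc(), not as a helper, and this struct is hypothetical:

/* Hypothetical, illustration only. */
struct alloc_state_sketch {
	u8	data_type;	/* 0: bucket holds no live data */
	u64	journal_seq;	/* last journal sequence that touched it */
};

static bool bucket_allocatable_sketch(struct alloc_state_sketch a,
				      u64 flushed_seq_ondisk)
{
	/* empty, and nothing unflushed in the journal still points at it */
	return !a.data_type &&
	       (!a.journal_seq || a.journal_seq < flushed_seq_ondisk);
}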
...@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) ...@@ -58,11 +58,6 @@ static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, true); return __bucket(ca, b, true);
} }
static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
{
return __bucket(ca, b, false);
}
static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
{ {
return rcu_dereference_check(ca->bucket_gens, return rcu_dereference_check(ca->bucket_gens,
...@@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct bucket_mark mark) ...@@ -151,50 +146,50 @@ static inline bool is_available_bucket(struct bucket_mark mark)
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
static inline u64 __dev_buckets_available(struct bch_dev *ca, static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats) struct bch_dev_usage stats,
enum alloc_reserve reserve)
{ {
u64 total = ca->mi.nbuckets - ca->mi.first_bucket; s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
s64 reserved = 0;
switch (reserve) {
case RESERVE_none:
reserved += ca->mi.nbuckets >> 6;
fallthrough;
case RESERVE_movinggc:
reserved += ca->nr_btree_reserve;
fallthrough;
case RESERVE_btree:
reserved += ca->nr_btree_reserve;
fallthrough;
case RESERVE_btree_movinggc:
break;
default:
BUG();
}
if (WARN_ONCE(stats.buckets_unavailable > total, if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow (%llu > %llu)\n", "buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total)) stats.buckets_unavailable, total))
return 0; return 0;
return total - stats.buckets_unavailable; return max_t(s64, 0,
total -
stats.buckets_unavailable -
ca->nr_open_buckets -
reserved);
} }
static inline u64 dev_buckets_available(struct bch_dev *ca) static inline u64 dev_buckets_available(struct bch_dev *ca,
enum alloc_reserve reserve)
{ {
return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
}
static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
struct bch_dev_usage stats)
{
struct bch_fs *c = ca->fs;
s64 available = __dev_buckets_available(ca, stats);
unsigned i;
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++)
available -= fifo_used(&ca->free[i]);
available -= fifo_used(&ca->free_inc);
available -= ca->nr_open_buckets;
spin_unlock(&c->freelist_lock);
return max(available, 0LL);
}
static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
{
return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
} }
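The reserve tiers above are cumulative: each case falls through into the next, so RESERVE_btree_movinggc holds nothing back, RESERVE_btree holds back one btree-node reserve, RESERVE_movinggc two, and RESERVE_none additionally nbuckets >> 6 (roughly 1.6% of the device). A standalone worked example with made-up numbers (first_bucket ignored for brevity):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical device: 1M buckets, 128-bucket btree reserve */
	uint64_t nbuckets		= 1ULL << 20;
	uint64_t nr_btree_reserve	= 128;
	uint64_t buckets_unavailable	= 5000;
	uint64_t nr_open_buckets	= 100;

	/* per-tier reserved counts, mirroring the fallthrough switch */
	uint64_t r_btree_movinggc	= 0;
	uint64_t r_btree		= nr_btree_reserve;
	uint64_t r_movinggc		= 2 * nr_btree_reserve;
	uint64_t r_none			= (nbuckets >> 6) + 2 * nr_btree_reserve;

	uint64_t base = nbuckets - buckets_unavailable - nr_open_buckets;

	printf("RESERVE_btree_movinggc: %llu buckets\n",
	       (unsigned long long)(base - r_btree_movinggc));
	printf("RESERVE_btree:          %llu buckets\n",
	       (unsigned long long)(base - r_btree));
	printf("RESERVE_movinggc:       %llu buckets\n",
	       (unsigned long long)(base - r_movinggc));
	printf("RESERVE_none:           %llu buckets\n",
	       (unsigned long long)(base - r_none));
	return 0;
}

With these numbers the strictest tier, RESERVE_none, reports 16384 + 256 = 16640 fewer available buckets than RESERVE_btree_movinggc.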
/* Filesystem usage: */ /* Filesystem usage: */
static inline unsigned fs_usage_u64s(struct bch_fs *c) static inline unsigned fs_usage_u64s(struct bch_fs *c)
{ {
return sizeof(struct bch_fs_usage) / sizeof(u64) + return sizeof(struct bch_fs_usage) / sizeof(u64) +
READ_ONCE(c->replicas.nr); READ_ONCE(c->replicas.nr);
} }
...@@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *); ...@@ -222,7 +217,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
void bch2_fs_usage_initialize(struct bch_fs *); void bch2_fs_usage_initialize(struct bch_fs *);
void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned, size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned); struct gc_pos, unsigned);
......
...@@ -14,7 +14,6 @@ struct bucket_mark { ...@@ -14,7 +14,6 @@ struct bucket_mark {
struct { struct {
u8 gen; u8 gen;
u8 data_type:3, u8 data_type:3,
owned_by_allocator:1,
stripe:1; stripe:1;
u16 dirty_sectors; u16 dirty_sectors;
u16 cached_sectors; u16 cached_sectors;
...@@ -29,7 +28,6 @@ struct bucket { ...@@ -29,7 +28,6 @@ struct bucket {
}; };
u64 io_time[2]; u64 io_time[2];
u8 oldest_gen;
unsigned gen_valid:1; unsigned gen_valid:1;
u8 stripe_redundancy; u8 stripe_redundancy;
u32 stripe; u32 stripe;
......
...@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ...@@ -1295,9 +1295,6 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
BUG_ON(nr_have_data > h->s->nr_data); BUG_ON(nr_have_data > h->s->nr_data);
BUG_ON(nr_have_parity > h->s->nr_parity); BUG_ON(nr_have_parity > h->s->nr_parity);
percpu_down_read(&c->mark_lock);
rcu_read_lock();
buckets.nr = 0; buckets.nr = 0;
if (nr_have_parity < h->s->nr_parity) { if (nr_have_parity < h->s->nr_parity) {
ret = bch2_bucket_alloc_set(c, &buckets, ret = bch2_bucket_alloc_set(c, &buckets,
...@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ...@@ -1324,7 +1321,7 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
} }
if (ret) if (ret)
goto err; return ret;
} }
buckets.nr = 0; buckets.nr = 0;
...@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h, ...@@ -1352,12 +1349,10 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
} }
if (ret) if (ret)
goto err; return ret;
} }
err:
rcu_read_unlock(); return 0;
percpu_up_read(&c->mark_lock);
return ret;
} }
/* XXX: doesn't obey target: */ /* XXX: doesn't obey target: */
......
...@@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ...@@ -812,10 +812,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
break; break;
} }
} else { } else {
rcu_read_lock();
ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none, ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_none,
false, cl); false, cl);
rcu_read_unlock();
if (IS_ERR(ob[nr_got])) { if (IS_ERR(ob[nr_got])) {
ret = cl ? -EAGAIN : -ENOSPC; ret = cl ? -EAGAIN : -ENOSPC;
break; break;
......
...@@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl) ...@@ -1398,6 +1398,10 @@ static void journal_write_done(struct closure *cl)
if (!JSET_NO_FLUSH(w->data)) { if (!JSET_NO_FLUSH(w->data)) {
j->flushed_seq_ondisk = seq; j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq; j->last_seq_ondisk = w->last_seq;
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
} }
} else if (!j->err_seq || seq < j->err_seq) } else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq; j->err_seq = seq;
......
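Waking c->freelist_wait (and resetting the allocation cursors) from the flush-completion path matters because, per the bch2_mark_alloc() change earlier in this commit, an empty bucket only becomes allocatable once its journal_seq is older than flushed_seq_ondisk. A minimal sketch of a waiter, assuming bcachefs' closure_wait_event() macro; the retry condition here is illustrative, not the in-tree allocation path:

static void wait_for_buckets_sketch(struct bch_fs *c, struct bch_dev *ca)
{
	/*
	 * Blocks until journal_write_done() (or another source of progress)
	 * wakes c->freelist_wait, then rechecks availability.
	 */
	closure_wait_event(&c->freelist_wait,
			   dev_buckets_available(ca, RESERVE_none) != 0);
}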
...@@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ...@@ -104,18 +104,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
return DATA_SKIP; return DATA_SKIP;
} }
static bool have_copygc_reserve(struct bch_dev *ca)
{
bool ret;
spin_lock(&ca->fs->freelist_lock);
ret = fifo_full(&ca->free[RESERVE_movinggc]) ||
ca->allocator_state != ALLOCATOR_running;
spin_unlock(&ca->fs->freelist_lock);
return ret;
}
static inline int fragmentation_cmp(copygc_heap *heap, static inline int fragmentation_cmp(copygc_heap *heap,
struct copygc_heap_entry l, struct copygc_heap_entry l,
struct copygc_heap_entry r) struct copygc_heap_entry r)
...@@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c) ...@@ -247,11 +235,10 @@ static int bch2_copygc(struct bch_fs *c)
} }
for_each_rw_member(ca, c, dev_idx) { for_each_rw_member(ca, c, dev_idx) {
closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
ca->mi.nbuckets >> 6);
spin_lock(&ca->fs->freelist_lock); sectors_reserved += avail * ca->mi.bucket_size;
sectors_reserved += fifo_used(&ca->free[RESERVE_movinggc]) * ca->mi.bucket_size;
spin_unlock(&ca->fs->freelist_lock);
} }
ret = walk_buckets_to_copygc(c); ret = walk_buckets_to_copygc(c);
...@@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ...@@ -352,8 +339,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
for_each_rw_member(ca, c, dev_idx) { for_each_rw_member(ca, c, dev_idx) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca); struct bch_dev_usage usage = bch2_dev_usage_read(ca);
fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_none) *
ca->mi.bucket_size) >> 1); ca->mi.bucket_size) >> 1);
fragmented = usage.d[BCH_DATA_user].fragmented; fragmented = usage.d[BCH_DATA_user].fragmented;
wait = min(wait, max(0LL, fragmented_allowed - fragmented)); wait = min(wait, max(0LL, fragmented_allowed - fragmented));
......
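Copygc previously waited for its private freelist (free[RESERVE_movinggc]) to fill and sized its reserve from fifo_used(); with the freelists deleted it instead derives the reserve from what the RESERVE_movinggc tier can currently allocate, capped at nbuckets >> 6. A sketch of the per-device calculation, using only the helpers visible in the hunks above:

static u64 copygc_sectors_reserved_sketch(struct bch_dev *ca)
{
	/* buckets usable at the movinggc reserve, capped at ~1.6% of the device */
	s64 avail = min(dev_buckets_available(ca, RESERVE_movinggc),
			ca->mi.nbuckets >> 6);

	return avail * ca->mi.bucket_size;
}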
...@@ -1374,6 +1374,7 @@ int bch2_fs_initialize(struct bch_fs *c) ...@@ -1374,6 +1374,7 @@ int bch2_fs_initialize(struct bch_fs *c)
* Write out the superblock and journal buckets, now that we can do * Write out the superblock and journal buckets, now that we can do
* btree updates * btree updates
*/ */
bch_verbose(c, "marking superblocks");
err = "error marking superblock and journal"; err = "error marking superblock and journal";
for_each_member_device(ca, c, i) { for_each_member_device(ca, c, i) {
ret = bch2_trans_mark_dev_sb(c, ca); ret = bch2_trans_mark_dev_sb(c, ca);
...@@ -1385,6 +1386,7 @@ int bch2_fs_initialize(struct bch_fs *c) ...@@ -1385,6 +1386,7 @@ int bch2_fs_initialize(struct bch_fs *c)
ca->new_fs_bucket_idx = 0; ca->new_fs_bucket_idx = 0;
} }
bch_verbose(c, "initializing freespace");
err = "error initializing freespace"; err = "error initializing freespace";
ret = bch2_fs_freespace_init(c); ret = bch2_fs_freespace_init(c);
if (ret) if (ret)
......
...@@ -206,17 +206,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) ...@@ -206,17 +206,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
*/ */
bch2_journal_flush_all_pins(&c->journal); bch2_journal_flush_all_pins(&c->journal);
/*
* If the allocator threads didn't all start up, the btree updates to
* write out alloc info aren't going to work:
*/
if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
goto nowrote_alloc;
bch_verbose(c, "flushing journal and stopping allocators"); bch_verbose(c, "flushing journal and stopping allocators");
bch2_journal_flush_all_pins(&c->journal); bch2_journal_flush_all_pins(&c->journal);
set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
do { do {
clean_passes++; clean_passes++;
...@@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c) ...@@ -241,17 +233,11 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch_verbose(c, "flushing journal and stopping allocators complete"); bch_verbose(c, "flushing journal and stopping allocators complete");
set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
nowrote_alloc:
closure_wait_event(&c->btree_interior_update_wait, closure_wait_event(&c->btree_interior_update_wait,
!bch2_btree_interior_updates_nr_pending(c)); !bch2_btree_interior_updates_nr_pending(c));
flush_work(&c->btree_interior_update_work); flush_work(&c->btree_interior_update_work);
for_each_member_device(ca, c, i)
bch2_dev_allocator_stop(ca);
clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
bch2_fs_journal_stop(&c->journal); bch2_fs_journal_stop(&c->journal);
/* /*
...@@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c) ...@@ -287,10 +273,6 @@ void bch2_fs_read_only(struct bch_fs *c)
/* /*
* Block new foreground-end write operations from starting - any new * Block new foreground-end write operations from starting - any new
* writes will return -EROFS: * writes will return -EROFS:
*
* (This is really blocking new _allocations_, writes to previously
* allocated space can still happen until stopping the allocator in
* bch2_dev_allocator_stop()).
*/ */
percpu_ref_kill(&c->writes); percpu_ref_kill(&c->writes);
...@@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) ...@@ -419,20 +401,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca); bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
bch_err(c, "error starting allocator threads");
percpu_ref_put(&ca->io_ref);
goto err;
}
}
set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
for_each_rw_member(ca, c, i)
bch2_wake_allocator(ca);
if (!early) { if (!early) {
ret = bch2_fs_read_write_late(c); ret = bch2_fs_read_write_late(c);
if (ret) if (ret)
...@@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c) ...@@ -946,20 +914,6 @@ int bch2_fs_start(struct bch_fs *c)
set_bit(BCH_FS_STARTED, &c->flags); set_bit(BCH_FS_STARTED, &c->flags);
/*
* Allocator threads don't start filling copygc reserve until after we
* set BCH_FS_STARTED - wake them now:
*
* XXX ugly hack:
* Need to set ca->allocator_state here instead of relying on the
* allocator threads to do it to avoid racing with the copygc threads
* checking it and thinking they have no alloc reserve:
*/
for_each_online_member(ca, c, i) {
ca->allocator_state = ALLOCATOR_running;
bch2_wake_allocator(ca);
}
if (c->opts.read_only || c->opts.nochanges) { if (c->opts.read_only || c->opts.nochanges) {
bch2_fs_read_only(c); bch2_fs_read_only(c);
} else { } else {
...@@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj) ...@@ -1051,8 +1005,6 @@ static void bch2_dev_release(struct kobject *kobj)
static void bch2_dev_free(struct bch_dev *ca) static void bch2_dev_free(struct bch_dev *ca)
{ {
bch2_dev_allocator_stop(ca);
cancel_work_sync(&ca->io_error_work); cancel_work_sync(&ca->io_error_work);
if (ca->kobj.state_in_sysfs && if (ca->kobj.state_in_sysfs &&
...@@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ...@@ -1167,6 +1119,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
ca->mi = bch2_mi_to_cpu(member); ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid; ca->uuid = member->uuid;
ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
ca->mi.bucket_size / btree_sectors(c));
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) || 0, GFP_KERNEL) ||
percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
...@@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ...@@ -1216,12 +1171,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
ca->fs = c; ca->fs = c;
if (ca->mi.state == BCH_MEMBER_STATE_rw &&
bch2_dev_allocator_start(ca)) {
bch2_dev_free(ca);
goto err;
}
bch2_dev_attach(c, ca, dev_idx); bch2_dev_attach(c, ca, dev_idx);
out: out:
pr_verbose_init(c->opts, "ret %i", ret); pr_verbose_init(c->opts, "ret %i", ret);
...@@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ...@@ -1405,14 +1354,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
/* /*
* The allocator thread itself allocates btree nodes, so stop it first: * The allocator thread itself allocates btree nodes, so stop it first:
*/ */
bch2_dev_allocator_stop(ca);
bch2_dev_allocator_remove(c, ca); bch2_dev_allocator_remove(c, ca);
bch2_dev_journal_stop(&c->journal, ca); bch2_dev_journal_stop(&c->journal, ca);
bch2_copygc_start(c); bch2_copygc_start(c);
} }
static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{ {
lockdep_assert_held(&c->state_lock); lockdep_assert_held(&c->state_lock);
...@@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ...@@ -1420,8 +1368,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
bch2_dev_allocator_add(c, ca); bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c); bch2_recalc_capacity(c);
return bch2_dev_allocator_start(ca);
} }
int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
...@@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ...@@ -1448,7 +1394,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
mutex_unlock(&c->sb_lock); mutex_unlock(&c->sb_lock);
if (new_state == BCH_MEMBER_STATE_rw) if (new_state == BCH_MEMBER_STATE_rw)
ret = __bch2_dev_read_write(c, ca); __bch2_dev_read_write(c, ca);
rebalance_wakeup(c); rebalance_wakeup(c);
...@@ -1710,13 +1656,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path) ...@@ -1710,13 +1656,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
ca->new_fs_bucket_idx = 0; ca->new_fs_bucket_idx = 0;
if (ca->mi.state == BCH_MEMBER_STATE_rw) { if (ca->mi.state == BCH_MEMBER_STATE_rw)
ret = __bch2_dev_read_write(c, ca); __bch2_dev_read_write(c, ca);
if (ret) {
bch_err(c, "device add error: error going RW on new device: %i", ret);
goto err_late;
}
}
up_write(&c->state_lock); up_write(&c->state_lock);
return 0; return 0;
...@@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path) ...@@ -1776,11 +1717,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
goto err; goto err;
} }
if (ca->mi.state == BCH_MEMBER_STATE_rw) { if (ca->mi.state == BCH_MEMBER_STATE_rw)
ret = __bch2_dev_read_write(c, ca); __bch2_dev_read_write(c, ca);
if (ret)
goto err;
}
mutex_lock(&c->sb_lock); mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb); mi = bch2_sb_get_members(c->disk_sb.sb);
......
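Since there is no allocator thread left to start, bringing a device read-write can no longer fail at this point, which is why __bch2_dev_read_write() drops its return value and the callers above lose their error handling. A sketch of the resulting helper, assuming only the calls left visible in the hunk:

static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
{
	lockdep_assert_held(&c->state_lock);

	bch2_dev_allocator_add(c, ca);
	bch2_recalc_capacity(c);
}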
...@@ -170,7 +170,6 @@ read_attribute(congested); ...@@ -170,7 +170,6 @@ read_attribute(congested);
read_attribute(btree_avg_write_size); read_attribute(btree_avg_write_size);
read_attribute(reserve_stats);
read_attribute(btree_cache_size); read_attribute(btree_cache_size);
read_attribute(compression_stats); read_attribute(compression_stats);
read_attribute(journal_debug); read_attribute(journal_debug);
...@@ -186,11 +185,11 @@ read_attribute(internal_uuid); ...@@ -186,11 +185,11 @@ read_attribute(internal_uuid);
read_attribute(has_data); read_attribute(has_data);
read_attribute(alloc_debug); read_attribute(alloc_debug);
write_attribute(wake_allocator);
read_attribute(read_realloc_races); read_attribute(read_realloc_races);
read_attribute(extent_migrate_done); read_attribute(extent_migrate_done);
read_attribute(extent_migrate_raced); read_attribute(extent_migrate_raced);
read_attribute(bucket_alloc_fail);
rw_attribute(discard); rw_attribute(discard);
rw_attribute(label); rw_attribute(label);
...@@ -377,6 +376,8 @@ SHOW(bch2_fs) ...@@ -377,6 +376,8 @@ SHOW(bch2_fs)
atomic_long_read(&c->extent_migrate_done)); atomic_long_read(&c->extent_migrate_done));
sysfs_print(extent_migrate_raced, sysfs_print(extent_migrate_raced,
atomic_long_read(&c->extent_migrate_raced)); atomic_long_read(&c->extent_migrate_raced));
sysfs_print(bucket_alloc_fail,
atomic_long_read(&c->bucket_alloc_fail));
sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
...@@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = { ...@@ -577,6 +578,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_read_realloc_races, &sysfs_read_realloc_races,
&sysfs_extent_migrate_done, &sysfs_extent_migrate_done,
&sysfs_extent_migrate_raced, &sysfs_extent_migrate_raced,
&sysfs_bucket_alloc_fail,
&sysfs_gc_gens_pos, &sysfs_gc_gens_pos,
...@@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = { ...@@ -705,24 +707,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL NULL
}; };
static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
{
enum alloc_reserve i;
spin_lock(&ca->fs->freelist_lock);
pr_buf(out, "free_inc:\t%zu\t%zu\n",
fifo_used(&ca->free_inc),
ca->free_inc.size);
for (i = 0; i < RESERVE_NR; i++)
pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
fifo_used(&ca->free[i]),
ca->free[i].size);
spin_unlock(&ca->fs->freelist_lock);
}
static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
{ {
struct bch_fs *c = ca->fs; struct bch_fs *c = ca->fs;
...@@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ...@@ -748,9 +732,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"ec\t%16llu\n" "ec\t%16llu\n"
"available%15llu\n" "available%15llu\n"
"\n" "\n"
"free_inc\t\t%zu/%zu\n"
"free[RESERVE_MOVINGGC]\t%zu/%zu\n"
"free[RESERVE_NONE]\t%zu/%zu\n"
"freelist_wait\t\t%s\n" "freelist_wait\t\t%s\n"
"open buckets allocated\t%u\n" "open buckets allocated\t%u\n"
"open buckets this dev\t%u\n" "open buckets this dev\t%u\n"
...@@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ...@@ -758,13 +739,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
"open_buckets_wait\t%s\n" "open_buckets_wait\t%s\n"
"open_buckets_btree\t%u\n" "open_buckets_btree\t%u\n"
"open_buckets_user\t%u\n" "open_buckets_user\t%u\n"
"btree reserve cache\t%u\n" "btree reserve cache\t%u\n",
"thread state:\t\t%s\n",
stats.buckets_ec, stats.buckets_ec,
__dev_buckets_available(ca, stats), __dev_buckets_available(ca, stats, RESERVE_none),
fifo_used(&ca->free_inc), ca->free_inc.size,
fifo_used(&ca->free[RESERVE_movinggc]), ca->free[RESERVE_movinggc].size,
fifo_used(&ca->free[RESERVE_none]), ca->free[RESERVE_none].size,
c->freelist_wait.list.first ? "waiting" : "empty", c->freelist_wait.list.first ? "waiting" : "empty",
OPEN_BUCKETS_COUNT - c->open_buckets_nr_free, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
ca->nr_open_buckets, ca->nr_open_buckets,
...@@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ...@@ -772,8 +749,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
c->open_buckets_wait.list.first ? "waiting" : "empty", c->open_buckets_wait.list.first ? "waiting" : "empty",
nr[BCH_DATA_btree], nr[BCH_DATA_btree],
nr[BCH_DATA_user], nr[BCH_DATA_user],
c->btree_reserve_cache_nr, c->btree_reserve_cache_nr);
bch2_allocator_states[ca->allocator_state]);
} }
static const char * const bch2_rw[] = { static const char * const bch2_rw[] = {
...@@ -848,9 +824,6 @@ SHOW(bch2_dev) ...@@ -848,9 +824,6 @@ SHOW(bch2_dev)
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX); * 100 / CONGESTED_MAX);
if (attr == &sysfs_reserve_stats)
reserve_stats_to_text(out, ca);
if (attr == &sysfs_alloc_debug) if (attr == &sysfs_alloc_debug)
dev_alloc_debug_to_text(out, ca); dev_alloc_debug_to_text(out, ca);
...@@ -890,9 +863,6 @@ STORE(bch2_dev) ...@@ -890,9 +863,6 @@ STORE(bch2_dev)
return ret; return ret;
} }
if (attr == &sysfs_wake_allocator)
bch2_wake_allocator(ca);
return size; return size;
} }
SYSFS_OPS(bch2_dev); SYSFS_OPS(bch2_dev);
...@@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = { ...@@ -918,11 +888,8 @@ struct attribute *bch2_dev_files[] = {
&sysfs_io_latency_stats_write, &sysfs_io_latency_stats_write,
&sysfs_congested, &sysfs_congested,
&sysfs_reserve_stats,
/* debug: */ /* debug: */
&sysfs_alloc_debug, &sysfs_alloc_debug,
&sysfs_wake_allocator,
NULL NULL
}; };
......
...@@ -471,37 +471,74 @@ TRACE_EVENT(invalidate, ...@@ -471,37 +471,74 @@ TRACE_EVENT(invalidate,
); );
DECLARE_EVENT_CLASS(bucket_alloc, DECLARE_EVENT_CLASS(bucket_alloc,
TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
TP_ARGS(ca, alloc_reserve), u64 avail,
u64 seen,
u64 open,
u64 need_journal_commit,
u64 nouse,
bool nonblocking,
int ret),
TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret),
TP_STRUCT__entry( TP_STRUCT__entry(
__field(dev_t, dev ) __field(dev_t, dev )
__array(char, reserve, 16 ) __array(char, reserve, 16 )
__field(u64, avail )
__field(u64, seen )
__field(u64, open )
__field(u64, need_journal_commit )
__field(u64, nouse )
__field(bool, nonblocking )
__field(int, ret )
), ),
TP_fast_assign( TP_fast_assign(
__entry->dev = ca->dev; __entry->dev = ca->dev;
strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); strlcpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve));
__entry->avail = avail;
__entry->seen = seen;
__entry->open = open;
__entry->need_journal_commit = need_journal_commit;
__entry->nouse = nouse;
__entry->nonblocking = nonblocking;
__entry->ret = ret;
), ),
TP_printk("%d,%d reserve %s", TP_printk("%d,%d reserve %s avail %llu seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u ret %i",
MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->reserve) __entry->reserve,
__entry->avail,
__entry->seen,
__entry->open,
__entry->need_journal_commit,
__entry->nouse,
__entry->nonblocking,
__entry->ret)
); );
DEFINE_EVENT(bucket_alloc, bucket_alloc, DEFINE_EVENT(bucket_alloc, bucket_alloc,
TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
TP_ARGS(ca, alloc_reserve) u64 avail,
u64 seen,
u64 open,
u64 need_journal_commit,
u64 nouse,
bool nonblocking,
int ret),
TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
); );
DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), TP_PROTO(struct bch_dev *ca, const char *alloc_reserve,
TP_ARGS(ca, alloc_reserve) u64 avail,
); u64 seen,
u64 open,
DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, u64 need_journal_commit,
TP_PROTO(struct bch_dev *ca, const char *alloc_reserve), u64 nouse,
TP_ARGS(ca, alloc_reserve) bool nonblocking,
int ret),
TP_ARGS(ca, alloc_reserve, avail, seen, open, need_journal_commit, nouse, nonblocking, ret)
); );
/* Moving IO */ /* Moving IO */
......
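The event class now records why an allocation attempt succeeded or failed rather than just the reserve name. A sketch of how a caller might emit the widened event, assuming the usual trace_<name>() wrapper that DEFINE_EVENT generates; the counter values are whatever the allocator gathered while scanning (buckets seen, open, awaiting a journal commit, marked nouse):

static void report_bucket_alloc_fail_sketch(struct bch_dev *ca,
					    const char *reserve,
					    u64 avail, u64 seen, u64 open,
					    u64 need_journal_commit,
					    u64 nouse, bool nonblocking,
					    int ret)
{
	trace_bucket_alloc_fail(ca, reserve, avail, seen, open,
				need_journal_commit, nouse, nonblocking, ret);
}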