Commit 8042b5b7 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Extents may now cross btree node boundaries

When snapshots arrive, we won't necessarily be able to arbitrarily split
existing extents - when we need to split an existing extent, we'll have to
check whether the extent was overwritten in child snapshots and, if so,
emit a whiteout for the split in the child snapshot.

Previously, because extents couldn't span btree nodes, journal replay
would sometimes have to split existing extents. That's no good anymore -
but fortunately, since extent handling has already been lifted above most
of the btree code, there's no longer any real need for that rule.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 7e1a3aa9
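The payoff shows up in the journal replay hunks near the end of this diff: bch2_extent_replay_key(), which clipped each journaled extent to the current btree node and committed it piecewise (taking disk reservations so compressed extents could be split), is deleted, and bch2_journal_replay() now feeds every key through bch2_journal_replay_key(). Below is a minimal sketch of the resulting replay loop, assuming bcachefs's internal headers; the journal_keys fields and the loop wrapper here are illustrative, not the actual driver in bch2_journal_replay():

/*
 * Illustrative sketch only (not code from this patch): with extents allowed
 * to cross btree node boundaries, replay no longer needs a special path that
 * splits extents at node boundaries - every journaled key, extent or not,
 * goes through the same helper.  The journal_keys layout assumed here
 * (keys->d, keys->nr) is an assumption made for the sketch.
 */
static int replay_all_keys_sketch(struct bch_fs *c, struct journal_keys *keys)
{
	struct journal_key *i;
	int ret;

	for (i = keys->d; i < keys->d + keys->nr; i++) {
		/* previously, extent keys were clipped and replayed piecewise
		 * via bch2_extent_replay_key(); now everything takes the same
		 * path: */
		ret = bch2_journal_replay_key(c, i);
		if (ret)
			return ret;
	}
	return 0;
}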
@@ -1346,13 +1346,19 @@ LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
x(reflink_inline_data, 14) \
x(new_varint, 15) \
x(journal_no_flush, 16) \
x(alloc_v2, 17)
x(alloc_v2, 17) \
x(extents_across_btree_nodes, 18)
#define BCH_SB_FEATURES_ALWAYS \
((1ULL << BCH_FEATURE_new_extent_overwrite)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_btree_updates_journalled)|\
(1ULL << BCH_FEATURE_extents_across_btree_nodes))
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_new_extent_overwrite)| \
(BCH_SB_FEATURES_ALWAYS| \
(1ULL << BCH_FEATURE_new_siphash)| \
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush)| \
(1ULL << BCH_FEATURE_alloc_v2))
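With the new extents_across_btree_nodes bit folded into BCH_SB_FEATURES_ALWAYS, the features every read-write mount must advertise are expressed as one mask, and BCH_SB_FEATURES_ALL is now defined on top of it; the bch2_fs_mark_dirty() hunk at the end of this diff sets the whole mask instead of OR-ing individual bits. A small sketch of testing the new bit on a superblock follows - the helper is hypothetical, not part of this patch:

/*
 * Hypothetical helper, for illustration only: check whether a superblock
 * already advertises the extents_across_btree_nodes feature added above.
 * Assumes bch_sb::features[0] is a __le64 bitmask, as used elsewhere in
 * this header.
 */
static inline bool sb_has_extents_across_btree_nodes(const struct bch_sb *sb)
{
	return le64_to_cpu(sb->features[0]) &
		(1ULL << BCH_FEATURE_extents_across_btree_nodes);
}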
@@ -1814,11 +1814,8 @@ struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
struct btree_iter_level *l = &iter->l[0];
struct btree_node_iter node_iter;
struct bkey_s_c k;
struct bkey n;
int ret;
struct bpos pos, next_start;
/* keys & holes can't span inode numbers: */
if (iter->pos.offset == KEY_OFFSET_MAX) {
@@ -1826,50 +1823,31 @@ __bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
return bkey_s_c_null;
bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
}
/*
* iterator is now at the correct position for inserting at iter->pos,
* but we need to keep iterating until we find the first non whiteout so
* we know how big a hole we have, if any:
*/
node_iter = l->iter;
k = __btree_iter_unpack(iter, l, &iter->k,
bch2_btree_node_iter_peek(&node_iter, l->b));
if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
/*
* We're not setting iter->uptodate because the node iterator
* doesn't necessarily point at the key we're returning:
*/
pos = iter->pos;
k = bch2_btree_iter_peek(iter);
iter->pos = pos;
EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
bch2_btree_iter_verify(iter);
if (bkey_err(k))
return k;
}
/* hole */
if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
return k;
if (!k.k)
k.k = &l->b->key.k;
next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
bkey_init(&n);
n.p = iter->pos;
bch2_key_resize(&n,
bkey_init(&iter->k);
iter->k.p = iter->pos;
bch2_key_resize(&iter->k,
min_t(u64, KEY_SIZE_MAX,
(k.k->p.inode == n.p.inode
? bkey_start_offset(k.k)
(next_start.inode == iter->pos.inode
? next_start.offset
: KEY_OFFSET_MAX) -
n.p.offset));
iter->pos.offset));
EBUG_ON(!n.size);
EBUG_ON(!iter->k.size);
iter->k = n;
iter->uptodate = BTREE_ITER_UPTODATE;
bch2_btree_iter_verify_entry_exit(iter);
@@ -1893,13 +1871,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->uptodate == BTREE_ITER_UPTODATE)
return btree_iter_peek_uptodate(iter);
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return bkey_s_c_err(ret);
if (iter->flags & BTREE_ITER_IS_EXTENTS)
return __bch2_btree_iter_peek_slot_extents(iter);
k = __btree_iter_peek_all(iter, l, &iter->k);
EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
@@ -62,9 +62,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *iter,
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
bkey_cmp(bkey_start_pos(&insert->k),
bkey_predecessor(b->data->min_key)) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0);
EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0);
EBUG_ON(insert->k.u64s >
@@ -705,26 +702,31 @@ static inline int btree_iter_pos_cmp(const struct btree_iter *l,
bkey_cmp(l->pos, r->pos);
}
static void bch2_trans_update2(struct btree_trans *trans,
static int bch2_trans_update2(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct btree_insert_entry *i, n = (struct btree_insert_entry) {
.iter = iter, .k = insert
};
int ret;
btree_insert_entry_checks(trans, n.iter, n.k);
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
ret = bch2_btree_iter_traverse(iter);
if (unlikely(ret))
return ret;
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
trans_for_each_update2(trans, i) {
if (btree_iter_pos_cmp(n.iter, i->iter) == 0) {
*i = n;
return;
return 0;
}
if (btree_iter_pos_cmp(n.iter, i->iter) <= 0)
@@ -733,6 +735,7 @@ static void bch2_trans_update2(struct btree_trans *trans,
array_insert_item(trans->updates2, trans->nr_updates2,
i - trans->updates2, n);
return 0;
}
static int extent_update_to_keys(struct btree_trans *trans,
@@ -753,9 +756,9 @@ static int extent_update_to_keys(struct btree_trans *trans,
iter->flags |= BTREE_ITER_INTENT;
__bch2_btree_iter_set_pos(iter, insert->k.p, false);
bch2_trans_update2(trans, iter, insert);
ret = bch2_trans_update2(trans, iter, insert);
bch2_trans_iter_put(trans, iter);
return 0;
return ret;
}
static int extent_handle_overwrites(struct btree_trans *trans,
@@ -785,8 +788,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_back(start, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
}
if (bkey_cmp(k.k->p, end) > 0) {
@@ -800,8 +805,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
bch2_cut_front(end, update);
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
} else {
update_iter = bch2_trans_copy_iter(trans, iter);
@@ -815,8 +822,10 @@ static int extent_handle_overwrites(struct btree_trans *trans,
update->k.size = 0;
__bch2_btree_iter_set_pos(update_iter, update->k.p, false);
bch2_trans_update2(trans, update_iter, update);
ret = bch2_trans_update2(trans, update_iter, update);
bch2_trans_iter_put(trans, update_iter);
if (ret)
goto err;
}
k = bch2_btree_iter_next_with_updates(iter);
@@ -921,11 +930,11 @@ int __bch2_trans_commit(struct btree_trans *trans)
trans_for_each_update(trans, i) {
if (i->iter->flags & BTREE_ITER_IS_EXTENTS) {
ret = extent_update_to_keys(trans, i->iter, i->k);
if (ret)
goto out;
} else {
bch2_trans_update2(trans, i->iter, i->k);
ret = bch2_trans_update2(trans, i->iter, i->k);
}
if (ret)
goto out;
}
trans_for_each_update2(trans, i) {
@@ -1321,9 +1321,6 @@ int bch2_mark_update(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree *b = iter_l(iter)->b;
struct btree_node_iter node_iter = iter_l(iter)->iter;
struct bkey_packed *_old;
struct bkey_s_c old;
struct bkey unpacked;
int ret = 0;
@@ -1363,23 +1360,24 @@ int bch2_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
struct btree_iter *copy;
BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bch2_mark_key_locked(c, old, bkey_i_to_s_c(new),
0, new->k.size,
fs_usage, trans->journal_res.seq,
BTREE_TRIGGER_INSERT|flags);
while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
unsigned offset = 0;
s64 sectors;
copy = bch2_trans_copy_iter(trans, iter);
old = bkey_disassemble(b, _old, &unpacked);
sectors = -((s64) old.k->size);
for_each_btree_key_continue(copy, 0, old, ret) {
unsigned offset = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
return 0;
break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@@ -1412,9 +1410,8 @@ int bch2_mark_update(struct btree_trans *trans,
trans->journal_res.seq, flags) ?: 1;
if (ret <= 0)
break;
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
}
return ret;
@@ -1445,27 +1442,20 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
pr_err("overlapping with");
if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) {
struct btree *b = iter_l(i->iter)->b;
struct btree_node_iter node_iter = iter_l(i->iter)->iter;
struct bkey_packed *_k;
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k;
struct btree_iter *copy = bch2_trans_copy_iter(trans, i->iter);
struct bkey_s_c k;
int ret;
pr_info("_k %px format %u", _k, _k->format);
k = bkey_disassemble(b, _k, &unpacked);
if (btree_node_is_extents(b)
for_each_btree_key_continue(copy, 0, k, ret) {
if (btree_node_type_is_extents(i->iter->btree_id)
? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
: bkey_cmp(i->k->k.p, k.k->p))
break;
bch2_bkey_val_to_text(&PBUF(buf), c, k);
pr_err("%s", buf);
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
} else {
struct bkey_cached *ck = (void *) i->iter->l[0].b;
@@ -1860,8 +1850,6 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
}
bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
bch2_trans_update(trans, iter, n, 0);
out:
ret = sectors;
@@ -1987,15 +1975,13 @@ int bch2_trans_mark_update(struct btree_trans *trans,
BTREE_TRIGGER_OVERWRITE|flags);
}
} else {
struct btree *b = iter_l(iter)->b;
struct btree_node_iter node_iter = iter_l(iter)->iter;
struct bkey_packed *_old;
struct bkey unpacked;
struct btree_iter *copy;
struct bkey _old;
EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
bkey_init(&unpacked);
old = (struct bkey_s_c) { &unpacked, NULL };
bkey_init(&_old);
old = (struct bkey_s_c) { &_old, NULL };
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
0, new->k.size,
@@ -2003,18 +1989,16 @@ int bch2_trans_mark_update(struct btree_trans *trans,
if (ret)
return ret;
while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) {
unsigned flags = BTREE_TRIGGER_OVERWRITE;
unsigned offset = 0;
s64 sectors;
copy = bch2_trans_copy_iter(trans, iter);
old = bkey_disassemble(b, _old, &unpacked);
sectors = -((s64) old.k->size);
for_each_btree_key_continue(copy, 0, old, ret) {
unsigned offset = 0;
s64 sectors = -((s64) old.k->size);
flags |= BTREE_TRIGGER_OVERWRITE;
if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
return 0;
break;
switch (bch2_extent_overlap(&new->k, old.k)) {
case BCH_EXTENT_OVERLAP_ALL:
@@ -2045,10 +2029,9 @@ int bch2_trans_mark_update(struct btree_trans *trans,
ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
offset, sectors, flags);
if (ret)
return ret;
bch2_btree_node_iter_advance(&node_iter, b);
break;
}
bch2_trans_iter_put(trans, copy);
}
return ret;
@@ -99,24 +99,12 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
struct bpos *end)
{
struct btree_trans *trans = iter->trans;
struct btree *b;
struct btree_node_iter node_iter;
struct bkey_packed *_k;
unsigned nr_iters = 0;
struct btree_iter *copy;
struct bkey_s_c k;
unsigned nr_iters = 0;
int ret;
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
b = iter->l[0].b;
node_iter = iter->l[0].iter;
BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) &&
bkey_cmp(bkey_start_pos(&insert->k),
bkey_predecessor(b->data->min_key)) < 0);
*end = bpos_min(insert->k.p, b->key.k.p);
*end = insert->k.p;
/* extent_update_to_keys(): */
nr_iters += 1;
@@ -126,9 +114,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
if (ret < 0)
return ret;
while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) {
struct bkey unpacked;
struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
copy = bch2_trans_copy_iter(trans, iter);
for_each_btree_key_continue(copy, 0, k, ret) {
unsigned offset = 0;
if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
@@ -155,10 +143,9 @@ int bch2_extent_atomic_end(struct btree_iter *iter,
&nr_iters, EXTENT_ITERS_MAX);
if (ret)
break;
bch2_btree_node_iter_advance(&node_iter, b);
}
bch2_trans_iter_put(trans, copy);
return ret < 0 ? ret : 0;
}
@@ -506,115 +506,6 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++);
}
static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
struct bkey_i *k)
{
struct btree_trans trans;
struct btree_iter *iter, *split_iter;
/*
* We might cause compressed extents to be split, so we need to pass in
* a disk_reservation:
*/
struct disk_reservation disk_res =
bch2_disk_reservation_init(c, 0);
struct bkey_i *split;
struct bpos atomic_end;
/*
* Some extents aren't equivalent - w.r.t. what the triggers do
* - if they're split:
*/
bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
k->k.type == KEY_TYPE_reflink_p;
bool remark = false;
int ret;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
retry:
bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, btree_id,
bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
do {
ret = bch2_btree_iter_traverse(iter);
if (ret)
goto err;
atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p);
split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k));
ret = PTR_ERR_OR_ZERO(split);
if (ret)
goto err;
if (!remark &&
remark_if_split &&
bkey_cmp(atomic_end, k->k.p) < 0) {
ret = bch2_disk_reservation_add(c, &disk_res,
k->k.size *
bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
BCH_DISK_RESERVATION_NOFAIL);
BUG_ON(ret);
remark = true;
}
bkey_copy(split, k);
bch2_cut_front(iter->pos, split);
bch2_cut_back(atomic_end, split);
split_iter = bch2_trans_copy_iter(&trans, iter);
/*
* It's important that we don't go through the
* extent_handle_overwrites() and extent_update_to_keys() path
* here: journal replay is supposed to treat extents like
* regular keys
*/
__bch2_btree_iter_set_pos(split_iter, split->k.p, false);
bch2_trans_update(&trans, split_iter, split,
BTREE_TRIGGER_NORUN);
bch2_trans_iter_put(&trans, split_iter);
bch2_btree_iter_set_pos(iter, split->k.p);
if (remark) {
ret = bch2_trans_mark_key(&trans,
bkey_s_c_null,
bkey_i_to_s_c(split),
0, split->k.size,
BTREE_TRIGGER_INSERT);
if (ret)
goto err;
}
} while (bkey_cmp(iter->pos, k->k.p) < 0);
if (remark) {
ret = bch2_trans_mark_key(&trans,
bkey_i_to_s_c(k),
bkey_s_c_null,
0, -((s64) k->k.size),
BTREE_TRIGGER_OVERWRITE);
if (ret)
goto err;
}
ret = bch2_trans_commit(&trans, &disk_res, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY);
err:
bch2_trans_iter_put(&trans, iter);
if (ret == -EINTR)
goto retry;
bch2_disk_reservation_put(c, &disk_res);
return bch2_trans_exit(&trans) ?: ret;
}
static int __bch2_journal_replay_key(struct btree_trans *trans,
enum btree_id id, unsigned level,
struct bkey_i *k)
@@ -753,9 +644,7 @@ static int bch2_journal_replay(struct bch_fs *c,
replay_now_at(j, keys.journal_seq_base + i->journal_seq);
ret = i->k->k.size
? bch2_extent_replay_key(c, i->btree_id, i->k)
: bch2_journal_replay_key(c, i);
ret = bch2_journal_replay_key(c, i);
if (ret)
goto err;
}
@@ -956,9 +956,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
ret = bch2_write_super(c);
mutex_unlock(&c->sb_lock);