Commit be9e782d authored by Kent Overstreet

bcachefs: Don't downgrade locks on transaction restart

We should only be downgrading locks on success - otherwise, our
transaction restarts won't be getting the correct locks and we'll
livelock.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 2e7acdfb
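
The change itself is small: __bch2_btree_path_downgrade() and bch2_trans_downgrade() now return early if trans->restarted is set, and the commit path calls bch2_trans_downgrade() only when the commit actually succeeded. The snippet below is a minimal standalone sketch of that rule, not the bcachefs code itself — struct path, trans_commit() and locks_want here are simplified stand-ins — showing why downgrading on a restart would leave the retry holding weaker locks than it needs:

/*
 * Toy model of the rule enforced by this commit: downgrade locks only on
 * success, never on transaction restart, so that a retry still requests
 * the locks it originally wanted instead of livelocking.
 *
 * All types and functions here are simplified stand-ins, not bcachefs APIs.
 */
#include <stdbool.h>
#include <stdio.h>

struct path {
	unsigned locks_want;	/* how many btree levels we want intent-locked */
	bool	 restarted;	/* set when the transaction must be retried */
};

/* Mirrors the new guard in bch2_trans_downgrade(): no-op on restart. */
static void trans_downgrade(struct path *p)
{
	if (p->restarted)
		return;			/* keep locks_want intact for the retry */
	p->locks_want = 1;		/* success: intent locks no longer needed */
}

/* Mirrors the commit path: downgrade only when the commit succeeded. */
static int trans_commit(struct path *p, bool conflict)
{
	int ret = conflict ? -1 : 0;	/* -1 models a transaction restart */

	p->restarted = (ret != 0);
	if (!ret)
		trans_downgrade(p);
	return ret;
}

int main(void)
{
	struct path p = { .locks_want = 3 };

	trans_commit(&p, true);		/* restart: locks_want stays at 3 */
	printf("after restart: locks_want=%u\n", p.locks_want);

	trans_commit(&p, false);	/* success: safe to downgrade */
	printf("after success: locks_want=%u\n", p.locks_want);
	return 0;
}

The same reasoning motivates the other hunks: bch2_btree_node_rewrite() now goes through bch2_trans_downgrade() (which honours the restart check), and the new alloc_seq/downgrade_seq counters plus the extended trans_restart_upgrade and new path_downgrade tracepoints make it easier to spot a path that keeps restarting because its locks were dropped.
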
@@ -1523,6 +1523,7 @@ static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
 	path->ref		= 0;
 	path->intent_ref	= 0;
 	path->nodes_locked	= 0;
+	path->alloc_seq++;
 
 	btree_path_list_add(trans, pos, path);
 	trans->paths_sorted = false;
@@ -1598,7 +1599,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
 	locks_want = min(locks_want, BTREE_MAX_DEPTH);
 	if (locks_want > path->locks_want)
-		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want);
+		bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
 
 	return path;
 }
...
@@ -509,7 +509,7 @@ bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree
 	 * path->uptodate yet:
 	 */
 	if (!path->locks_want &&
-	    !__bch2_btree_path_upgrade(trans, path, 1)) {
+	    !__bch2_btree_path_upgrade(trans, path, 1, NULL)) {
 		trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_);
 		ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade);
 		goto err;
...
@@ -431,7 +431,8 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 
 static inline bool btree_path_get_locks(struct btree_trans *trans,
 					struct btree_path *path,
-					bool upgrade)
+					bool upgrade,
+					struct get_locks_fail *f)
 {
 	unsigned l = path->level;
 	int fail_idx = -1;
@@ -442,8 +443,14 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
 		if (!(upgrade
 		      ? bch2_btree_node_upgrade(trans, path, l)
-		      : bch2_btree_node_relock(trans, path, l)))
+		      : bch2_btree_node_relock(trans, path, l))) {
 			fail_idx = l;
+
+			if (f) {
+				f->l	= l;
+				f->b	= path->l[l].b;
+			}
+		}
 
 		l++;
 	} while (l < path->locks_want);
@@ -584,7 +591,9 @@ __flatten
 bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
 			struct btree_path *path, unsigned long trace_ip)
 {
-	return btree_path_get_locks(trans, path, false);
+	struct get_locks_fail f;
+
+	return btree_path_get_locks(trans, path, false, &f);
 }
 
 int __bch2_btree_path_relock(struct btree_trans *trans,
@@ -600,22 +609,24 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
 
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
 			       struct btree_path *path,
-			       unsigned new_locks_want)
+			       unsigned new_locks_want,
+			       struct get_locks_fail *f)
 {
 	EBUG_ON(path->locks_want >= new_locks_want);
 
 	path->locks_want = new_locks_want;
 
-	return btree_path_get_locks(trans, path, true);
+	return btree_path_get_locks(trans, path, true, f);
 }
 
 bool __bch2_btree_path_upgrade(struct btree_trans *trans,
 			       struct btree_path *path,
-			       unsigned new_locks_want)
+			       unsigned new_locks_want,
+			       struct get_locks_fail *f)
 {
 	struct btree_path *linked;
 
-	if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want))
+	if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
 		return true;
 
 	/*
@@ -644,7 +655,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
 		    linked->btree_id == path->btree_id &&
 		    linked->locks_want < new_locks_want) {
 			linked->locks_want = new_locks_want;
-			btree_path_get_locks(trans, linked, true);
+			btree_path_get_locks(trans, linked, true, NULL);
 		}
 
 	return false;
@@ -656,6 +667,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
 {
 	unsigned l;
 
+	if (trans->restarted)
+		return;
+
 	EBUG_ON(path->locks_want < new_locks_want);
 
 	path->locks_want = new_locks_want;
@@ -674,6 +688,9 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
 	}
 
 	bch2_btree_path_verify_locks(path);
+
+	path->downgrade_seq++;
+	trace_path_downgrade(trans, _RET_IP_, path);
 }
 
 /* Btree transaction locking: */
@@ -682,6 +699,9 @@ void bch2_trans_downgrade(struct btree_trans *trans)
 {
 	struct btree_path *path;
 
+	if (trans->restarted)
+		return;
+
 	trans_for_each_path(trans, path)
 		bch2_btree_path_downgrade(trans, path);
 }
...
@@ -355,26 +355,36 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
 
 /* upgrade */
 
+struct get_locks_fail {
+	unsigned	l;
+	struct btree	*b;
+};
+
 bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
-			       struct btree_path *, unsigned);
+			       struct btree_path *, unsigned,
+			       struct get_locks_fail *);
 bool __bch2_btree_path_upgrade(struct btree_trans *,
-			       struct btree_path *, unsigned);
+			       struct btree_path *, unsigned,
+			       struct get_locks_fail *);
 
 static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
 					  struct btree_path *path,
 					  unsigned new_locks_want)
 {
+	struct get_locks_fail f;
 	unsigned old_locks_want = path->locks_want;
 
 	new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
 	if (path->locks_want < new_locks_want
-	    ? __bch2_btree_path_upgrade(trans, path, new_locks_want)
+	    ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
 	    : path->uptodate == BTREE_ITER_UPTODATE)
 		return 0;
 
 	trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
-			old_locks_want, new_locks_want);
+			old_locks_want, new_locks_want, &f);
 	return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
 }
...
@@ -861,12 +861,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
 	 */
 	bch2_journal_res_put(&c->journal, &trans->journal_res);
 
-	if (unlikely(ret))
-		return ret;
-
-	bch2_trans_downgrade(trans);
-	return 0;
+	return ret;
 }
 
 static int journal_reclaim_wait_done(struct bch_fs *c)
@@ -1135,6 +1130,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 	if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
 		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
 out_reset:
+	if (!ret)
+		bch2_trans_downgrade(trans);
 	bch2_trans_reset_updates(trans);
 
 	return ret;
...
@@ -228,6 +228,8 @@ struct btree_path {
 	u8			sorted_idx;
 	u8			ref;
 	u8			intent_ref;
+	u32			alloc_seq;
+	u32			downgrade_seq;
 
 	/* btree_iter_copy starts here: */
 	struct bpos		pos;
...
@@ -1987,7 +1987,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 out:
 	if (new_path)
 		bch2_path_put(trans, new_path, true);
-	bch2_btree_path_downgrade(trans, iter->path);
+	bch2_trans_downgrade(trans);
 	return ret;
 err:
 	bch2_btree_node_free_never_used(as, trans, n);
...
@@ -162,11 +162,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 			if (((1U << i) & m->data_opts.rewrite_ptrs) &&
 			    (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
 			    !ptr->cached) {
-				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr);
-				/*
-				 * See comment below:
-				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
-				 */
+				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
 				rewrites_found |= 1U << i;
 			}
 
 			i++;
@@ -212,14 +208,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 			if (!p.ptr.cached &&
 			    durability - ptr_durability >= m->op.opts.data_replicas) {
 				durability -= ptr_durability;
-				bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr);
-				/*
-				 * Currently, we're dropping unneeded replicas
-				 * instead of marking them as cached, since
-				 * cached data in stripe buckets prevents them
-				 * from being reused:
-				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
-				 */
+				bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
 				goto restart_drop_extra_replicas;
 			}
 		}
...
@@ -1043,13 +1043,16 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
 	TP_ARGS(trans, caller_ip, path)
 );
 
+struct get_locks_fail;
+
 TRACE_EVENT(trans_restart_upgrade,
 	TP_PROTO(struct btree_trans *trans,
 		 unsigned long caller_ip,
 		 struct btree_path *path,
 		 unsigned old_locks_want,
-		 unsigned new_locks_want),
-	TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want),
+		 unsigned new_locks_want,
+		 struct get_locks_fail *f),
+	TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f),
 
 	TP_STRUCT__entry(
 		__array(char,			trans_fn, 32	)
@@ -1057,6 +1060,11 @@ TRACE_EVENT(trans_restart_upgrade,
 		__field(u8,			btree_id	)
 		__field(u8,			old_locks_want	)
 		__field(u8,			new_locks_want	)
+		__field(u8,			level		)
+		__field(u32,			path_seq	)
+		__field(u32,			node_seq	)
+		__field(u32,			path_alloc_seq	)
+		__field(u32,			downgrade_seq	)
 		TRACE_BPOS_entries(pos)
 	),
 
@@ -1066,10 +1074,15 @@ TRACE_EVENT(trans_restart_upgrade,
 		__entry->btree_id		= path->btree_id;
 		__entry->old_locks_want		= old_locks_want;
 		__entry->new_locks_want		= new_locks_want;
+		__entry->level			= f->l;
+		__entry->path_seq		= path->l[f->l].lock_seq;
+		__entry->node_seq		= IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
+		__entry->path_alloc_seq		= path->alloc_seq;
+		__entry->downgrade_seq		= path->downgrade_seq;
 		TRACE_BPOS_assign(pos, path->pos)
 	),
 
-	TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u",
+	TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
 		  __entry->trans_fn,
 		  (void *) __entry->caller_ip,
 		  bch2_btree_id_str(__entry->btree_id),
@@ -1077,7 +1090,12 @@ TRACE_EVENT(trans_restart_upgrade,
 		  __entry->pos_offset,
 		  __entry->pos_snapshot,
 		  __entry->old_locks_want,
-		  __entry->new_locks_want)
+		  __entry->new_locks_want,
+		  __entry->level,
+		  __entry->path_seq,
+		  __entry->node_seq,
+		  __entry->path_alloc_seq,
+		  __entry->downgrade_seq)
 );
 
 DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
@@ -1238,6 +1256,27 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
 		  __entry->new_u64s)
 );
 
+TRACE_EVENT(path_downgrade,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip,
+		 struct btree_path *path),
+	TP_ARGS(trans, caller_ip, path),
+
+	TP_STRUCT__entry(
+		__array(char,			trans_fn, 32	)
+		__field(unsigned long,		caller_ip	)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+		__entry->caller_ip		= caller_ip;
+	),
+
+	TP_printk("%s %pS",
+		  __entry->trans_fn,
+		  (void *) __entry->caller_ip)
+);
+
 DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
 	TP_PROTO(struct btree_trans *trans,
 		 unsigned long caller_ip),
...