Commit f26c67f4 authored by Kent Overstreet's avatar Kent Overstreet

bcachefs: Snapshot depth, skiplist fields

This extends KEY_TYPE_snapshot to include some new fields:
 - depth, to indicate depth of this particular node from the root
 - skip[3], skiplist entries for quickly walking back up to the root

These are to improve bch2_snapshot_is_ancestor(), making it O(ln(n))
instead of O(n) in the snapshot tree depth.

Skiplist nodes are picked at random from the set of ancestor nodes, not
some fixed fraction.

This introduces bcachefs_metadata_version 1.1, snapshot_skiplists.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 065bd335
...@@ -1148,6 +1148,8 @@ struct bch_snapshot { ...@@ -1148,6 +1148,8 @@ struct bch_snapshot {
__le32 children[2]; __le32 children[2];
__le32 subvol; __le32 subvol;
__le32 tree; __le32 tree;
__le32 depth;
__le32 skip[3];
}; };
LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
...@@ -1625,7 +1627,9 @@ struct bch_sb_field_journal_seq_blacklist { ...@@ -1625,7 +1627,9 @@ struct bch_sb_field_journal_seq_blacklist {
x(snapshot_trees, BCH_VERSION(0, 29), \ x(snapshot_trees, BCH_VERSION(0, 29), \
RECOVERY_PASS_ALL_FSCK) \ RECOVERY_PASS_ALL_FSCK) \
x(major_minor, BCH_VERSION(1, 0), \ x(major_minor, BCH_VERSION(1, 0), \
0) 0) \
x(snapshot_skiplists, BCH_VERSION(1, 1), \
BIT_ULL(BCH_RECOVERY_PASS_check_snapshots))
enum bcachefs_metadata_version { enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9, bcachefs_metadata_version_min = 9,
......
...@@ -795,6 +795,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, ...@@ -795,6 +795,14 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags))) (_journal_seq), (_commit_flags)))
/*
 * for_each_btree_key_reverse_commit() - iterate over btree keys in reverse
 * order, committing the transaction after each iteration.
 *
 * For each key _k visited by for_each_btree_key_reverse(), evaluate _do;
 * if _do returns 0, follow it with bch2_trans_commit() using the given disk
 * reservation, journal sequence pointer and commit flags (gcc ?: extension:
 * a nonzero _do result is returned as-is and the commit is skipped).
 */
#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \
_start, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ #define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \
_start, _end, _iter_flags, _k, \ _start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\ _disk_res, _journal_seq, _commit_flags,\
......
...@@ -594,10 +594,21 @@ static int bch2_journal_replay_key(struct btree_trans *trans, ...@@ -594,10 +594,21 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned iter_flags = unsigned iter_flags =
BTREE_ITER_INTENT| BTREE_ITER_INTENT|
BTREE_ITER_NOT_EXTENTS; BTREE_ITER_NOT_EXTENTS;
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret; int ret;
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
* besides the allocator is doing updates yet so we don't need key cache
* coherency for non-alloc btrees, and key cache fills for snapshots
* btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until
* the snapshots recovery pass runs.
*/
if (!k->level && k->btree_id == BTREE_ID_alloc) if (!k->level && k->btree_id == BTREE_ID_alloc)
iter_flags |= BTREE_ITER_CACHED; iter_flags |= BTREE_ITER_CACHED;
else
update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
BTREE_MAX_DEPTH, k->level, BTREE_MAX_DEPTH, k->level,
...@@ -610,7 +621,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, ...@@ -610,7 +621,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
if (k->overwritten) if (k->overwritten)
goto out; goto out;
ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ret = bch2_trans_update(trans, &iter, k->k, update_flags);
out: out:
bch2_trans_iter_exit(trans, &iter); bch2_trans_iter_exit(trans, &iter);
return ret; return ret;
......
This diff is collapsed.
...@@ -37,9 +37,34 @@ static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ...@@ -37,9 +37,34 @@ static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
return genradix_ptr(&c->snapshots, U32_MAX - id); return genradix_ptr(&c->snapshots, U32_MAX - id);
} }
/*
 * Plain parent-pointer lookup with no consistency checking.
 * NOTE(review): unlike bch2_snapshot_parent(), this skips the debug depth
 * check — presumably for use before depth fields are initialized; confirm
 * against callers.
 */
static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
{
	u32 parent = snapshot_t(c, id)->parent;

	return parent;
}
/*
 * bch2_snapshot_parent - return the parent snapshot ID of @id (0 when @id
 * has no parent, i.e. is a tree root).
 *
 * In CONFIG_BCACHEFS_DEBUG builds this cross-checks the cached depth
 * fields: a child's depth must be exactly one more than its parent's.
 * A mismatch means the in-memory snapshot table is inconsistent, so we
 * panic with both IDs and depths for diagnosis.
 *
 * (Reconstructed: the rendered diff duplicated the unchanged context
 * lines, leaving non-compiling text.)
 */
static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	u32 parent = snapshot_t(c, id)->parent;

	if (parent &&
	    snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
		panic("id %u depth=%u parent %u depth=%u\n",
		      id, snapshot_t(c, id)->depth,
		      parent, snapshot_t(c, parent)->depth);

	return parent;
#else
	return snapshot_t(c, id)->parent;
#endif
}
/*
 * bch2_snapshot_nth_parent - walk @n levels up the snapshot tree from @id.
 *
 * Returns the ancestor @n steps above @id; @n == 0 returns @id itself.
 * Each step goes through bch2_snapshot_parent(), so debug builds get the
 * depth consistency check on every hop.
 *
 * (Reconstructed: the rendered diff duplicated the shared closing brace.)
 */
static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n)
{
	while (n--)
		id = bch2_snapshot_parent(c, id);
	return id;
}
static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
...@@ -84,13 +109,7 @@ static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) ...@@ -84,13 +109,7 @@ static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
return 0; return 0;
} }
static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32);
{
while (id && id < ancestor)
id = bch2_snapshot_parent(c, id);
return id == ancestor;
}
static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
{ {
......
...@@ -8,6 +8,8 @@ typedef DARRAY(u32) snapshot_id_list; ...@@ -8,6 +8,8 @@ typedef DARRAY(u32) snapshot_id_list;
struct snapshot_t { struct snapshot_t {
u32 parent; u32 parent;
u32 skip[3];
u32 depth;
u32 children[2]; u32 children[2];
u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 subvol; /* Nonzero only if a subvolume points to this node: */
u32 tree; u32 tree;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment