Commit 76426098 authored by Kent Overstreet, committed by Kent Overstreet

bcachefs: Reflink

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 3c7f3b7a
@@ -44,6 +44,7 @@ bcachefs-y := \
 quota.o \
 rebalance.o \
 recovery.o \
+reflink.o \
 replicas.o \
 siphash.o \
 six.o \
......
@@ -361,6 +361,7 @@ enum gc_phase {
 GC_PHASE_BTREE_XATTRS,
 GC_PHASE_BTREE_ALLOC,
 GC_PHASE_BTREE_QUOTAS,
+GC_PHASE_BTREE_REFLINK,
 GC_PHASE_PENDING_DELETE,
 GC_PHASE_ALLOC,
@@ -750,6 +751,9 @@ struct bch_fs {
 struct work_struct ec_stripe_delete_work;
 struct llist_head ec_stripe_delete_list;
+/* REFLINK */
+u64 reflink_hint;
 /* VFS IO PATH - fs-io.c */
 struct bio_set writepage_bioset;
 struct bio_set dio_write_bioset;
......
@@ -340,7 +340,9 @@ static inline void bkey_init(struct bkey *k)
 x(xattr, 11) \
 x(alloc, 12) \
 x(quota, 13) \
-x(stripe, 14)
+x(stripe, 14) \
+x(reflink_p, 15) \
+x(reflink_v, 16)
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name = nr,
@@ -895,6 +897,24 @@ struct bch_stripe {
 struct bch_extent_ptr ptrs[0];
 } __attribute__((packed, aligned(8)));
+/* Reflink: */
+struct bch_reflink_p {
+struct bch_val v;
+__le64 idx;
+__le32 reservation_generation;
+__u8 nr_replicas;
+__u8 pad[3];
+};
+struct bch_reflink_v {
+struct bch_val v;
+__le64 refcount;
+union bch_extent_entry start[0];
+__u64 _data[0];
+};
 /* Optional/variable size superblock sections: */
 struct bch_sb_field {
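Note: a reflink_p key sits in the extents btree at its file position but carries no data pointers of its own; its idx field locates the data in the new reflink btree, where the matching reflink_v (an ordinary extent value plus a refcount) is keyed at inode 0. A minimal sketch of the mapping, using a hypothetical helper name:

    /* Hypothetical helper, not part of this patch: position of the
     * indirect extent referenced by a reflink_p, at a given offset. */
    static inline struct bpos reflink_p_pos(struct bkey_s_c_reflink_p p,
                                            unsigned offset_into_extent)
    {
            /* reflink_v keys all live at inode 0 in BTREE_ID_REFLINK */
            return POS(0, le64_to_cpu(p.v->idx) + offset_into_extent);
    }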
@@ -1297,6 +1317,7 @@ enum bch_sb_features {
 BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */
 BCH_FEATURE_EC = 4,
 BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
+BCH_FEATURE_REFLINK = 6,
 BCH_FEATURE_NR,
 };
@@ -1487,7 +1508,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
 x(XATTRS, 3, "xattrs") \
 x(ALLOC, 4, "alloc") \
 x(QUOTAS, 5, "quotas") \
-x(EC, 6, "erasure_coding")
+x(EC, 6, "erasure_coding") \
+x(REFLINK, 7, "reflink")
 enum btree_id {
 #define x(kwd, val, name) BTREE_ID_##kwd = val,
......
@@ -560,6 +560,8 @@ BKEY_VAL_ACCESSORS(xattr);
 BKEY_VAL_ACCESSORS(alloc);
 BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
+BKEY_VAL_ACCESSORS(reflink_p);
+BKEY_VAL_ACCESSORS(reflink_v);
 /* byte order helpers */
......
@@ -10,6 +10,7 @@
 #include "extents.h"
 #include "inode.h"
 #include "quota.h"
+#include "reflink.h"
 #include "xattr.h"
 const char * const bch2_bkey_types[] = {
......
@@ -464,7 +464,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
 static inline bool btree_node_type_is_extents(enum btree_node_type type)
 {
-return type == BKEY_TYPE_EXTENTS;
+switch (type) {
+case BKEY_TYPE_EXTENTS:
+case BKEY_TYPE_REFLINK:
+return true;
+default:
+return false;
+}
 }
 static inline bool btree_node_is_extents(struct btree *b)
@@ -480,6 +486,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 case BKEY_TYPE_EXTENTS:
 case BKEY_TYPE_INODES:
 case BKEY_TYPE_EC:
+case BKEY_TYPE_REFLINK:
 return true;
 default:
 return false;
......
@@ -521,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans,
 {
 return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) &&
 (i->iter->btree_id == BTREE_ID_EXTENTS ||
-i->iter->btree_id == BTREE_ID_INODES);
+i->iter->btree_id == BTREE_ID_INODES ||
+i->iter->btree_id == BTREE_ID_REFLINK);
 }
 static inline bool update_has_triggers(struct btree_trans *trans,
......
@@ -972,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 spin_unlock(&c->ec_stripes_heap_lock);
 bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
 (u64) p.idx);
-return -1;
+return -EIO;
 }
 BUG_ON(m->r.e.data_type != data_type);
@@ -1144,6 +1144,7 @@ int bch2_mark_key_locked(struct bch_fs *c,
 fs_usage, journal_seq, flags);
 break;
 case KEY_TYPE_extent:
+case KEY_TYPE_reflink_v:
 ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER,
 fs_usage, journal_seq, flags);
 break;
@@ -1304,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
 xchg(&warned_disk_usage, 1))
 return;
-pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+bch_err(c, "disk usage increased more than %llu sectors reserved",
+disk_res_sectors);
 trans_for_each_update_iter(trans, i) {
 struct btree_iter *iter = i->iter;
@@ -1471,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 struct bch_extent_stripe_ptr p,
 s64 sectors, enum bch_data_type data_type)
 {
+struct bch_fs *c = trans->c;
 struct bch_replicas_padded r;
 struct btree_iter *iter;
 struct bkey_i *new_k;
@@ -1487,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 return ret;
 if (k.k->type != KEY_TYPE_stripe) {
-bch_err_ratelimited(trans->c,
+bch2_fs_inconsistent(c,
 "pointer to nonexistent stripe %llu",
 (u64) p.idx);
-ret = -1;
+ret = -EIO;
 goto out;
 }
@@ -1578,6 +1581,84 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
 return 0;
 }
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+struct bkey_s_c_reflink_p p,
+u64 idx, unsigned sectors,
+unsigned flags)
+{
+struct bch_fs *c = trans->c;
+struct btree_iter *iter;
+struct bkey_i *new_k;
+struct bkey_s_c k;
+struct bkey_i_reflink_v *r_v;
+s64 ret;
+ret = trans_get_key(trans, BTREE_ID_REFLINK,
+POS(0, idx), &iter, &k);
+if (ret)
+return ret;
+if (k.k->type != KEY_TYPE_reflink_v) {
+bch2_fs_inconsistent(c,
+"%llu:%llu len %u points to nonexistent indirect extent %llu",
+p.k->p.inode, p.k->p.offset, p.k->size, idx);
+ret = -EIO;
+goto err;
+}
+if ((flags & BCH_BUCKET_MARK_OVERWRITE) &&
+(bkey_start_offset(k.k) < idx ||
+k.k->p.offset > idx + sectors))
+goto out;
+bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
+new_k = trans_update_key(trans, iter, k.k->u64s);
+ret = PTR_ERR_OR_ZERO(new_k);
+if (ret)
+goto err;
+bkey_reassemble(new_k, k);
+r_v = bkey_i_to_reflink_v(new_k);
+le64_add_cpu(&r_v->v.refcount,
+!(flags & BCH_BUCKET_MARK_OVERWRITE) ? 1 : -1);
+if (!r_v->v.refcount) {
+r_v->k.type = KEY_TYPE_deleted;
+set_bkey_val_u64s(&r_v->k, 0);
+}
+out:
+ret = k.k->p.offset - idx;
+err:
+bch2_trans_iter_put(trans, iter);
+return ret;
+}
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+struct bkey_s_c_reflink_p p, unsigned offset,
+s64 sectors, unsigned flags)
+{
+u64 idx = le64_to_cpu(p.v->idx) + offset;
+s64 ret = 0;
+sectors = abs(sectors);
+BUG_ON(offset + sectors > p.k->size);
+while (sectors) {
+ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
+if (ret < 0)
+break;
+idx += ret;
+sectors = max_t(s64, 0LL, sectors - ret);
+ret = 0;
+}
+return ret;
+}
 int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
 unsigned offset, s64 sectors, unsigned flags)
 {
@@ -1593,6 +1674,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
 return bch2_trans_mark_extent(trans, k, offset, sectors,
 flags, BCH_DATA_BTREE);
 case KEY_TYPE_extent:
+case KEY_TYPE_reflink_v:
 return bch2_trans_mark_extent(trans, k, offset, sectors,
 flags, BCH_DATA_USER);
 case KEY_TYPE_inode:
@@ -1616,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k,
 d->fs_usage.persistent_reserved[replicas - 1] += sectors;
 return 0;
 }
+case KEY_TYPE_reflink_p:
+return bch2_trans_mark_reflink_p(trans,
+bkey_s_c_to_reflink_p(k),
+offset, sectors, flags);
 default:
 return 0;
 }
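Note: the trigger above implements the reflink refcount discipline: inserting a reflink_p takes a reference on every indirect extent it overlaps, overwriting one drops it, and an indirect extent whose refcount reaches zero is turned into a deleted key. A lifecycle sketch with invented values (this assumes a fresh reflink_v starts at refcount 0, with references added only by these triggers):

    /*
     * create reflink_v at idx I          refcount(I) == 0
     * insert reflink_p in file A -> I    refcount(I) == 1
     * insert reflink_p in file B -> I    refcount(I) == 2
     * delete A's reflink_p (OVERWRITE)   refcount(I) == 1
     * delete B's reflink_p (OVERWRITE)   refcount(I) == 0, reflink_v
     *                                    becomes KEY_TYPE_deleted
     */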
......
@@ -744,7 +744,8 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k)
 case KEY_TYPE_error:
 case KEY_TYPE_cookie:
 break;
-case KEY_TYPE_extent: {
+case KEY_TYPE_extent:
+case KEY_TYPE_reflink_v: {
 struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
 union bch_extent_entry *entry;
 bool seen_crc = false;
@@ -774,6 +775,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k)
 break;
 }
+case KEY_TYPE_reflink_p: {
+struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
+le64_add_cpu(&p.v->idx, sub);
+break;
+}
 case KEY_TYPE_reservation:
 break;
 default:
@@ -968,6 +975,33 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans,
 }
 break;
+case KEY_TYPE_reflink_p: {
+struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+u64 idx = le64_to_cpu(p.v->idx);
+unsigned sectors = end->offset - bkey_start_offset(p.k);
+struct btree_iter *iter;
+struct bkey_s_c r_k;
+for_each_btree_key(trans, iter,
+BTREE_ID_REFLINK, POS(0, idx + offset),
+BTREE_ITER_SLOTS, r_k, ret) {
+if (bkey_cmp(bkey_start_pos(r_k.k),
+POS(0, idx + sectors)) >= 0)
+break;
+*nr_iters += 1;
+if (*nr_iters >= max_iters) {
+struct bpos pos = bkey_start_pos(k.k);
+pos.offset += r_k.k->p.offset - idx;
+*end = bpos_min(*end, pos);
+break;
+}
+}
+bch2_trans_iter_put(trans, iter);
+break;
+}
 }
 return ret;
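Note: the reflink_p case above exists because a single reflink_p can reference many small indirect extents, and marking each one during an update needs its own btree iterator; once *nr_iters reaches max_iters, the atomic end of the operation is pulled back. Toy arithmetic with invented values:

    /* reflink_p covering file offsets 0..100, idx = 1000; suppose the
     * iterator budget is exhausted at an indirect extent ending at 1040: */
    struct bpos pos = bkey_start_pos(k.k);  /* file offset 0 */
    pos.offset += 1040 - 1000;              /* r_k.k->p.offset - idx */
    *end = bpos_min(*end, pos);             /* the update stops at offset 40 */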
@@ -1561,17 +1595,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 return false;
 }
-void bch2_extent_mark_replicas_cached(struct bch_fs *c,
-struct bkey_s_extent e,
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
 unsigned target,
 unsigned nr_desired_replicas)
 {
+struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
 union bch_extent_entry *entry;
 struct extent_ptr_decoded p;
-int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
+int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
 if (target && extra > 0)
-extent_for_each_ptr_decode(e, p, entry) {
+bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 int n = bch2_extent_ptr_durability(c, p);
 if (n && n <= extra &&
@@ -1582,7 +1616,7 @@ void bch2_bkey_mark_replicas_cached(struct bch_fs *c,
 }
 if (extra > 0)
-extent_for_each_ptr_decode(e, p, entry) {
+bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 int n = bch2_extent_ptr_durability(c, p);
 if (n && n <= extra) {
......
@@ -306,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
 to_entry(&s.v->ptrs[s.v->nr_blocks]),
 };
 }
+case KEY_TYPE_reflink_v: {
+struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
+return (struct bkey_ptrs_c) {
+r.v->start,
+bkey_val_end(r),
+};
+}
 default:
 return (struct bkey_ptrs_c) { NULL, NULL };
 }
@@ -436,7 +444,7 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
 void bch2_insert_fixup_extent(struct btree_trans *,
 struct btree_insert_entry *);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
 unsigned, unsigned);
 const struct bch_extent_ptr *
@@ -452,17 +460,24 @@ static inline bool bkey_extent_is_data(const struct bkey *k)
 switch (k->type) {
 case KEY_TYPE_btree_ptr:
 case KEY_TYPE_extent:
+case KEY_TYPE_reflink_p:
+case KEY_TYPE_reflink_v:
 return true;
 default:
 return false;
 }
 }
+/*
+ * Should extent be counted under inode->i_sectors?
+ */
 static inline bool bkey_extent_is_allocation(const struct bkey *k)
 {
 switch (k->type) {
 case KEY_TYPE_extent:
 case KEY_TYPE_reservation:
+case KEY_TYPE_reflink_p:
+case KEY_TYPE_reflink_v:
 return true;
 default:
 return false;
......
@@ -16,6 +16,7 @@
 #include "io.h"
 #include "keylist.h"
 #include "quota.h"
+#include "reflink.h"
 #include "trace.h"
 #include <linux/aio.h>
@@ -201,7 +202,7 @@ static int inode_set_size(struct bch_inode_info *inode,
 return 0;
 }
-static int __must_check bch2_write_inode_size(struct bch_fs *c,
+int __must_check bch2_write_inode_size(struct bch_fs *c,
 struct bch_inode_info *inode,
 loff_t new_size, unsigned fields)
 {
@@ -936,15 +937,12 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
 struct bvec_iter iter;
 struct bio_vec bv;
-unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
+unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
+? 0 : bch2_bkey_nr_ptrs_allocated(k);
 unsigned state = k.k->type == KEY_TYPE_reservation
 ? SECTOR_RESERVED
 : SECTOR_ALLOCATED;
-BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k));
-BUG_ON(bio_end_sector(bio) > k.k->p.offset);
 bio_for_each_segment(bv, bio, iter) {
 struct bch_page_state *s = bch2_page_state(bv.bv_page);
 unsigned i;
@@ -959,10 +957,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 }
 static void readpage_bio_extend(struct readpages_iter *iter,
-struct bio *bio, u64 offset,
+struct bio *bio,
+unsigned sectors_this_extent,
 bool get_more)
 {
-while (bio_end_sector(bio) < offset &&
+while (bio_sectors(bio) < sectors_this_extent &&
 bio->bi_vcnt < bio->bi_max_vecs) {
 pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
 struct page *page = readpage_iter_next(iter);
@@ -1012,35 +1011,39 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
 struct bch_fs *c = trans->c;
 int flags = BCH_READ_RETRY_IF_STALE|
 BCH_READ_MAY_PROMOTE;
+int ret = 0;
 rbio->c = c;
 rbio->start_time = local_clock();
+retry:
 while (1) {
 BKEY_PADDED(k) tmp;
 struct bkey_s_c k;
-unsigned bytes, offset_into_extent;
+unsigned bytes, sectors, offset_into_extent;
 bch2_btree_iter_set_pos(iter,
 POS(inum, rbio->bio.bi_iter.bi_sector));
 k = bch2_btree_iter_peek_slot(iter);
-BUG_ON(!k.k);
-if (IS_ERR(k.k)) {
-int ret = btree_iter_err(iter);
-BUG_ON(!ret);
-bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
-bio_endio(&rbio->bio);
-return;
-}
+ret = bkey_err(k);
+if (ret)
+break;
 bkey_reassemble(&tmp.k, k);
-bch2_trans_unlock(trans);
 k = bkey_i_to_s_c(&tmp.k);
 offset_into_extent = iter->pos.offset -
 bkey_start_offset(k.k);
+sectors = k.k->size - offset_into_extent;
+ret = bch2_read_indirect_extent(trans, iter,
+&offset_into_extent, &tmp.k);
+if (ret)
+break;
+sectors = min(sectors, k.k->size - offset_into_extent);
+bch2_trans_unlock(trans);
 if (readpages_iter) {
 bool want_full_extent = false;
@@ -1055,13 +1058,11 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
 (p.crc.compression_type != 0));
 }
-readpage_bio_extend(readpages_iter,
-&rbio->bio, k.k->p.offset,
-want_full_extent);
+readpage_bio_extend(readpages_iter, &rbio->bio,
+sectors, want_full_extent);
 }
-bytes = min_t(unsigned, bio_sectors(&rbio->bio),
-(k.k->size - offset_into_extent)) << 9;
+bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
 swap(rbio->bio.bi_iter.bi_size, bytes);
 if (rbio->bio.bi_iter.bi_size == bytes)
@@ -1078,6 +1079,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
 swap(rbio->bio.bi_iter.bi_size, bytes);
 bio_advance(&rbio->bio, bytes);
 }
+if (ret == -EINTR)
+goto retry;
+bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+bio_endio(&rbio->bio);
 }
 void bch2_readahead(struct readahead_control *ractl)
@@ -2256,29 +2263,25 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 /* truncate: */
-static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
-u64 start_offset, u64 end_offset, u64 *journal_seq)
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
+struct bpos end, struct bch_inode_info *inode,
+u64 new_i_size)
 {
-struct bpos start = POS(inode->v.i_ino, start_offset);
-struct bpos end = POS(inode->v.i_ino, end_offset);
+struct bch_fs *c = trans->c;
 unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
-struct btree_trans trans;
-struct btree_iter *iter;
 struct bkey_s_c k;
-int ret = 0;
-bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
-BTREE_ITER_INTENT);
+int ret = 0, ret2 = 0;
 while ((k = bch2_btree_iter_peek(iter)).k &&
+!(ret = bkey_err(k)) &&
 bkey_cmp(iter->pos, end) < 0) {
 struct disk_reservation disk_res =
 bch2_disk_reservation_init(c, 0);
 struct bkey_i delete;
-ret = bkey_err(k);
-if (ret)
-goto btree_err;
 bkey_init(&delete.k);
 delete.k.p = iter->pos;
@@ -2286,23 +2289,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
 bch2_key_resize(&delete.k, max_sectors);
 bch2_cut_back(end, &delete.k);
-bch2_trans_begin_updates(&trans);
-ret = bch2_extent_update(&trans, inode,
+bch2_trans_begin_updates(trans);
+ret = bch2_extent_update(trans, inode,
 &disk_res, NULL, iter, &delete,
-0, true, true, NULL);
+new_i_size, false, true, NULL);
 bch2_disk_reservation_put(c, &disk_res);
-btree_err:
-if (ret == -EINTR)
+if (ret == -EINTR) {
+ret2 = ret;
 ret = 0;
+}
 if (ret)
 break;
+}
-bch2_trans_cond_resched(&trans);
+if (bkey_cmp(iter->pos, end) > 0) {
+bch2_btree_iter_set_pos(iter, end);
+ret = bch2_btree_iter_traverse(iter);
 }
+return ret ?: ret2;
+}
+static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode,
+u64 start_offset, u64 end_offset)
+{
+struct btree_trans trans;
+struct btree_iter *iter;
+int ret = 0;
+bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
+iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+POS(inode->v.i_ino, start_offset),
+BTREE_ITER_INTENT);
+ret = bch2_fpunch_at(&trans, iter,
+POS(inode->v.i_ino, end_offset),
+inode, 0);
 bch2_trans_exit(&trans);
+if (ret == -EINTR)
+ret = 0;
 return ret;
 }
@@ -2510,7 +2541,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 ret = __bch2_fpunch(c, inode,
 round_up(iattr->ia_size, block_bytes(c)) >> 9,
-U64_MAX, &inode->ei_journal_seq);
+U64_MAX);
 if (unlikely(ret))
 goto err;
@@ -2557,8 +2588,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
 truncate_pagecache_range(&inode->v, offset, offset + len - 1);
 if (discard_start < discard_end)
-ret = __bch2_fpunch(c, inode, discard_start, discard_end,
-&inode->ei_journal_seq);
+ret = __bch2_fpunch(c, inode, discard_start, discard_end);
 err:
 bch2_pagecache_block_put(&inode->ei_pagecache_lock);
 inode_unlock(&inode->v);
@@ -2670,7 +2700,7 @@ static long bch2_fcollapse(struct bch_inode_info *inode,
 ret = __bch2_fpunch(c, inode,
 round_up(new_size, block_bytes(c)) >> 9,
-U64_MAX, &inode->ei_journal_seq);
+U64_MAX);
 if (ret)
 goto err;
@@ -2853,6 +2883,94 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 return -EOPNOTSUPP;
 }
+static void mark_range_unallocated(struct bch_inode_info *inode,
+loff_t start, loff_t end)
+{
+pgoff_t index = start >> PAGE_SHIFT;
+pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
+struct folio_batch fbatch;
+unsigned i, j;
+folio_batch_init(&fbatch);
+while (filemap_get_folios(inode->v.i_mapping,
+&index, end_index, &fbatch)) {
+for (i = 0; i < folio_batch_count(&fbatch); i++) {
+struct folio *folio = fbatch.folios[i];
+struct bch_page_state *s;
+folio_lock(folio);
+s = bch2_page_state(&folio->page);
+if (s)
+for (j = 0; j < PAGE_SECTORS; j++)
+s->s[j].nr_replicas = 0;
+folio_unlock(folio);
+}
+folio_batch_release(&fbatch);
+cond_resched();
+}
+}
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+struct file *file_dst, loff_t pos_dst,
+loff_t len, unsigned remap_flags)
+{
+struct bch_inode_info *src = file_bch_inode(file_src);
+struct bch_inode_info *dst = file_bch_inode(file_dst);
+struct bch_fs *c = src->v.i_sb->s_fs_info;
+loff_t ret = 0;
+loff_t aligned_len;
+if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
+return -EINVAL;
+if (remap_flags & REMAP_FILE_DEDUP)
+return -EOPNOTSUPP;
+if ((pos_src & (block_bytes(c) - 1)) ||
+(pos_dst & (block_bytes(c) - 1)))
+return -EINVAL;
+if (src == dst &&
+abs(pos_src - pos_dst) < len)
+return -EINVAL;
+bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+inode_dio_wait(&src->v);
+inode_dio_wait(&dst->v);
+ret = generic_remap_file_range_prep(file_src, pos_src,
+file_dst, pos_dst,
+&len, remap_flags);
+if (ret < 0 || len == 0)
+goto out_unlock;
+aligned_len = round_up(len, block_bytes(c));
+ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
+pos_dst, pos_dst + aligned_len);
+if (ret)
+goto out_unlock;
+mark_range_unallocated(src, pos_src, pos_src + aligned_len);
+ret = bch2_remap_range(c, dst,
+POS(dst->v.i_ino, pos_dst >> 9),
+POS(src->v.i_ino, pos_src >> 9),
+aligned_len >> 9,
+pos_dst + len);
+if (ret > 0)
+ret = min(ret << 9, len);
+out_unlock:
+bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+return ret;
+}
 /* fseek: */
 static int folio_data_offset(struct folio *folio, unsigned offset)
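Note: bch2_remap_file_range() above becomes the ->remap_file_range hook in bch_file_operations further down, which is how the kernel dispatches the FICLONE/FICLONERANGE ioctls (dedup requests are rejected with -EOPNOTSUPP for now). A minimal userspace exercise of the new path, as a sketch with error handling elided:

    #include <fcntl.h>
    #include <linux/fs.h>
    #include <sys/ioctl.h>

    /* Clone all of src into dst on the same bcachefs filesystem; the
     * kernel routes this ioctl to bch2_remap_file_range(). */
    int clone_whole_file(int src_fd, int dst_fd)
    {
            return ioctl(dst_fd, FICLONE, src_fd);
    }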
......
@@ -9,6 +9,22 @@
 #include <linux/uio.h>
+struct quota_res;
+int bch2_extent_update(struct btree_trans *,
+struct bch_inode_info *,
+struct disk_reservation *,
+struct quota_res *,
+struct btree_iter *,
+struct bkey_i *,
+u64, bool, bool, s64 *);
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
+struct bpos, struct bch_inode_info *, u64);
+int __must_check bch2_write_inode_size(struct bch_fs *,
+struct bch_inode_info *,
+loff_t, unsigned);
 int bch2_writepage(struct page *, struct writeback_control *);
 int bch2_read_folio(struct file *, struct folio *);
@@ -28,6 +44,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int);
 int bch2_truncate(struct bch_inode_info *, struct iattr *);
 long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
+loff_t, loff_t, unsigned);
 loff_t bch2_llseek(struct file *, loff_t, int);
 vm_fault_t bch2_page_fault(struct vm_fault *);
......
@@ -1157,6 +1157,9 @@ static int bch2_fill_extent(struct bch_fs *c,
 struct extent_ptr_decoded p;
 int ret;
+if (k.k->type == KEY_TYPE_reflink_v)
+flags |= FIEMAP_EXTENT_SHARED;
 bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 int flags2 = 0;
 u64 offset = p.ptr.offset;
@@ -1200,6 +1203,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 struct btree_iter *iter;
 struct bkey_s_c k;
 BKEY_PADDED(k) cur, prev;
+unsigned offset_into_extent, sectors;
 bool have_extent = false;
 int ret = 0;
@@ -1212,15 +1216,36 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 bch2_trans_init(&trans, c, 0, 0);
-for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
-POS(ei->v.i_ino, start >> 9), 0, k, ret) {
-if (bkey_cmp(bkey_start_pos(k.k),
-POS(ei->v.i_ino, (start + len) >> 9)) >= 0)
-break;
+iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+POS(ei->v.i_ino, start >> 9),
+BTREE_ITER_SLOTS);
+while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) {
+k = bch2_btree_iter_peek_slot(iter);
+ret = bkey_err(k);
+if (ret)
+goto err;
 bkey_reassemble(&cur.k, k);
 k = bkey_i_to_s_c(&cur.k);
+offset_into_extent = iter->pos.offset -
+bkey_start_offset(k.k);
+sectors = k.k->size - offset_into_extent;
+ret = bch2_read_indirect_extent(&trans, iter,
+&offset_into_extent, &cur.k);
+if (ret)
+break;
+sectors = min(sectors, k.k->size - offset_into_extent);
+bch2_cut_front(POS(k.k->p.inode,
+bkey_start_offset(k.k) + offset_into_extent),
+&cur.k);
+bch2_key_resize(&cur.k.k, sectors);
+cur.k.k.p.offset = iter->pos.offset + cur.k.k.size;
 if (bkey_extent_is_data(k.k) ||
 k.k->type == KEY_TYPE_reservation) {
 if (have_extent) {
@@ -1233,12 +1258,16 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
 bkey_copy(&prev.k, &cur.k);
 have_extent = true;
 }
+bch2_btree_iter_set_pos(iter,
+POS(iter->pos.inode,
+iter->pos.offset + sectors));
 }
 if (!ret && have_extent)
 ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
 FIEMAP_EXTENT_LAST);
+err:
 ret = bch2_trans_exit(&trans) ?: ret;
 return ret < 0 ? ret : 0;
 }
@@ -1286,6 +1315,7 @@ static const struct file_operations bch_file_operations = {
 #ifdef CONFIG_COMPAT
 .compat_ioctl = bch2_compat_fs_ioctl,
 #endif
+.remap_file_range = bch2_remap_file_range,
 };
 static const struct inode_operations bch_file_inode_operations = {
......
@@ -59,7 +59,8 @@ static inline int ptrcmp(void *l, void *r)
 enum bch_inode_lock_op {
 INODE_LOCK = (1U << 0),
-INODE_UPDATE_LOCK = (1U << 1),
+INODE_PAGECACHE_BLOCK = (1U << 1),
+INODE_UPDATE_LOCK = (1U << 2),
 };
 #define bch2_lock_inodes(_locks, ...) \
@@ -71,9 +72,11 @@ do { \
 \
 for (i = 1; i < ARRAY_SIZE(a); i++) \
 if (a[i] != a[i - 1]) { \
-if (_locks & INODE_LOCK) \
+if ((_locks) & INODE_LOCK) \
 down_write_nested(&a[i]->v.i_rwsem, i); \
-if (_locks & INODE_UPDATE_LOCK) \
+if ((_locks) & INODE_PAGECACHE_BLOCK) \
+bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
+if ((_locks) & INODE_UPDATE_LOCK) \
 mutex_lock_nested(&a[i]->ei_update_lock, i);\
 } \
 } while (0)
@@ -87,9 +90,11 @@ do { \
 \
 for (i = 1; i < ARRAY_SIZE(a); i++) \
 if (a[i] != a[i - 1]) { \
-if (_locks & INODE_LOCK) \
+if ((_locks) & INODE_LOCK) \
 up_write(&a[i]->v.i_rwsem); \
-if (_locks & INODE_UPDATE_LOCK) \
+if ((_locks) & INODE_PAGECACHE_BLOCK) \
+bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
+if ((_locks) & INODE_UPDATE_LOCK) \
 mutex_unlock(&a[i]->ei_update_lock); \
 } \
 } while (0)
......
@@ -1041,6 +1041,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 noinline
 static struct promote_op *__promote_alloc(struct bch_fs *c,
+enum btree_id btree_id,
 struct bpos pos,
 struct extent_ptr_decoded *pick,
 struct bch_io_opts opts,
@@ -1097,6 +1098,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 (struct data_opts) {
 .target = opts.promote_target
 },
+btree_id,
 bkey_s_c_null);
 BUG_ON(ret);
@@ -1134,7 +1136,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c,
 if (!should_promote(c, k, pos, opts, flags))
 return NULL;
-promote = __promote_alloc(c, pos, pick, opts, sectors, rbio);
+promote = __promote_alloc(c,
+k.k->type == KEY_TYPE_reflink_v
+? BTREE_ID_REFLINK
+: BTREE_ID_EXTENTS,
+pos, pick, opts, sectors, rbio);
 if (!promote)
 return NULL;
@@ -1278,18 +1284,25 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
 POS(inode, bvec_iter.bi_sector),
 BTREE_ITER_SLOTS, k, ret) {
 BKEY_PADDED(k) tmp;
-unsigned bytes, offset_into_extent;
+unsigned bytes, sectors, offset_into_extent;
 bkey_reassemble(&tmp.k, k);
 k = bkey_i_to_s_c(&tmp.k);
-bch2_trans_unlock(&trans);
 offset_into_extent = iter->pos.offset -
 bkey_start_offset(k.k);
+sectors = k.k->size - offset_into_extent;
+ret = bch2_read_indirect_extent(&trans, iter,
+&offset_into_extent, &tmp.k);
+if (ret)
+break;
-bytes = min_t(unsigned, bvec_iter_sectors(bvec_iter),
-(k.k->size - offset_into_extent)) << 9;
+sectors = min(sectors, k.k->size - offset_into_extent);
+bch2_trans_unlock(&trans);
+bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
 swap(bvec_iter.bi_size, bytes);
 ret = __bch2_read_extent(c, rbio, bvec_iter, k,
@@ -1569,6 +1582,48 @@ static void bch2_read_endio(struct bio *bio)
 bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
 }
+int bch2_read_indirect_extent(struct btree_trans *trans,
+struct btree_iter *extent_iter,
+unsigned *offset_into_extent,
+struct bkey_i *orig_k)
+{
+struct btree_iter *iter;
+struct bkey_s_c k;
+u64 reflink_offset;
+int ret;
+if (orig_k->k.type != KEY_TYPE_reflink_p)
+return 0;
+reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+*offset_into_extent;
+iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
+POS(0, reflink_offset),
+BTREE_ITER_SLOTS, 1);
+ret = PTR_ERR_OR_ZERO(iter);
+if (ret)
+return ret;
+k = bch2_btree_iter_peek_slot(iter);
+ret = bkey_err(k);
+if (ret)
+goto err;
+if (k.k->type != KEY_TYPE_reflink_v) {
+__bcache_io_error(trans->c,
+"pointer to nonexistent indirect extent");
+ret = -EIO;
+goto err;
+}
+*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+bkey_reassemble(orig_k, k);
+err:
+bch2_trans_iter_put(trans, iter);
+return ret;
+}
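Note: callers treat bch2_read_indirect_extent() as a position rewrite: for a reflink_p it rebases the lookup into the reflink btree and recomputes *offset_into_extent relative to the reflink_v it finds, so the readable span is the min of what remains in both extents. A worked example with invented numbers:

    /* reflink_p: file offsets 100..108, idx = 1000
     * reflink_v: covers reflink-btree offsets 990..1010
     * read at file offset 102: */
    offset_into_extent = 102 - 100;              /* 2 */
    sectors = 8 - offset_into_extent;            /* 6, from the reflink_p */
    reflink_offset = 1000 + offset_into_extent;  /* 1002 */
    /* after the BTREE_ID_REFLINK lookup: */
    offset_into_extent = 1002 - 990;             /* 12 */
    sectors = min(sectors, 20 - 12);             /* still 6: min of the two */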
 int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 struct bvec_iter iter, struct bkey_s_c k,
 unsigned offset_into_extent,
@@ -1644,6 +1699,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 pos.offset += offset_into_extent;
 pick.ptr.offset += pick.crc.offset +
 offset_into_extent;
+offset_into_extent = 0;
 pick.crc.compressed_size = bvec_iter_sectors(iter);
 pick.crc.uncompressed_size = bvec_iter_sectors(iter);
 pick.crc.offset = 0;
@@ -1829,25 +1885,47 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 rbio->c = c;
 rbio->start_time = local_clock();
-for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
+iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
 POS(inode, rbio->bio.bi_iter.bi_sector),
-BTREE_ITER_SLOTS, k, ret) {
+BTREE_ITER_SLOTS);
+while (1) {
 BKEY_PADDED(k) tmp;
-unsigned bytes, offset_into_extent;
+unsigned bytes, sectors, offset_into_extent;
+bch2_btree_iter_set_pos(iter,
+POS(inode, rbio->bio.bi_iter.bi_sector));
+k = bch2_btree_iter_peek_slot(iter);
+ret = bkey_err(k);
+if (ret)
+goto err;
-/*
- * Unlock the iterator while the btree node's lock is still in
- * cache, before doing the IO:
- */
 bkey_reassemble(&tmp.k, k);
 k = bkey_i_to_s_c(&tmp.k);
-bch2_trans_unlock(&trans);
 offset_into_extent = iter->pos.offset -
 bkey_start_offset(k.k);
+sectors = k.k->size - offset_into_extent;
+ret = bch2_read_indirect_extent(&trans, iter,
+&offset_into_extent, &tmp.k);
+if (ret)
+goto err;
+/*
+ * With indirect extents, the amount of data to read is the min
+ * of the original extent and the indirect extent:
+ */
+sectors = min(sectors, k.k->size - offset_into_extent);
+/*
+ * Unlock the iterator while the btree node's lock is still in
+ * cache, before doing the IO:
+ */
+bch2_trans_unlock(&trans);
-bytes = min_t(unsigned, bio_sectors(&rbio->bio),
-(k.k->size - offset_into_extent)) << 9;
+bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
 swap(rbio->bio.bi_iter.bi_size, bytes);
 if (rbio->bio.bi_iter.bi_size == bytes)
@@ -1856,21 +1934,18 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 bch2_read_extent(c, rbio, k, offset_into_extent, flags);
 if (flags & BCH_READ_LAST_FRAGMENT)
-return;
+break;
 swap(rbio->bio.bi_iter.bi_size, bytes);
 bio_advance(&rbio->bio, bytes);
 }
+out:
-/*
- * If we get here, it better have been because there was an error
- * reading a btree node
- */
-BUG_ON(!ret);
-bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
 bch2_trans_exit(&trans);
+return;
+err:
+bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
 bch2_rbio_done(rbio);
+goto out;
 }
 void bch2_fs_io_exit(struct bch_fs *c)
......
@@ -99,6 +99,9 @@ struct bch_devs_mask;
 struct cache_promote_op;
 struct extent_ptr_decoded;
+int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *,
+unsigned *, struct bkey_i *);
 enum bch_read_flags {
 BCH_READ_RETRY_IF_STALE = 1 << 0,
 BCH_READ_MAY_PROMOTE = 1 << 1,
......
@@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
 return 0;
 }
-static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
+enum btree_id btree_id)
 {
 struct btree_trans trans;
 struct btree_iter *iter;
@@ -44,8 +45,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-POS_MIN, BTREE_ITER_PREFETCH);
+iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+BTREE_ITER_PREFETCH);
 while ((k = bch2_btree_iter_peek(iter)).k &&
 !(ret = bkey_err(k))) {
@@ -98,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 return ret;
 }
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
+{
+return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?:
+__bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK);
+}
 static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 {
 struct btree_trans trans;
......
@@ -63,13 +63,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
+iter = bch2_trans_get_iter(&trans, m->btree_id,
 bkey_start_pos(&bch2_keylist_front(keys)->k),
 BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 while (1) {
 struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
-struct bkey_i_extent *insert, *new =
+struct bkey_i *insert;
+struct bkey_i_extent *new =
 bkey_i_to_extent(bch2_keylist_front(keys));
 BKEY_PADDED(k) _new, _insert;
 const union bch_extent_entry *entry;
@@ -86,26 +87,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 goto nomatch;
 if (m->data_cmd == DATA_REWRITE &&
-!bch2_extent_has_device(bkey_s_c_to_extent(k),
-m->data_opts.rewrite_dev))
+!bch2_bkey_has_device(k, m->data_opts.rewrite_dev))
 goto nomatch;
 bkey_reassemble(&_insert.k, k);
-insert = bkey_i_to_extent(&_insert.k);
+insert = &_insert.k;
 bkey_copy(&_new.k, bch2_keylist_front(keys));
 new = bkey_i_to_extent(&_new.k);
-bch2_cut_front(iter->pos, &insert->k_i);
+bch2_cut_front(iter->pos, insert);
 bch2_cut_back(new->k.p, &insert->k);
 bch2_cut_back(insert->k.p, &new->k);
 if (m->data_cmd == DATA_REWRITE)
-bch2_bkey_drop_device(extent_i_to_s(insert).s,
+bch2_bkey_drop_device(bkey_i_to_s(insert),
 m->data_opts.rewrite_dev);
 extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
-if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) {
+if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
 /*
 * raced with another move op? extent already
 * has a pointer to the device we just wrote
@@ -114,17 +114,17 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 continue;
 }
-bch2_extent_ptr_decoded_append(&insert->k_i, &p);
+bch2_extent_ptr_decoded_append(insert, &p);
 did_work = true;
 }
 if (!did_work)
 goto nomatch;
-bch2_bkey_narrow_crcs(&insert->k_i,
+bch2_bkey_narrow_crcs(insert,
 (struct bch_extent_crc_unpacked) { 0 });
-bch2_extent_normalize(c, extent_i_to_s(insert).s);
+bch2_extent_normalize(c, bkey_i_to_s(insert));
-bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
 op->opts.background_target,
 op->opts.data_replicas);
@@ -132,7 +132,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 * If we're not fully overwriting @k, and it's compressed, we
 * need a reservation for all the pointers in @insert
 */
-nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) -
+nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
 m->nr_ptrs_reserved;
 if (insert->k.size < k.k->size &&
@@ -148,7 +148,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 }
 bch2_trans_update(&trans,
-BTREE_INSERT_ENTRY(iter, &insert->k_i));
+BTREE_INSERT_ENTRY(iter, insert));
 ret = bch2_trans_commit(&trans, &op->res,
 op_journal_seq(op),
@@ -213,10 +213,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 struct bch_io_opts io_opts,
 enum data_cmd data_cmd,
 struct data_opts data_opts,
+enum btree_id btree_id,
 struct bkey_s_c k)
 {
 int ret;
+m->btree_id = btree_id;
 m->data_cmd = data_cmd;
 m->data_opts = data_opts;
 m->nr_ptrs_reserved = 0;
@@ -264,11 +266,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
 break;
 }
 case DATA_REWRITE: {
+struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 const union bch_extent_entry *entry;
 struct extent_ptr_decoded p;
 unsigned compressed_sectors = 0;
-extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry)
+bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
 if (!p.ptr.cached &&
 p.crc.compression_type != BCH_COMPRESSION_NONE &&
 bch2_dev_in_target(c, p.ptr.dev, data_opts.target))
@@ -391,6 +394,7 @@ static int bch2_move_extent(struct bch_fs *c,
 struct moving_context *ctxt,
 struct write_point_specifier wp,
 struct bch_io_opts io_opts,
+enum btree_id btree_id,
 struct bkey_s_c k,
 enum data_cmd data_cmd,
 struct data_opts data_opts)
@@ -443,7 +447,7 @@ static int bch2_move_extent(struct bch_fs *c,
 io->rbio.bio.bi_end_io = move_read_endio;
 ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
-data_cmd, data_opts, k);
+data_cmd, data_opts, btree_id, k);
 if (ret)
 goto err_free_pages;
@@ -473,16 +477,17 @@ static int bch2_move_extent(struct bch_fs *c,
 return ret;
 }
-int bch2_move_data(struct bch_fs *c,
+static int __bch2_move_data(struct bch_fs *c,
+struct moving_context *ctxt,
 struct bch_ratelimit *rate,
 struct write_point_specifier wp,
 struct bpos start,
 struct bpos end,
 move_pred_fn pred, void *arg,
-struct bch_move_stats *stats)
+struct bch_move_stats *stats,
+enum btree_id btree_id)
 {
 bool kthread = (current->flags & PF_KTHREAD) != 0;
-struct moving_context ctxt = { .stats = stats };
 struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 BKEY_PADDED(k) tmp;
 struct btree_trans trans;
@@ -493,17 +498,13 @@ int bch2_move_data(struct bch_fs *c,
 u64 delay, cur_inum = U64_MAX;
 int ret = 0, ret2;
-closure_init_stack(&ctxt.cl);
-INIT_LIST_HEAD(&ctxt.reads);
-init_waitqueue_head(&ctxt.wait);
 bch2_trans_init(&trans, c, 0, 0);
 stats->data_type = BCH_DATA_USER;
-stats->btree_id = BTREE_ID_EXTENTS;
+stats->btree_id = btree_id;
 stats->pos = POS_MIN;
-iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start,
+iter = bch2_trans_get_iter(&trans, btree_id, start,
 BTREE_ITER_PREFETCH);
 if (rate)
@@ -528,7 +529,7 @@ int bch2_move_data(struct bch_fs *c,
 if (unlikely(freezing(current))) {
 bch2_trans_unlock(&trans);
-move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
+move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
 try_to_freeze();
 }
 } while (delay);
@@ -579,12 +580,12 @@ int bch2_move_data(struct bch_fs *c,
 k = bkey_i_to_s_c(&tmp.k);
 bch2_trans_unlock(&trans);
-ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, k,
+ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
 data_cmd, data_opts);
 if (ret2) {
 if (ret2 == -ENOMEM) {
 /* memory allocation failure, wait for some IO to finish */
-bch2_move_ctxt_wait_for_io(&ctxt);
+bch2_move_ctxt_wait_for_io(ctxt);
 continue;
 }
@@ -602,7 +603,32 @@ int bch2_move_data(struct bch_fs *c,
 bch2_trans_cond_resched(&trans);
 }
 out:
-bch2_trans_exit(&trans);
+ret = bch2_trans_exit(&trans) ?: ret;
+return ret;
+}
+int bch2_move_data(struct bch_fs *c,
+struct bch_ratelimit *rate,
+struct write_point_specifier wp,
+struct bpos start,
+struct bpos end,
+move_pred_fn pred, void *arg,
+struct bch_move_stats *stats)
+{
+struct moving_context ctxt = { .stats = stats };
+int ret;
+closure_init_stack(&ctxt.cl);
+INIT_LIST_HEAD(&ctxt.reads);
+init_waitqueue_head(&ctxt.wait);
+stats->data_type = BCH_DATA_USER;
+ret = __bch2_move_data(c, &ctxt, rate, wp, start, end,
+pred, arg, stats, BTREE_ID_EXTENTS) ?:
+__bch2_move_data(c, &ctxt, rate, wp, start, end,
+pred, arg, stats, BTREE_ID_REFLINK);
 move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
 closure_sync(&ctxt.cl);
......
@@ -25,6 +25,7 @@ struct data_opts {
 };
 struct migrate_write {
+enum btree_id btree_id;
 enum data_cmd data_cmd;
 struct data_opts data_opts;
@@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
 struct write_point_specifier,
 struct bch_io_opts,
 enum data_cmd, struct data_opts,
-struct bkey_s_c);
+enum btree_id, struct bkey_s_c);
 typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
 struct bkey_s_c,
......
...@@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq) ...@@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq)
bch2_journal_pin_put(j, j->replay_journal_seq++); bch2_journal_pin_put(j, j->replay_journal_seq++);
} }
static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
struct bkey_i *k)
{ {
struct btree_trans trans; struct btree_trans trans;
struct btree_iter *iter, *split_iter; struct btree_iter *iter, *split_iter;
...@@ -255,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) ...@@ -255,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k)
retry: retry:
bch2_trans_begin(&trans); bch2_trans_begin(&trans);
iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, iter = bch2_trans_get_iter(&trans, btree_id,
bkey_start_pos(&k->k), bkey_start_pos(&k->k),
BTREE_ITER_INTENT); BTREE_ITER_INTENT);
...@@ -341,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c, ...@@ -341,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c,
for_each_journal_key(keys, i) { for_each_journal_key(keys, i) {
replay_now_at(j, keys.journal_seq_base + i->journal_seq); replay_now_at(j, keys.journal_seq_base + i->journal_seq);
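/*
 * Extent-style keys (both BTREE_ID_EXTENTS and the new BTREE_ID_REFLINK)
 * may need to be split against existing keys, so they take the extent
 * replay path:
 */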
switch (i->btree_id) { if (i->btree_id == BTREE_ID_ALLOC)
case BTREE_ID_ALLOC:
ret = bch2_alloc_replay_key(c, i->k); ret = bch2_alloc_replay_key(c, i->k);
break; else if (btree_node_type_is_extents(i->btree_id))
case BTREE_ID_EXTENTS: ret = bch2_extent_replay_key(c, i->btree_id, i->k);
ret = bch2_extent_replay_key(c, i->k); else
break;
default:
ret = bch2_btree_insert(c, i->btree_id, i->k, ret = bch2_btree_insert(c, i->btree_id, i->k,
NULL, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL| BTREE_INSERT_NOFAIL|
BTREE_INSERT_LAZY_RW| BTREE_INSERT_LAZY_RW|
BTREE_INSERT_JOURNAL_REPLAY| BTREE_INSERT_JOURNAL_REPLAY|
BTREE_INSERT_NOMARK); BTREE_INSERT_NOMARK);
break;
}
if (ret) { if (ret) {
bch_err(c, "journal replay: error %d while replaying key", bch_err(c, "journal replay: error %d while replaying key",
......
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "btree_update.h"
#include "extents.h"
#include "fs.h"
#include "fs-io.h"
#include "reflink.h"
#include <linux/sched/signal.h>
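/*
 * Reflink works by moving an extent's data pointers into the reflink btree:
 * a reflink_p key in the extents btree is just an index (idx) into the
 * reflink btree, where a refcounted reflink_v key holds the actual pointers.
 */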
/* reflink pointers */
const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
if (bkey_val_bytes(p.k) != sizeof(*p.v))
return "incorrect value size";
return NULL;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
}
enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
struct bkey_s _l, struct bkey_s _r)
{
struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
return BCH_MERGE_NOMERGE;
if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
bch2_key_resize(l.k, KEY_SIZE_MAX);
__bch2_cut_front(l.k->p, _r);
return BCH_MERGE_PARTIAL;
}
bch2_key_resize(l.k, l.k->size + r.k->size);
return BCH_MERGE_MERGE;
}
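/*
 * The merge rule above, with hypothetical values:
 *
 *   l = { idx 100, size 8 }, r = { idx 108, size 8 }
 *     -> contiguous: merge into { idx 100, size 16 } (BCH_MERGE_MERGE)
 *
 *   l = { idx 100, size 8 }, r = { idx 200, size 8 }
 *     -> gap between the two: BCH_MERGE_NOMERGE
 */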
/* indirect extents */
const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
if (bkey_val_bytes(r.k) < sizeof(*r.v))
return "incorrect value size";
return bch2_bkey_ptrs_invalid(c, k);
}
void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
bch2_bkey_ptrs_to_text(out, c, k);
}
/*
* bch2_remap_range() depends on bch2_extent_update(), which depends on various
* things tied to the linux vfs for inode updates, for now:
*/
#ifndef NO_BCACHEFS_FS
static int bch2_make_extent_indirect(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct bkey_i_extent *e)
{
struct bch_fs *c = trans->c;
struct btree_iter *reflink_iter;
struct bkey_s_c k;
struct bkey_i_reflink_v *r_v;
struct bkey_i_reflink_p *r_p;
int ret;
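/*
 * Scan the reflink btree, starting from the cached allocation hint, for a
 * hole big enough to hold this extent:
 */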
for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK,
POS(0, c->reflink_hint),
BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
if (reflink_iter->pos.inode) {
bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
continue;
}
if (bkey_deleted(k.k) && e->k.size <= k.k->size)
break;
}
if (ret)
goto err;
/* rewind iter to start of hole, if necessary: */
bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k));
ret = PTR_ERR_OR_ZERO(r_v);
if (ret)
goto err;
bkey_reflink_v_init(&r_v->k_i);
r_v->k.p = reflink_iter->pos;
bch2_key_resize(&r_v->k, e->k.size);
r_v->k.version = e->k.version;
set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) +
bkey_val_u64s(&e->k));
r_v->v.refcount = 0;
memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k));
bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i));
r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
ret = PTR_ERR_OR_ZERO(r_p);
if (ret)
goto err;
e->k.type = KEY_TYPE_reflink_p;
r_p = bkey_i_to_reflink_p(&e->k_i);
set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i));
err:
if (!IS_ERR(reflink_iter)) {
c->reflink_hint = reflink_iter->pos.offset;
bch2_trans_iter_put(trans, reflink_iter);
}
return ret;
}
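/*
 * Net effect of the transformation above, with hypothetical positions:
 *
 *   before:  extents btree:  (inode i, 0..128)  -> extent    { data ptrs }
 *   after:   extents btree:  (inode i, 0..128)  -> reflink_p { idx = n }
 *            reflink btree:  (0, n..n+128)      -> reflink_v { refcount, data ptrs }
 */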
static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
{
struct bkey_s_c k = bch2_btree_iter_peek(iter);
while (1) {
if (bkey_err(k))
return k;
if (bkey_cmp(iter->pos, end) >= 0)
return bkey_s_c_null;
if (k.k->type == KEY_TYPE_extent ||
k.k->type == KEY_TYPE_reflink_p)
return k;
k = bch2_btree_iter_next(iter);
}
}
s64 bch2_remap_range(struct bch_fs *c,
struct bch_inode_info *dst_inode,
struct bpos dst_start, struct bpos src_start,
u64 remap_sectors, u64 new_i_size)
{
struct btree_trans trans;
struct btree_iter *dst_iter, *src_iter;
struct bkey_s_c src_k;
BKEY_PADDED(k) new_dst, new_src;
struct bpos dst_end = dst_start, src_end = src_start;
struct bpos dst_want, src_want;
u64 src_done, dst_done;
int ret = 0;
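/* flag reflink usage in the superblock the first time it's used: */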
if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
mutex_lock(&c->sb_lock);
if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) {
c->disk_sb.sb->features[0] |=
cpu_to_le64(1ULL << BCH_FEATURE_REFLINK);
bch2_write_super(c);
}
mutex_unlock(&c->sb_lock);
}
dst_end.offset += remap_sectors;
src_end.offset += remap_sectors;
bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
BTREE_ITER_INTENT, 1);
dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start,
BTREE_ITER_INTENT, 2);
while (1) {
bch2_trans_begin_updates(&trans);
trans.mem_top = 0;
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto err;
}
src_k = get_next_src(src_iter, src_end);
ret = bkey_err(src_k);
if (ret)
goto btree_err;
src_done = bpos_min(src_iter->pos, src_end).offset -
src_start.offset;
dst_want = POS(dst_start.inode, dst_start.offset + src_done);
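/*
 * if the source iterator skipped ahead over a hole, punch the
 * corresponding range in the destination:
 */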
if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
dst_inode, new_i_size);
if (ret)
goto btree_err;
continue;
}
BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
if (!bkey_cmp(dst_iter->pos, dst_end))
break;
if (src_k.k->type == KEY_TYPE_extent) {
bkey_reassemble(&new_src.k, src_k);
src_k = bkey_i_to_s_c(&new_src.k);
bch2_cut_front(src_iter->pos, &new_src.k);
bch2_cut_back(src_end, &new_src.k.k);
ret = bch2_make_extent_indirect(&trans, src_iter,
bkey_i_to_extent(&new_src.k));
if (ret)
goto btree_err;
BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
}
if (src_k.k->type == KEY_TYPE_reflink_p) {
struct bkey_s_c_reflink_p src_p =
bkey_s_c_to_reflink_p(src_k);
struct bkey_i_reflink_p *dst_p =
bkey_reflink_p_init(&new_dst.k);
u64 offset = le64_to_cpu(src_p.v->idx) +
(src_iter->pos.offset -
bkey_start_offset(src_k.k));
dst_p->v.idx = cpu_to_le64(offset);
} else {
BUG();
}
new_dst.k.k.p = dst_iter->pos;
bch2_key_resize(&new_dst.k.k,
min(src_k.k->p.offset - src_iter->pos.offset,
dst_end.offset - dst_iter->pos.offset));
ret = bch2_extent_update(&trans, dst_inode, NULL, NULL,
dst_iter, &new_dst.k,
new_i_size, false, true, NULL);
if (ret)
goto btree_err;
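/* keep the source iterator in sync with how far the destination has gotten: */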
dst_done = dst_iter->pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(src_iter, src_want);
btree_err:
if (ret == -EINTR)
ret = 0;
if (ret)
goto err;
}
BUG_ON(bkey_cmp(dst_iter->pos, dst_end));
err:
BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
dst_done = dst_iter->pos.offset - dst_start.offset;
new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
ret = bch2_trans_exit(&trans) ?: ret;
mutex_lock(&dst_inode->ei_update_lock);
if (dst_inode->v.i_size < new_i_size) {
i_size_write(&dst_inode->v, new_i_size);
ret = bch2_write_inode_size(c, dst_inode, new_i_size,
ATTR_MTIME|ATTR_CTIME);
}
mutex_unlock(&dst_inode->ei_update_lock);
return dst_done ?: ret;
}
#endif /* NO_BCACHEFS_FS */
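As a usage sketch (not part of this commit), a clone-range ioctl path might drive bch2_remap_range() roughly as follows, under the same NO_BCACHEFS_FS guard; the helper name and the byte-to-sector conversion are illustrative assumptions:

/* Illustrative only: assumes sector-aligned byte offsets and length. */
static s64 clone_range_sketch(struct bch_fs *c,
			      struct bch_inode_info *src, u64 src_offset,
			      struct bch_inode_info *dst, u64 dst_offset,
			      u64 len)
{
	/* byte offsets -> 512-byte sectors; new_i_size stays in bytes */
	return bch2_remap_range(c, dst,
				POS(dst->v.i_ino, dst_offset >> 9),
				POS(src->v.i_ino, src_offset >> 9),
				len >> 9,
				dst_offset + len);
}

On success this returns the number of sectors remapped (bch2_remap_range() returns dst_done ?: ret), so a caller would scale it back to bytes before reporting progress.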
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REFLINK_H
#define _BCACHEFS_REFLINK_H
const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
enum merge_result bch2_reflink_p_merge(struct bch_fs *,
struct bkey_s, struct bkey_s);
#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
}
const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
}
#ifndef NO_BCACHEFS_FS
s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *,
struct bpos, struct bpos, u64, u64);
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_REFLINK_H */
...@@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ...@@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
extent_to_replicas(k, e); extent_to_replicas(k, e);
break; break;
case KEY_TYPE_extent: case KEY_TYPE_extent:
case KEY_TYPE_reflink_v:
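/* indirect extents carry data pointers just like extents; account as user data: */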
e->data_type = BCH_DATA_USER; e->data_type = BCH_DATA_USER;
extent_to_replicas(k, e); extent_to_replicas(k, e);
break; break;
......