Commit 166afc45 authored by Dave Chinner's avatar Dave Chinner

Merge tag 'reflink-speedups-5.19_2022-04-28' of...

Merge tag 'reflink-speedups-5.19_2022-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-5.19-for-next

xfs: fix reflink inefficiencies

As Dave Chinner has complained about on IRC, there are a couple of
things about reflink that are very inefficient.  First of all, we
limited the size of all bunmapi operations to avoid flooding the log
with defer ops in the worst case, but recent changes to the defer
ops code have solved that problem, so get rid of the bunmapi length
clamp.

Second, the log reservations for reflink operations are far far
larger than they need to be.  Shrink them to exactly what we need to
handle each deferred RUI and CUI log item, and no more.  Also reduce
logcount because we don't need 8 rolls per operation.  Introduce a
transaction reservation compatibility layer to avoid changing the
minimum log size calculations.
Signed-off-by: default avatarDave Chinner <david@fromorbit.com>
parents 956f1b8f 6ed7e509
......@@ -5280,7 +5280,6 @@ __xfs_bunmapi(
int whichfork; /* data or attribute fork */
xfs_fsblock_t sum;
xfs_filblks_t len = *rlen; /* length to unmap in file */
xfs_fileoff_t max_len;
xfs_fileoff_t end;
struct xfs_iext_cursor icur;
bool done = false;
......@@ -5299,16 +5298,6 @@ __xfs_bunmapi(
ASSERT(len > 0);
ASSERT(nexts >= 0);
/*
* Guesstimate how many blocks we can unmap without running the risk of
* blowing out the transaction with a mix of EFIs and reflink
* adjustments.
*/
if (tp && xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK)
max_len = min(len, xfs_refcount_max_unmap(tp->t_log_res));
else
max_len = len;
error = xfs_iread_extents(tp, ip, whichfork);
if (error)
return error;
......@@ -5347,7 +5336,7 @@ __xfs_bunmapi(
extno = 0;
while (end != (xfs_fileoff_t)-1 && end >= start &&
(nexts == 0 || extno < nexts) && max_len > 0) {
(nexts == 0 || extno < nexts)) {
/*
* Is the found extent after a hole in which end lives?
* Just back up to the previous extent, if so.
......@@ -5381,14 +5370,6 @@ __xfs_bunmapi(
if (del.br_startoff + del.br_blockcount > end + 1)
del.br_blockcount = end + 1 - del.br_startoff;
/* How much can we safely unmap? */
if (max_len < del.br_blockcount) {
del.br_startoff += del.br_blockcount - max_len;
if (!wasdel)
del.br_startblock += del.br_blockcount - max_len;
del.br_blockcount = max_len;
}
if (!isrt)
goto delete;
......@@ -5524,7 +5505,6 @@ __xfs_bunmapi(
if (error)
goto error0;
max_len -= del.br_blockcount;
end = del.br_startoff - 1;
nodelete:
/*
......
......@@ -14,6 +14,7 @@
#include "xfs_trans_space.h"
#include "xfs_da_btree.h"
#include "xfs_bmap_btree.h"
#include "xfs_trace.h"
/*
* Calculate the maximum length in bytes that would be required for a local
......@@ -36,6 +37,65 @@ xfs_log_calc_max_attrsetm_res(
M_RES(mp)->tr_attrsetrt.tr_logres * nblks;
}
/*
* Compute an alternate set of log reservation sizes for use exclusively with
* minimum log size calculations.
*/
static void
xfs_log_calc_trans_resv_for_minlogblocks(
struct xfs_mount *mp,
struct xfs_trans_resv *resv)
{
unsigned int rmap_maxlevels = mp->m_rmap_maxlevels;
/*
* In the early days of rmap+reflink, we always set the rmap maxlevels
* to 9 even if the AG was small enough that it would never grow to
* that height. Transaction reservation sizes influence the minimum
* log size calculation, which influences the size of the log that mkfs
* creates. Use the old value here to ensure that newly formatted
* small filesystems will mount on older kernels.
*/
if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp))
mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS;
xfs_trans_resv_calc(mp, resv);
if (xfs_has_reflink(mp)) {
/*
* In the early days of reflink, typical log operation counts
* were greatly overestimated.
*/
resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
resv->tr_itruncate.tr_logcount =
XFS_ITRUNCATE_LOG_COUNT_REFLINK;
resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
} else if (xfs_has_rmapbt(mp)) {
/*
* In the early days of non-reflink rmap, the impact of rmapbt
* updates on log counts were not taken into account at all.
*/
resv->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
resv->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
resv->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
}
/*
* In the early days of reflink, we did not use deferred refcount
* update log items, so log reservations must be recomputed using the
* old calculations.
*/
resv->tr_write.tr_logres =
xfs_calc_write_reservation_minlogsize(mp);
resv->tr_itruncate.tr_logres =
xfs_calc_itruncate_reservation_minlogsize(mp);
resv->tr_qm_dqalloc.tr_logres =
xfs_calc_qm_dqalloc_reservation_minlogsize(mp);
/* Put everything back the way it was. This goes at the end. */
mp->m_rmap_maxlevels = rmap_maxlevels;
}
/*
* Iterate over the log space reservation table to figure out and return
* the maximum one in terms of the pre-calculated values which were done
......@@ -46,19 +106,25 @@ xfs_log_get_max_trans_res(
struct xfs_mount *mp,
struct xfs_trans_res *max_resp)
{
struct xfs_trans_resv resv = {};
struct xfs_trans_res *resp;
struct xfs_trans_res *end_resp;
unsigned int i;
int log_space = 0;
int attr_space;
attr_space = xfs_log_calc_max_attrsetm_res(mp);
resp = (struct xfs_trans_res *)M_RES(mp);
end_resp = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (; resp < end_resp; resp++) {
xfs_log_calc_trans_resv_for_minlogblocks(mp, &resv);
resp = (struct xfs_trans_res *)&resv;
end_resp = (struct xfs_trans_res *)(&resv + 1);
for (i = 0; resp < end_resp; i++, resp++) {
int tmp = resp->tr_logcount > 1 ?
resp->tr_logres * resp->tr_logcount :
resp->tr_logres;
trace_xfs_trans_resv_calc_minlogsize(mp, i, resp);
if (log_space < tmp) {
log_space = tmp;
*max_resp = *resp; /* struct copy */
......@@ -66,9 +132,10 @@ xfs_log_get_max_trans_res(
}
if (attr_space > log_space) {
*max_resp = M_RES(mp)->tr_attrsetm; /* struct copy */
*max_resp = resv.tr_attrsetm; /* struct copy */
max_resp->tr_logres = attr_space;
}
trace_xfs_log_get_max_trans_res(mp, max_resp);
}
/*
......
......@@ -886,8 +886,13 @@ xfs_refcount_still_have_space(
{
unsigned long overhead;
overhead = cur->bc_ag.refc.shape_changes *
xfs_allocfree_log_count(cur->bc_mp, 1);
/*
* Worst case estimate: full splits of the free space and rmap btrees
* to handle each of the shape changes to the refcount btree.
*/
overhead = xfs_allocfree_block_count(cur->bc_mp,
cur->bc_ag.refc.shape_changes);
overhead += cur->bc_mp->m_refc_maxlevels;
overhead *= cur->bc_mp->m_sb.sb_blocksize;
/*
......@@ -960,6 +965,7 @@ xfs_refcount_adjust_extents(
* Either cover the hole (increment) or
* delete the range (decrement).
*/
cur->bc_ag.refc.nr_ops++;
if (tmp.rc_refcount) {
error = xfs_refcount_insert(cur, &tmp,
&found_tmp);
......@@ -970,7 +976,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
cur->bc_ag.refc.nr_ops++;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
cur->bc_ag.pag->pag_agno,
......@@ -1001,11 +1006,11 @@ xfs_refcount_adjust_extents(
ext.rc_refcount += adj;
trace_xfs_refcount_modify_extent(cur->bc_mp,
cur->bc_ag.pag->pag_agno, &ext);
cur->bc_ag.refc.nr_ops++;
if (ext.rc_refcount > 1) {
error = xfs_refcount_update(cur, &ext);
if (error)
goto out_error;
cur->bc_ag.refc.nr_ops++;
} else if (ext.rc_refcount == 1) {
error = xfs_refcount_delete(cur, &found_rec);
if (error)
......@@ -1014,7 +1019,6 @@ xfs_refcount_adjust_extents(
error = -EFSCORRUPTED;
goto out_error;
}
cur->bc_ag.refc.nr_ops++;
goto advloop;
} else {
fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
......
......@@ -67,14 +67,17 @@ extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
* log (plus any key updates) so we'll conservatively assume 32 bytes
* per record. We must also leave space for btree splits on both ends
* of the range and space for the CUD and a new CUI.
*
* Each EFI that we attach to the transaction is assumed to consume ~32 bytes.
* This is a low estimate for an EFI tracking a single extent (16 bytes for the
* EFI header, 16 for the extent, and 12 for the xlog op header), but the
* estimate is acceptable if there's more than one extent being freed.
* In the worst case of freeing every other block during a refcount decrease
* operation, we amortize the space used for one EFI log item across 16
* extents.
*/
#define XFS_REFCOUNT_ITEM_OVERHEAD 32
static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res)
{
return (log_res * 3 / 4) / XFS_REFCOUNT_ITEM_OVERHEAD;
}
extern int xfs_refcount_has_record(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len, bool *exists);
union xfs_btree_rec;
......
This diff is collapsed.
......@@ -73,7 +73,6 @@ struct xfs_trans_resv {
#define XFS_DEFAULT_LOG_COUNT 1
#define XFS_DEFAULT_PERM_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT 2
#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_INACTIVE_LOG_COUNT 2
#define XFS_CREATE_LOG_COUNT 2
#define XFS_CREATE_TMPFILE_LOG_COUNT 2
......@@ -83,13 +82,24 @@ struct xfs_trans_resv {
#define XFS_LINK_LOG_COUNT 2
#define XFS_RENAME_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT 2
#define XFS_WRITE_LOG_COUNT_REFLINK 8
#define XFS_ADDAFORK_LOG_COUNT 2
#define XFS_ATTRINVAL_LOG_COUNT 1
#define XFS_ATTRSET_LOG_COUNT 3
#define XFS_ATTRRM_LOG_COUNT 3
/*
* Original log operation counts were overestimated in the early days of
* reflink. These are retained here purely for minimum log size calculations
* and must not be used for runtime reservations.
*/
#define XFS_ITRUNCATE_LOG_COUNT_REFLINK 8
#define XFS_WRITE_LOG_COUNT_REFLINK 8
void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops);
unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp);
unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp);
#endif /* __XFS_TRANS_RESV_H__ */
......@@ -586,21 +586,21 @@ xfs_reflink_cancel_cow_range(
STATIC int
xfs_reflink_end_cow_extent(
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb,
xfs_fileoff_t *end_fsb)
xfs_fileoff_t *offset_fsb,
xfs_fileoff_t end_fsb)
{
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
struct xfs_bmbt_irec got, del, data;
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp;
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
xfs_filblks_t rlen;
unsigned int resblks;
int nmaps;
int error;
/* No COW extents? That's easy! */
if (ifp->if_bytes == 0) {
*end_fsb = offset_fsb;
*offset_fsb = end_fsb;
return 0;
}
......@@ -631,42 +631,66 @@ xfs_reflink_end_cow_extent(
* left by the time I/O completes for the loser of the race. In that
* case we are done.
*/
if (!xfs_iext_lookup_extent_before(ip, ifp, end_fsb, &icur, &got) ||
got.br_startoff + got.br_blockcount <= offset_fsb) {
*end_fsb = offset_fsb;
if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
}
/*
* Structure copy @got into @del, then trim @del to the range that we
* were asked to remap. We preserve @got for the eventual CoW fork
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
* need to skip them. Preserve @got for the eventual CoW fork
* deletion; from now on @del represents the mapping that we're
* actually remapping.
*/
while (!xfs_bmap_is_written_extent(&got)) {
if (!xfs_iext_next_extent(ifp, &icur, &got) ||
got.br_startoff >= end_fsb) {
*offset_fsb = end_fsb;
goto out_cancel;
}
}
del = got;
xfs_trim_extent(&del, offset_fsb, *end_fsb - offset_fsb);
ASSERT(del.br_blockcount > 0);
/* Grab the corresponding mapping in the data fork. */
nmaps = 1;
error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
&nmaps, 0);
if (error)
goto out_cancel;
/* We can only remap the smaller of the two extent sizes. */
data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
del.br_blockcount = data.br_blockcount;
trace_xfs_reflink_cow_remap_from(ip, &del);
trace_xfs_reflink_cow_remap_to(ip, &data);
if (xfs_bmap_is_real_extent(&data)) {
/*
* Only remap real extents that contain data. With AIO, speculative
* preallocations can leak into the range we are called upon, and we
* need to skip them.
* If the extent we're remapping is backed by storage (written
* or not), unmap the extent and drop its refcount.
*/
if (!xfs_bmap_is_written_extent(&got)) {
*end_fsb = del.br_startoff;
goto out_cancel;
}
xfs_bmap_unmap_extent(tp, ip, &data);
xfs_refcount_decrease_extent(tp, &data);
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
-data.br_blockcount);
} else if (data.br_startblock == DELAYSTARTBLOCK) {
int done;
/* Unmap the old blocks in the data fork. */
rlen = del.br_blockcount;
error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1);
/*
* If the extent we're remapping is a delalloc reservation,
* we can use the regular bunmapi function to release the
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
error = xfs_bunmapi(NULL, ip, data.br_startoff,
data.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
/* Trim the extent to whatever got unmapped. */
xfs_trim_extent(&del, del.br_startoff + rlen, del.br_blockcount - rlen);
trace_xfs_reflink_cow_remap(ip, &del);
ASSERT(done);
}
/* Free the CoW orphan record. */
xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
......@@ -687,7 +711,7 @@ xfs_reflink_end_cow_extent(
return error;
/* Update the caller about how much progress we made. */
*end_fsb = del.br_startoff;
*offset_fsb = del.br_startoff + del.br_blockcount;
return 0;
out_cancel:
......@@ -715,7 +739,7 @@ xfs_reflink_end_cow(
end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
/*
* Walk backwards until we're out of the I/O range. The loop function
* Walk forwards until we've remapped the I/O range. The loop function
* repeatedly cycles the ILOCK to allocate one transaction per remapped
* extent.
*
......@@ -747,7 +771,7 @@ xfs_reflink_end_cow(
* blocks will be remapped.
*/
while (end_fsb > offset_fsb && !error)
error = xfs_reflink_end_cow_extent(ip, offset_fsb, &end_fsb);
error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
if (error)
trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
......@@ -1138,7 +1162,7 @@ xfs_reflink_remap_extent(
xfs_refcount_decrease_extent(tp, &smap);
qdelta -= smap.br_blockcount;
} else if (smap.br_startblock == DELAYSTARTBLOCK) {
xfs_filblks_t len = smap.br_blockcount;
int done;
/*
* If the extent we're unmapping is a delalloc reservation,
......@@ -1146,10 +1170,11 @@ xfs_reflink_remap_extent(
* incore state. Dropping the delalloc reservation takes care
* of the quota reservation for us.
*/
error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
error = xfs_bunmapi(NULL, ip, smap.br_startoff,
smap.br_blockcount, 0, 1, &done);
if (error)
goto out_cancel;
ASSERT(len == 0);
ASSERT(done);
}
/*
......
......@@ -3408,7 +3408,8 @@ DEFINE_INODE_IREC_EVENT(xfs_reflink_convert_cow);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from);
DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
......@@ -3503,7 +3504,7 @@ DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
TRACE_EVENT(xfs_trans_resv_calc,
DECLARE_EVENT_CLASS(xfs_trans_resv_class,
TP_PROTO(struct xfs_mount *mp, unsigned int type,
struct xfs_trans_res *res),
TP_ARGS(mp, type, res),
......@@ -3527,6 +3528,33 @@ TRACE_EVENT(xfs_trans_resv_calc,
__entry->logres,
__entry->logcount,
__entry->logflags)
)
#define DEFINE_TRANS_RESV_EVENT(name) \
DEFINE_EVENT(xfs_trans_resv_class, name, \
TP_PROTO(struct xfs_mount *mp, unsigned int type, \
struct xfs_trans_res *res), \
TP_ARGS(mp, type, res))
DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc);
DEFINE_TRANS_RESV_EVENT(xfs_trans_resv_calc_minlogsize);
TRACE_EVENT(xfs_log_get_max_trans_res,
TP_PROTO(struct xfs_mount *mp, const struct xfs_trans_res *res),
TP_ARGS(mp, res),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(uint, logres)
__field(int, logcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->logres = res->tr_logres;
__entry->logcount = res->tr_logcount;
),
TP_printk("dev %d:%d logres %u logcount %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->logres,
__entry->logcount)
);
DECLARE_EVENT_CLASS(xfs_trans_class,
......
......@@ -32,7 +32,6 @@ static void
xfs_trans_trace_reservations(
struct xfs_mount *mp)
{
struct xfs_trans_res resv;
struct xfs_trans_res *res;
struct xfs_trans_res *end_res;
int i;
......@@ -41,8 +40,6 @@ xfs_trans_trace_reservations(
end_res = (struct xfs_trans_res *)(M_RES(mp) + 1);
for (i = 0; res < end_res; i++, res++)
trace_xfs_trans_resv_calc(mp, i, res);
xfs_log_get_max_trans_res(mp, &resv);
trace_xfs_trans_resv_calc(mp, -1, &resv);
}
#else
# define xfs_trans_trace_reservations(mp)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment