Commit 84d69619 authored by Darrick J. Wong

xfs: preallocate blocks for worst-case btree expansion

To gracefully handle the situation where a CoW operation turns a
single refcount extent into a lot of tiny ones and then runs out of
space when a tree split has to happen, use the per-AG reserved block
pool to pre-allocate all the space we'll ever need for a maximal
btree.  For a 4K block size, this only costs an overhead of 0.3% of
available disk space.

When reflink is enabled, we have an unfortunate problem with rmap --
since we can share a block billions of times, this means that the
reverse mapping btree can expand basically infinitely.  When an AG is
so full that there are no free blocks with which to expand the rmapbt,
the filesystem will shut down hard.

This is rather annoying to the user, so use the AG reservation code to
reserve a "reasonable" amount of space for rmap.  We'll prevent
reflinks and CoW operations if we think we're getting close to
exhausting an AG's free space rather than shutting down, but this
permanent reservation should be enough for "most" users.  Hopefully.
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
[hch@lst.de: ensure that we invalidate the freed btree buffer]
Signed-off-by: Christoph Hellwig <hch@lst.de>
parent f7ca3522
......@@ -38,6 +38,7 @@
#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
/*
* Per-AG Block Reservations
......@@ -228,6 +229,11 @@ xfs_ag_resv_init(
if (pag->pag_meta_resv.ar_asked == 0) {
ask = used = 0;
error = xfs_refcountbt_calc_reserves(pag->pag_mount,
pag->pag_agno, &ask, &used);
if (error)
goto out;
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
ask, used);
if (error)
......@@ -238,6 +244,11 @@ xfs_ag_resv_init(
if (pag->pag_agfl_resv.ar_asked == 0) {
ask = used = 0;
error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
&ask, &used);
if (error)
goto out;
error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
if (error)
goto out;
......
......@@ -79,6 +79,8 @@ xfs_refcountbt_alloc_block(
struct xfs_alloc_arg args; /* block allocation args */
int error; /* error return value */
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
memset(&args, 0, sizeof(args));
args.tp = cur->bc_tp;
args.mp = cur->bc_mp;
......@@ -88,6 +90,7 @@ xfs_refcountbt_alloc_block(
args.firstblock = args.fsbno;
xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
args.minlen = args.maxlen = args.prod = 1;
args.resv = XFS_AG_RESV_METADATA;
error = xfs_alloc_vextent(&args);
if (error)
......@@ -125,16 +128,19 @@ xfs_refcountbt_free_block(
struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
struct xfs_owner_info oinfo;
int error;
trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
xfs_bmap_add_free(mp, cur->bc_private.a.dfops, fsbno, 1,
&oinfo);
error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
XFS_AG_RESV_METADATA);
if (error)
return error;
return 0;
return error;
}
STATIC int
......@@ -387,3 +393,59 @@ xfs_refcountbt_compute_maxlevels(
mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
mp->m_refc_mnr, mp->m_sb.sb_agblocks);
}
/*
 * Size a refcount btree that has to hold @len records.
 *
 * This simply hands the refcount btree's mp->m_refc_mnr limits to the
 * generic xfs_btree_calc_size() helper and returns whatever it computes.
 */
xfs_extlen_t
xfs_refcountbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	xfs_extlen_t		tree_size;

	tree_size = xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
	return tree_size;
}
/*
 * Largest refcount btree we size for: one sized for sb_agblocks records,
 * i.e. as many records as the superblock says an AG has blocks.
 *
 * Returns 0 while mp->m_refc_mxr[0] is still zero.
 */
xfs_extlen_t
xfs_refcountbt_max_size(
	struct xfs_mount	*mp)
{
	if (mp->m_refc_mxr[0] != 0)
		return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);

	/* Not initialized yet (this can happen in mkfs): nothing to size. */
	return 0;
}
/*
 * Work out the refcount btree's contribution to a per-AG reservation.
 *
 * @ask is increased by the maximum refcount btree size and @used by the
 * agf_refcount_blocks count read from the AGF of @agno.  Both are
 * accumulated into, not overwritten.
 *
 * Returns 0 without touching @ask or @used when the filesystem does not
 * have the reflink feature.  If reading the AGF fails, that error is
 * returned; @ask has already been increased at that point, @used has not.
 */
int
xfs_refcountbt_calc_reserves(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agf_bp;
	xfs_extlen_t		blocks_in_use;
	int			error;

	/* No reflink, no refcount btree: nothing to reserve. */
	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return 0;

	*ask += xfs_refcountbt_max_size(mp);

	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
	if (error)
		return error;

	/* Copy the counter out before letting go of the AGF buffer. */
	blocks_in_use = be32_to_cpu(XFS_BUF_TO_AGF(agf_bp)->agf_refcount_blocks);
	xfs_buf_relse(agf_bp);

	*used += blocks_in_use;
	return 0;
}
......@@ -64,4 +64,11 @@ extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
bool leaf);
extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
#endif /* __XFS_REFCOUNT_BTREE_H__ */
......@@ -35,6 +35,7 @@
#include "xfs_cksum.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_ag_resv.h"
/*
* Reverse map btree.
......@@ -533,3 +534,62 @@ xfs_rmapbt_compute_maxlevels(
mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
}
/*
 * Size a reverse-mapping (rmap) btree that has to hold @len records.
 *
 * This simply hands the rmap btree's mp->m_rmap_mnr limits to the
 * generic xfs_btree_calc_size() helper and returns whatever it computes.
 */
xfs_extlen_t
xfs_rmapbt_calc_size(
	struct xfs_mount	*mp,
	unsigned long long	len)
{
	xfs_extlen_t		tree_size;

	tree_size = xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
	return tree_size;
}
/*
 * Largest rmap btree we size for: one sized for sb_agblocks records,
 * i.e. as many records as the superblock says an AG has blocks.
 *
 * Returns 0 while mp->m_rmap_mxr[0] is still zero.
 */
xfs_extlen_t
xfs_rmapbt_max_size(
	struct xfs_mount	*mp)
{
	if (mp->m_rmap_mxr[0] != 0)
		return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);

	/* Not initialized yet (this can happen in mkfs): nothing to size. */
	return 0;
}
/*
 * Work out the rmap btree's contribution to a per-AG reservation.
 *
 * @ask is increased by the larger of 1% of the AG's blocks and the
 * maximum rmap btree size; @used is increased by the agf_rmap_blocks
 * count read from the AGF of @agno.  Both are accumulated into, not
 * overwritten.
 *
 * Returns 0 without touching @ask or @used when the filesystem does not
 * have the rmapbt feature.  If reading the AGF fails, that error is
 * returned; @ask has already been increased at that point, @used has not.
 */
int
xfs_rmapbt_calc_reserves(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_extlen_t		*ask,
	xfs_extlen_t		*used)
{
	struct xfs_buf		*agf_bp;
	xfs_extlen_t		blocks_in_use;
	int			error;

	/* No rmapbt feature, no rmap btree: nothing to reserve. */
	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
		return 0;

	/* Whichever is bigger: 1% of the AG, or a maximally sized rmapbt. */
	*ask += max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));

	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agf_bp);
	if (error)
		return error;

	/* Copy the counter out before letting go of the AGF buffer. */
	blocks_in_use = be32_to_cpu(XFS_BUF_TO_AGF(agf_bp)->agf_rmap_blocks);
	xfs_buf_relse(agf_bp);

	*used += blocks_in_use;
	return 0;
}
......@@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
#endif /* __XFS_RMAP_BTREE_H__ */
......@@ -43,6 +43,7 @@
#include "xfs_log.h"
#include "xfs_filestream.h"
#include "xfs_rmap.h"
#include "xfs_ag_resv.h"
/*
* File system operations
......@@ -630,6 +631,11 @@ xfs_growfs_data_private(
xfs_set_low_space_thresholds(mp);
mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
/* Reserve AG metadata blocks. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out;
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
error = 0;
......@@ -680,6 +686,8 @@ xfs_growfs_data_private(
continue;
}
}
out:
return saved_error ? saved_error : error;
error0:
......@@ -989,3 +997,59 @@ xfs_do_force_shutdown(
"Please umount the filesystem and rectify the problem(s)");
}
}
/*
 * Set up the per-AG metadata block reservations for every AG.
 *
 * xfs_ag_resv_init() is called on each AG in turn.  A failure does not
 * stop the walk; only the first nonzero error is remembered and returned.
 * If that error is anything other than -ENOSPC, a warning is logged and
 * the filesystem is force-shut down before returning.
 */
int
xfs_fs_reserve_ag_blocks(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	int			first_err = 0;
	int			ret;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);
		ret = xfs_ag_resv_init(pag);
		xfs_perag_put(pag);

		/* Latch the first failure, but keep visiting the other AGs. */
		if (!first_err)
			first_err = ret;
	}

	/* Success and -ENOSPC are both handed back without further action. */
	if (first_err == 0 || first_err == -ENOSPC)
		return first_err;

	xfs_warn(mp,
		"Error %d reserving per-AG metadata reserve pool.", first_err);
	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
	return first_err;
}
/*
 * Tear down the per-AG metadata block reservations for every AG.
 *
 * xfs_ag_resv_free() is called on each AG in turn.  A failure does not
 * stop the walk; only the first nonzero error is remembered, logged as a
 * warning and returned.
 */
int
xfs_fs_unreserve_ag_blocks(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	int			first_err = 0;
	int			ret;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);
		ret = xfs_ag_resv_free(pag);
		xfs_perag_put(pag);

		/* Latch the first failure, but keep visiting the other AGs. */
		if (!first_err)
			first_err = ret;
	}

	if (!first_err)
		return 0;

	xfs_warn(mp,
		"Error %d freeing per-AG metadata reserve pool.", first_err);
	return first_err;
}
......@@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
xfs_fsop_resblks_t *outval);
extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
......@@ -995,10 +995,17 @@ xfs_mountfs(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
goto out_quota;
}
/* Reserve AG blocks for future btree expansion. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
goto out_agresv;
}
return 0;
out_agresv:
xfs_fs_unreserve_ag_blocks(mp);
out_quota:
xfs_qm_unmount_quotas(mp);
out_rtunmount:
......@@ -1043,6 +1050,7 @@ xfs_unmountfs(
cancel_delayed_work_sync(&mp->m_eofblocks_work);
xfs_fs_unreserve_ag_blocks(mp);
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
IRELE(mp->m_rootip);
......
......@@ -1325,10 +1325,22 @@ xfs_fs_remount(
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
/* Create the per-AG metadata reservation pool. */
error = xfs_fs_reserve_ag_blocks(mp);
if (error && error != -ENOSPC)
return error;
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
/* Free the per-AG metadata reservation pool. */
error = xfs_fs_unreserve_ag_blocks(mp);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
}
/*
* Before we sync the metadata, we need to free up the reserve
* block pool so that the used block count in the superblock on
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment