Commit 8c3c0743 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull more xfs updates from Darrick Wong:
 "As promised last week, this batch changes how xfs interacts with
  memory reclaim; how the log batches and throttles log items; how hard
  writes near ENOSPC will try to squeeze more space out of the
  filesystem; and hopefully fix the last of the umount hangs after a
  catastrophic failure.

  Summary:

   - Validate the realtime geometry in the superblock when mounting

   - Refactor a bunch of tricky flag handling in the log code

   - Flush the CIL more judiciously so that we don't wait until there
     are millions of log items consuming a lot of memory.

   - Throttle transaction commits to prevent the xfs frontend from
     flooding the CIL with too many log items.

   - Account metadata buffers correctly for memory reclaim.

   - Mark slabs properly for memory reclaim. These should help reclaim
     run more effectively when XFS is using a lot of memory.

   - Don't write a garbage log record at unmount time if we're trying to
     trigger summary counter recalculation at next mount.

   - Don't block the AIL on locked dquot/inode buffers; instead trigger
     its backoff mechanism to give the lock holder a chance to finish
     up.

   - Ratelimit writeback flushing when buffered writes encounter ENOSPC.

   - Other minor cleanups.

   - Make reflink a synchronous operation when the fs is mounted with
     wsync or sync, which means that now we force the log to disk to
     record the changes"

* tag 'xfs-5.7-merge-12' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (26 commits)
  xfs: reflink should force the log out if mounted with wsync
  xfs: factor out a new xfs_log_force_inode helper
  xfs: fix inode number overflow in ifree cluster helper
  xfs: remove redundant variable assignment in xfs_symlink()
  xfs: ratelimit inode flush on buffered write ENOSPC
  xfs: return locked status of inode buffer on xfsaild push
  xfs: trylock underlying buffer on dquot flush
  xfs: remove unnecessary ternary from xfs_create
  xfs: don't write a corrupt unmount record to force summary counter recalc
  xfs: factor inode lookup from xfs_ifree_cluster
  xfs: tail updates only need to occur when LSN changes
  xfs: factor common AIL item deletion code
  xfs: correctly acount for reclaimable slabs
  xfs: Improve metadata buffer reclaim accountability
  xfs: don't allow log IO to be throttled
  xfs: Throttle commits on delayed background CIL push
  xfs: Lower CIL flush limit for large logs
  xfs: remove some stale comments from the log code
  xfs: refactor unmount record writing
  xfs: merge xlog_commit_record with xlog_write_done
  ...
parents d3e5e977 5833112d
......@@ -328,6 +328,38 @@ xfs_validate_sb_common(
return -EFSCORRUPTED;
}
/* Validate the realtime geometry; stolen from xfs_repair */
if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
xfs_notice(mp,
"realtime extent sanity check failed");
return -EFSCORRUPTED;
}
if (sbp->sb_rblocks == 0) {
if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 ||
sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) {
xfs_notice(mp,
"realtime zeroed geometry check failed");
return -EFSCORRUPTED;
}
} else {
uint64_t rexts;
uint64_t rbmblocks;
rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize);
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
if (sbp->sb_rextents != rexts ||
sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
return -EFSCORRUPTED;
}
}
if (sbp->sb_unit) {
if (!xfs_sb_version_hasdalign(sbp) ||
sbp->sb_unit > sbp->sb_width ||
......
......@@ -327,6 +327,9 @@ xfs_buf_free(
__free_page(page);
}
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab +=
bp->b_page_count;
} else if (bp->b_flags & _XBF_KMEM)
kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
......@@ -2114,9 +2117,11 @@ xfs_buf_delwri_pushbuf(
int __init
xfs_buf_init(void)
{
xfs_buf_zone = kmem_cache_create("xfs_buf",
sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN, NULL);
xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
SLAB_MEM_SPREAD,
NULL);
if (!xfs_buf_zone)
goto out;
......
......@@ -1105,8 +1105,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen, 0, &bp,
&xfs_dquot_buf_ops);
mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
&bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
......@@ -1177,7 +1177,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
return -EIO;
return error;
}
/*
......
......@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
}
} else if (error == -EAGAIN)
rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
......
......@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
......@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
xfs_lsn_t lsn = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
......
......@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
return xfs_log_force_inode(ip);
}
STATIC int
......@@ -1069,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
if (ret)
goto out_unlock;
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
......
......@@ -1200,8 +1200,7 @@ xfs_create(
unlock_dp_on_error = false;
error = xfs_dir_createname(tp, dp, name, ip->i_ino,
resblks ?
resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
resblks - XFS_IALLOC_SPACE_RES(mp));
if (error) {
ASSERT(error != -ENOSPC);
goto out_trans_cancel;
......@@ -2503,6 +2502,88 @@ xfs_iunlink_remove(
return error;
}
/*
* Look up the inode number specified and mark it stale if it is found. If it is
* dirty, return the inode so it can be attached to the cluster buffer so it can
* be processed appropriately when the cluster free transaction completes.
*/
static struct xfs_inode *
xfs_ifree_get_one_inode(
struct xfs_perag *pag,
struct xfs_inode *free_ip,
xfs_ino_t inum)
{
struct xfs_mount *mp = pag->pag_mount;
struct xfs_inode *ip;
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
/* Inode not in memory, nothing to do */
if (!ip)
goto out_rcu_unlock;
/*
* because this is an RCU protected lookup, we could find a recently
* freed or even reallocated inode during the lookup. We need to check
* under the i_flags_lock for a valid inode here. Skip it if it is not
* valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
goto out_rcu_unlock;
}
spin_unlock(&ip->i_flags_lock);
/*
* Don't try to lock/unlock the current inode, but we _cannot_ skip the
* other inodes that we did not find in the list attached to the buffer
* and are not already marked stale. If we can't lock it, back off and
* retry.
*/
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're racing with
* freeing in xfs_reclaim_inode(). See the comments in that
* function for more information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out_rcu_unlock;
}
}
rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
/*
* We don't need to attach clean inodes or those only with unlogged
* changes (which we throw away, anyway).
*/
if (!ip->i_itemp || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
goto out_no_inode;
}
return ip;
out_rcu_unlock:
rcu_read_unlock();
out_no_inode:
return NULL;
}
/*
* A big issue when freeing the inode cluster is that we _cannot_ skip any
* inodes that are in memory - they all must be marked stale and attached to
......@@ -2603,77 +2684,11 @@ xfs_ifree_cluster(
* even trying to lock them.
*/
for (i = 0; i < igeo->inodes_per_cluster; i++) {
retry:
rcu_read_lock();
ip = radix_tree_lookup(&pag->pag_ici_root,
XFS_INO_TO_AGINO(mp, (inum + i)));
/* Inode not in memory, nothing to do */
if (!ip) {
rcu_read_unlock();
ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
if (!ip)
continue;
}
/*
* because this is an RCU protected lookup, we could
* find a recently freed or even reallocated inode
* during the lookup. We need to check under the
* i_flags_lock for a valid inode here. Skip it if it
* is not valid, the wrong inode or stale.
*/
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != inum + i ||
__xfs_iflags_test(ip, XFS_ISTALE)) {
spin_unlock(&ip->i_flags_lock);
rcu_read_unlock();
continue;
}
spin_unlock(&ip->i_flags_lock);
/*
* Don't try to lock/unlock the current inode, but we
* _cannot_ skip the other inodes that we did not find
* in the list attached to the buffer and are not
* already marked stale. If we can't lock it, back off
* and retry.
*/
if (ip != free_ip) {
if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
rcu_read_unlock();
delay(1);
goto retry;
}
/*
* Check the inode number again in case we're
* racing with freeing in xfs_reclaim_inode().
* See the comments in that function for more
* information as to why the initial check is
* not sufficient.
*/
if (ip->i_ino != inum + i) {
xfs_iunlock(ip, XFS_ILOCK_EXCL);
rcu_read_unlock();
continue;
}
}
rcu_read_unlock();
xfs_iflock(ip);
xfs_iflags_set(ip, XFS_ISTALE);
/*
* we don't need to attach clean inodes or those only
* with unlogged changes (which we throw away, anyway).
*/
iip = ip->i_itemp;
if (!iip || xfs_inode_clean(ip)) {
ASSERT(ip != free_ip);
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
continue;
}
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
iip->ili_fsync_fields = 0;
......@@ -3930,3 +3945,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
/*
* Ensure all commited transactions touching the inode are written to the log.
*/
int
xfs_log_force_inode(
struct xfs_inode *ip)
{
xfs_lsn_t lsn = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
lsn = ip->i_itemp->ili_last_lsn;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
if (!lsn)
return 0;
return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
}
......@@ -426,6 +426,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
......
......@@ -552,7 +552,8 @@ xfs_inode_item_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
}
} else if (error == -EAGAIN)
rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
......@@ -730,29 +731,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
bool mlip_changed = false;
xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
mlip_changed |= xfs_ail_delete_one(ailp, blip);
else {
blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
/*
* xfs_ail_update_finish() only cares about the
* lsn of the first tail item removed, any
* others will be at the same or higher lsn so
* we just ignore them.
*/
xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
if (!tail_lsn && lsn)
tail_lsn = lsn;
} else {
xfs_clear_li_failed(blip);
}
}
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
xlog_assign_tail_lsn_locked(ailp->ail_mount);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
}
spin_unlock(&ailp->ail_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->ail_mount);
xfs_ail_update_finish(ailp, tail_lsn);
}
/*
......
This diff is collapsed.
......@@ -105,10 +105,6 @@ struct xfs_log_item;
struct xfs_item_ops;
struct xfs_trans;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
struct xlog_ticket *ticket,
struct xlog_in_core **iclog,
bool regrant);
int xfs_log_force(struct xfs_mount *mp, uint flags);
int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
int *log_forced);
......
......@@ -668,6 +668,11 @@ xlog_cil_push_work(
push_seq = cil->xc_push_seq;
ASSERT(push_seq <= ctx->sequence);
/*
* Wake up any background push waiters now this context is being pushed.
*/
wake_up_all(&ctx->push_wait);
/*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
......@@ -744,6 +749,7 @@ xlog_cil_push_work(
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
init_waitqueue_head(&new_ctx->push_wait);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
......@@ -801,7 +807,7 @@ xlog_cil_push_work(
lvhdr.lv_iovecp = &lhdr;
lvhdr.lv_next = ctx->lv_chain;
error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0, true);
if (error)
goto out_abort_free_ticket;
......@@ -839,10 +845,11 @@ xlog_cil_push_work(
}
spin_unlock(&cil->xc_push_lock);
/* xfs_log_done always frees the ticket on error. */
commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false);
if (commit_lsn == -1)
goto out_abort;
error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn);
if (error)
goto out_abort_free_ticket;
xfs_log_ticket_ungrant(log, tic);
spin_lock(&commit_iclog->ic_callback_lock);
if (commit_iclog->ic_state == XLOG_STATE_IOERROR) {
......@@ -875,7 +882,7 @@ xlog_cil_push_work(
return;
out_abort_free_ticket:
xfs_log_ticket_put(tic);
xfs_log_ticket_ungrant(log, tic);
out_abort:
ASSERT(XLOG_FORCED_SHUTDOWN(log));
xlog_cil_committed(ctx);
......@@ -890,7 +897,7 @@ xlog_cil_push_work(
*/
static void
xlog_cil_push_background(
struct xlog *log)
struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
......@@ -904,14 +911,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
up_read(&cil->xc_ctx_lock);
return;
}
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
/*
* Drop the context lock now, we can't hold that if we need to sleep
* because we are over the blocking threshold. The push_lock is still
* held, so blocking threshold sleep/wakeup is still correctly
* serialised here.
*/
up_read(&cil->xc_ctx_lock);
/*
* If we are well over the space limit, throttle the work that is being
* done until the push work on this context has begun.
*/
if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
xlog_wait(&cil->xc_ctx->push_wait, &cil->xc_push_lock);
return;
}
spin_unlock(&cil->xc_push_lock);
}
......@@ -1007,7 +1036,10 @@ xfs_log_commit_cil(
if (commit_lsn)
*commit_lsn = xc_commit_lsn;
xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (regrant && !XLOG_FORCED_SHUTDOWN(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
else
xfs_log_ticket_ungrant(log, tp->t_ticket);
tp->t_ticket = NULL;
xfs_trans_unreserve_and_mod_sb(tp);
......@@ -1028,9 +1060,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
xlog_cil_push_background(log);
up_read(&cil->xc_ctx_lock);
/* xlog_cil_push_background() releases cil->xc_ctx_lock */
xlog_cil_push_background(log);
}
/*
......@@ -1189,6 +1221,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
init_waitqueue_head(&ctx->push_wait);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
......
......@@ -51,13 +51,11 @@ enum xlog_iclog_state {
};
/*
* Flags to log ticket
* Log ticket flags
*/
#define XLOG_TIC_INITED 0x1 /* has been initialized */
#define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */
#define XLOG_TIC_PERM_RESERV 0x1 /* permanent reservation */
#define XLOG_TIC_FLAGS \
{ XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \
{ XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" }
/*
......@@ -242,6 +240,7 @@ struct xfs_cil_ctx {
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
wait_queue_head_t push_wait; /* background push throttle */
struct work_struct discard_endio_work;
};
......@@ -318,13 +317,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
* In order to keep background CIL push efficient, we will set a lower
* threshold at which background pushing is attempted without blocking current
* transaction commits. A separate, higher bound defines when CIL pushes are
* enforced to ensure we stay within our maximum checkpoint size bounds.
* threshold, yet give us plenty of space for aggregation on large logs.
* In order to keep background CIL push efficient, we only need to ensure the
* CIL is large enough to maintain sufficient in-memory relogging to avoid
* repeated physical writes of frequently modified metadata. If we allow the CIL
* to grow to a substantial fraction of the log, then we may be pinning hundreds
* of megabytes of metadata in memory until the CIL flushes. This can cause
* issues when we are running low on memory - pinned memory cannot be reclaimed,
* and the CIL consumes a lot of memory. Hence we need to set an upper physical
* size limit for the CIL that limits the maximum amount of memory pinned by the
* CIL but does not limit performance by reducing relogging efficiency
* significantly.
*
* As such, the CIL push threshold ends up being the smaller of two thresholds:
* - a threshold large enough that it allows CIL to be pushed and progress to be
* made without excessive blocking of incoming transaction commits. This is
* defined to be 12.5% of the log space - half the 25% push threshold of the
* AIL.
* - small enough that it doesn't pin excessive amounts of memory but maintains
* close to peak relogging efficiency. This is defined to be 16x the iclog
* buffer window (32MB) as measurements have shown this to be roughly the
* point of diminishing performance increases under highly concurrent
* modification workloads.
*
* To prevent the CIL from overflowing upper commit size bounds, we introduce a
* new threshold at which we block committing transactions until the background
* CIL commit commences and switches to a new context. While this is not a hard
* limit, it forces the process committing a transaction to the CIL to block and
* yeild the CPU, giving the CIL push work a chance to be scheduled and start
* work. This prevents a process running lots of transactions from overfilling
* the CIL because it is not yielding the CPU. We set the blocking limit at
* twice the background push space threshold so we keep in line with the AIL
* push thresholds.
*
* Note: this is not a -hard- limit as blocking is applied after the transaction
* is inserted into the CIL and the push has been triggered. It is largely a
* throttling mechanism that allows the CIL push to be scheduled and run. A hard
* limit will be difficult to implement without introducing global serialisation
* in the CIL commit fast path, and it's not at all clear that we actually need
* such hard limits given the ~7 years we've run without a hard limit before
* finding the first situation where a checkpoint size overflow actually
* occurred. Hence the simple throttle, and an ASSERT check to tell us that
* we've overrun the max size.
*/
#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
#define XLOG_CIL_SPACE_LIMIT(log) \
min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
(XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
......@@ -439,14 +478,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int
xlog_write(
struct xlog *log,
struct xfs_log_vec *log_vector,
struct xlog_ticket *tic,
xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog,
uint flags);
int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector,
struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
struct xlog_in_core **commit_iclog, uint flags,
bool need_start_rec);
int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket,
struct xlog_in_core **iclog, xfs_lsn_t *lsn);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
......
......@@ -167,6 +167,7 @@ typedef struct xfs_mount {
struct xfs_kobj m_error_meta_kobj;
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
struct ratelimit_state m_flush_inodes_ratelimit;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
......
......@@ -121,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
int error = -EAGAIN;
xfs_dqlock(dqp);
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
xfs_dqunlock(dqp);
return -EAGAIN;
}
if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
......@@ -139,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
int error;
/*
* We don't care about getting disk errors here. We need
......@@ -149,6 +147,8 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
} else if (error == -EAGAIN) {
goto out_unlock;
}
xfs_dqflock(dqp);
}
......@@ -174,6 +174,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
out_unlock:
xfs_dqunlock(dqp);
return error;
}
/*
......
......@@ -528,6 +528,9 @@ xfs_flush_inodes(
{
struct super_block *sb = mp->m_super;
if (!__ratelimit(&mp->m_flush_inodes_ratelimit))
return;
if (down_read_trylock(&sb->s_umount)) {
sync_inodes_sb(sb);
up_read(&sb->s_umount);
......@@ -1366,6 +1369,17 @@ xfs_fc_fill_super(
if (error)
goto out_free_names;
/*
* Cap the number of invocations of xfs_flush_inodes to 16 for every
* quarter of a second. The magic numbers here were determined by
* observation neither to cause stalls in writeback when there are a
* lot of IO threads and the fs is near ENOSPC, nor cause any fstest
* regressions. YMMV.
*/
ratelimit_state_init(&mp->m_flush_inodes_ratelimit, HZ / 4, 16);
ratelimit_set_flags(&mp->m_flush_inodes_ratelimit,
RATELIMIT_MSG_ON_RELEASE);
error = xfs_init_mount_workqueues(mp);
if (error)
goto out_close_devices;
......@@ -1861,7 +1875,8 @@ xfs_init_zones(void)
xfs_ili_zone = kmem_cache_create("xfs_ili",
sizeof(struct xfs_inode_log_item), 0,
SLAB_MEM_SPREAD, NULL);
SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
......
......@@ -176,7 +176,6 @@ xfs_symlink(
return -ENAMETOOLONG;
ASSERT(pathlen > 0);
udqp = gdqp = NULL;
prid = xfs_get_initial_prid(dp);
/*
......
......@@ -1001,8 +1001,6 @@ DECLARE_EVENT_CLASS(xfs_loggrant_class,
DEFINE_EVENT(xfs_loggrant_class, name, \
TP_PROTO(struct xlog *log, struct xlog_ticket *tic), \
TP_ARGS(log, tic))
DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
......@@ -1011,12 +1009,13 @@ DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
DEFINE_LOGGRANT_EVENT(xfs_log_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_regrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ticket_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
......
......@@ -9,6 +9,7 @@
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_log_priv.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_extent_busy.h"
......@@ -150,8 +151,9 @@ xfs_trans_reserve(
uint blocks,
uint rtextents)
{
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
struct xfs_mount *mp = tp->t_mountp;
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
/* Mark this thread as being in a transaction */
current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
......@@ -162,7 +164,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd);
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
if (error != 0) {
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
return -ENOSPC;
......@@ -191,9 +193,9 @@ xfs_trans_reserve(
if (tp->t_ticket != NULL) {
ASSERT(resp->tr_logflags & XFS_TRANS_PERM_LOG_RES);
error = xfs_log_regrant(tp->t_mountp, tp->t_ticket);
error = xfs_log_regrant(mp, tp->t_ticket);
} else {
error = xfs_log_reserve(tp->t_mountp,
error = xfs_log_reserve(mp,
resp->tr_logres,
resp->tr_logcount,
&tp->t_ticket, XFS_TRANSACTION,
......@@ -213,7 +215,7 @@ xfs_trans_reserve(
* fail if the count would go below zero.
*/
if (rtextents > 0) {
error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents));
error = xfs_mod_frextents(mp, -((int64_t)rtextents));
if (error) {
error = -ENOSPC;
goto undo_log;
......@@ -229,7 +231,7 @@ xfs_trans_reserve(
*/
undo_log:
if (resp->tr_logres > 0) {
xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false);
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
tp->t_log_res = 0;
tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES;
......@@ -237,7 +239,7 @@ xfs_trans_reserve(
undo_blocks:
if (blocks > 0) {
xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd);
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
......@@ -1004,9 +1006,10 @@ __xfs_trans_commit(
*/
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant);
if (commit_lsn == -1 && !error)
error = -EIO;
if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log))
xfs_log_ticket_regrant(mp->m_log, tp->t_ticket);
else
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
......@@ -1065,7 +1068,7 @@ xfs_trans_cancel(
xfs_trans_unreserve_and_mod_dquots(tp);
if (tp->t_ticket) {
xfs_log_done(mp, tp->t_ticket, NULL, false);
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
......
......@@ -109,17 +109,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
static xfs_lsn_t
__xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
struct xfs_log_item *lip = xfs_ail_min(ailp);
if (lip)
return lip->li_lsn;
return 0;
}
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
xfs_lsn_t lsn = 0;
struct xfs_log_item *lip;
xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
lip = xfs_ail_min(ailp);
if (lip)
lsn = lip->li_lsn;
lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
......@@ -681,6 +689,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
void
xfs_ail_update_finish(
struct xfs_ail *ailp,
xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
{
struct xfs_mount *mp = ailp->ail_mount;
/* if the tail lsn hasn't changed, don't do updates or wakeups. */
if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
spin_unlock(&ailp->ail_lock);
return;
}
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
spin_unlock(&ailp->ail_lock);
xfs_log_space_wake(mp);
}
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
......@@ -712,7 +742,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
int mlip_changed = 0;
xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
......@@ -727,9 +757,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
if (mlip == lip && !tail_lsn)
tail_lsn = lip->li_lsn;
xfs_ail_delete(ailp, lip);
if (mlip == lip)
mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
......@@ -740,23 +771,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
xlog_assign_tail_lsn_locked(ailp->ail_mount);
spin_unlock(&ailp->ail_lock);
xfs_log_space_wake(ailp->ail_mount);
} else {
spin_unlock(&ailp->ail_lock);
}
xfs_ail_update_finish(ailp, tail_lsn);
}
bool
/*
* Delete one log item from the AIL.
*
* If this item was at the tail of the AIL, return the LSN of the log item so
* that we can use it to check if the LSN of the tail of the log has moved
* when finishing up the AIL delete process in xfs_ail_update_finish().
*/
xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
......@@ -764,7 +795,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
return mlip == lip;
if (mlip == lip)
return lsn;
return 0;
}
/**
......@@ -792,10 +825,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->ail_lock)
int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
bool mlip_changed;
xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
......@@ -808,17 +841,8 @@ xfs_trans_ail_delete(
return;
}
mlip_changed = xfs_ail_delete_one(ailp, lip);
if (mlip_changed) {
if (!XFS_FORCED_SHUTDOWN(mp))
xlog_assign_tail_lsn_locked(mp);
if (list_empty(&ailp->ail_head))
wake_up_all(&ailp->ail_empty);
}
spin_unlock(&ailp->ail_lock);
if (mlip_changed)
xfs_log_space_wake(ailp->ail_mount);
tail_lsn = xfs_ail_delete_one(ailp, lip);
xfs_ail_update_finish(ailp, tail_lsn);
}
int
......
......@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
__releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
int shutdown_type) __releases(ailp->ail_lock);
int shutdown_type);
static inline void
xfs_trans_ail_remove(
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment