Commit 4e69f490 authored by Chandan Babu R

Merge tag 'xfs-fstrim-busy-tag' of...

Merge tag 'xfs-fstrim-busy-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs into xfs-6.6-fixes

xfs: reduce AGF hold times during fstrim operations

A recent log space overflow and recovery failure was root caused to
a long running truncate blocking on the AGF and ending up pinning
the tail of the log. The filesystem then hung, the machine was
rebooted, and log recovery then refused to run because there wasn't
enough space in the log for EFI transaction reservation.

The reason the long running truncate got blocked on the AGF for so
long was that an fstrim was being run. The underlying block device
was large and very slow (10TB ceph rbd volume) and so discarding all
the free space in the AG took a really long time.

The current fstrim implementation holds the AGF across the entire
operation - both the free space scan and the issuing of all the
discards. The discards are synchronous and single depth, so if there
are millions of free spaces, we hold the AGF lock across millions of
discard operations.

It doesn't really need to be said that this is a Bad Thing.

This series reworks the fstrim discard path to use the same
mechanisms as online discard. This allows discards to be issued
asynchronously without holding the AGF locked, enabling higher
discard queue depths (much faster on fast devices) and only
requiring the AGF lock to be held whilst we are scanning free space.

To do this, we make use of busy extents - we lock the AGF, mark all
the extents we want to discard as "busy under discard" so that
nothing will be allowed to allocate them, and then drop the AGF
lock. We then issue discards on the gathered busy extents and on
discard completion remove them from the busy list.

This results in AGF lock hold times for fstrim dropping to a few
milliseconds for each batch of free extents we scan, and so the
hours-long hold times that can currently occur on large, slow, badly
fragmented devices no longer occur.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'xfs-fstrim-busy-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: abort fstrim if kernel is suspending
  xfs: reduce AGF hold times during fstrim operations
  xfs: move log discard work to xfs_discard.c
parents 8a749fd1 e78a40b8
This diff is collapsed.
......@@ -3,8 +3,10 @@
#define XFS_DISCARD_H 1
struct fstrim_range;
struct list_head;
struct xfs_mount;
struct xfs_busy_extents;
extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy);
int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim);
#endif /* XFS_DISCARD_H */
......@@ -19,13 +19,13 @@
#include "xfs_log.h"
#include "xfs_ag.h"
void
xfs_extent_busy_insert(
struct xfs_trans *tp,
static void
xfs_extent_busy_insert_list(
struct xfs_perag *pag,
xfs_agblock_t bno,
xfs_extlen_t len,
unsigned int flags)
unsigned int flags,
struct list_head *busy_list)
{
struct xfs_extent_busy *new;
struct xfs_extent_busy *busyp;
......@@ -40,7 +40,7 @@ xfs_extent_busy_insert(
new->flags = flags;
/* trace before insert to be able to see failed inserts */
trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len);
trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);
spin_lock(&pag->pagb_lock);
rbp = &pag->pagb_tree.rb_node;
......@@ -62,10 +62,32 @@ xfs_extent_busy_insert(
rb_link_node(&new->rb_node, parent, rbp);
rb_insert_color(&new->rb_node, &pag->pagb_tree);
list_add(&new->list, &tp->t_busy);
list_add(&new->list, busy_list);
spin_unlock(&pag->pagb_lock);
}
/*
 * Mark the extent described by @bno/@len in @pag as busy on behalf of the
 * transaction @tp. The new busy extent record is tracked on the transaction's
 * own busy list (tp->t_busy).
 */
void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	struct list_head	*trans_busy = &tp->t_busy;

	xfs_extent_busy_insert_list(pag, bno, len, flags, trans_busy);
}
/*
 * Insert a busy extent that is not attached to a transaction. The extent is
 * always inserted with the XFS_EXTENT_BUSY_DISCARDED flag set and is tracked
 * on the caller-provided @busy_list.
 */
void
xfs_extent_busy_insert_discard(
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	struct list_head	*busy_list)
{
	const unsigned int	discard_flags = XFS_EXTENT_BUSY_DISCARDED;

	xfs_extent_busy_insert_list(pag, bno, len, discard_flags, busy_list);
}
/*
* Search for a busy extent within the range of the extent we are about to
* allocate. You need to be holding the busy extent tree lock when calling
......
......@@ -16,9 +16,6 @@ struct xfs_alloc_arg;
/*
* Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that
* have been freed but whose transactions aren't committed to disk yet.
*
* Note that we use the transaction ID to record the transaction, not the
* transaction structure itself. See xfs_extent_busy_insert() for details.
*/
struct xfs_extent_busy {
struct rb_node rb_node; /* ag by-bno indexed search tree */
......@@ -31,10 +28,31 @@ struct xfs_extent_busy {
#define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */
};
/*
 * List used to track groups of related busy extents all the way through
 * to discard completion.
 *
 * The structure is embedded in a larger object (e.g. struct xfs_cil_ctx
 * embeds one as its busy_extents member) and handed to
 * xfs_discard_extents() once the list is populated.
 */
struct xfs_busy_extents {
/* Filesystem the busy extents belong to; set by the submitter. */
struct xfs_mount *mount;
/* Head of the list of busy extent records in this group. */
struct list_head extent_list;
/*
 * Work item; presumably used to defer discard completion processing to
 * workqueue context - confirm against xfs_discard.c.
 */
struct work_struct endio_work;
/*
 * Owner is the object containing the struct xfs_busy_extents to free
 * once the busy extents have been processed. If only the
 * xfs_busy_extents object needs freeing, then point this at itself.
 */
void *owner;
};
void
xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag,
xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags);
void
xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno,
xfs_extlen_t len, struct list_head *busy_list);
void
xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list,
bool do_discard);
......
......@@ -16,8 +16,7 @@
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_trace.h"
struct workqueue_struct *xfs_discard_wq;
#include "xfs_discard.h"
/*
* Allocate a new ticket. Failing to get a new ticket makes it really hard to
......@@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void)
ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
INIT_LIST_HEAD(&ctx->log_items);
INIT_LIST_HEAD(&ctx->lv_chain);
INIT_WORK(&ctx->push_work, xlog_cil_push_work);
......@@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate(
if (!list_empty(&cilpcp->busy_extents)) {
list_splice_init(&cilpcp->busy_extents,
&ctx->busy_extents);
&ctx->busy_extents.extent_list);
}
if (!list_empty(&cilpcp->log_items))
list_splice_init(&cilpcp->log_items, &ctx->log_items);
......@@ -708,76 +707,6 @@ xlog_cil_free_logvec(
}
}
/*
 * Deferred discard completion. Recover the CIL context that embeds @work,
 * clear its busy extents without requesting any further discards, and then
 * free the context.
 */
static void
xlog_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_cil_ctx	*cil_ctx;

	cil_ctx = container_of(work, struct xfs_cil_ctx, discard_endio_work);
	xfs_extent_busy_clear(cil_ctx->cil->xc_log->l_mp,
			&cil_ctx->busy_extents, false);
	kmem_free(cil_ctx);
}
/*
 * Bio completion handler for the discard chain submitted by
 * xlog_discard_busy_extents().
 *
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock. Note that we need an unbounded workqueue, otherwise we might
 * get the execution delayed up to 30 seconds for weird reasons.
 */
static void
xlog_discard_endio(
struct bio *bio)
{
/* The CIL context was stored in bi_private when the bio was submitted. */
struct xfs_cil_ctx *ctx = bio->bi_private;
INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work);
queue_work(xfs_discard_wq, &ctx->discard_endio_work);
/* Release our reference to the completed bio. */
bio_put(bio);
}
/*
 * Issue discards for every busy extent attached to the CIL context @ctx.
 *
 * Each extent on ctx->busy_extents is passed to __blkdev_issue_discard(),
 * which accumulates the requests through @bio. The final bio is given @ctx as
 * its private data and xlog_discard_endio() as its completion handler before
 * being submitted. If no bio was built at all, the completion work is run
 * directly from here.
 *
 * An error other than -EOPNOTSUPP is logged and stops further extents from
 * being issued; whatever bio has been built up to that point is still
 * submitted.
 */
static void
xlog_discard_busy_extents(
struct xfs_mount *mp,
struct xfs_cil_ctx *ctx)
{
struct list_head *list = &ctx->busy_extents;
struct xfs_extent_busy *busyp;
struct bio *bio = NULL;
struct blk_plug plug;
int error = 0;
ASSERT(xfs_has_discard(mp));
/* Plug the block layer while the discard requests are being built. */
blk_start_plug(&plug);
list_for_each_entry(busyp, list, list) {
trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
busyp->length);
error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
XFS_FSB_TO_BB(mp, busyp->length),
GFP_NOFS, &bio);
/* -EOPNOTSUPP is tolerated silently; anything else aborts the walk. */
if (error && error != -EOPNOTSUPP) {
xfs_info(mp,
"discard failed for extent [0x%llx,%u], error %d",
(unsigned long long)busyp->bno,
busyp->length,
error);
break;
}
}
if (bio) {
/* Completion of the last bio triggers the busy extent cleanup. */
bio->bi_private = ctx;
bio->bi_end_io = xlog_discard_endio;
submit_bio(bio);
} else {
/* Nothing was issued: do the cleanup (which frees ctx) right now. */
xlog_discard_endio_work(&ctx->discard_endio_work);
}
blk_finish_plug(&plug);
}
/*
* Mark all items committed and clear busy extents. We free the log vector
* chains in a separate pass so that we unpin the log items as quickly as
......@@ -807,8 +736,8 @@ xlog_cil_committed(
xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain,
ctx->start_lsn, abort);
xfs_extent_busy_sort(&ctx->busy_extents);
xfs_extent_busy_clear(mp, &ctx->busy_extents,
xfs_extent_busy_sort(&ctx->busy_extents.extent_list);
xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list,
xfs_has_discard(mp) && !abort);
spin_lock(&ctx->cil->xc_push_lock);
......@@ -817,10 +746,14 @@ xlog_cil_committed(
xlog_cil_free_logvec(&ctx->lv_chain);
if (!list_empty(&ctx->busy_extents))
xlog_discard_busy_extents(mp, ctx);
else
kmem_free(ctx);
if (!list_empty(&ctx->busy_extents.extent_list)) {
ctx->busy_extents.mount = mp;
ctx->busy_extents.owner = ctx;
xfs_discard_extents(mp, &ctx->busy_extents);
return;
}
kmem_free(ctx);
}
void
......
......@@ -6,6 +6,8 @@
#ifndef __XFS_LOG_PRIV_H__
#define __XFS_LOG_PRIV_H__
#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */
struct xfs_buf;
struct xlog;
struct xlog_ticket;
......@@ -223,12 +225,11 @@ struct xfs_cil_ctx {
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
atomic_t space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_busy_extents busy_extents;
struct list_head log_items; /* log items in chkpt */
struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment