Commit 39823d0f authored by Dave Chinner, committed by Darrick J. Wong

xfs: CIL work is serialised, not pipelined

Because we use a single work structure attached to the CIL rather
than the CIL context, we can only queue a single work item at a
time. This results in the CIL being single threaded and limits
performance when it becomes CPU bound.

The design of the CIL is that it is pipelined and multiple commits
can be running concurrently, but the way the work is currently
implemented means that it is not pipelining as it was intended. The
critical work to switch the CIL context can take a few milliseconds
to run, but the rest of the CIL context flush can take hundreds of
milliseconds to complete. The context switching is the serialisation
point of the CIL; once the context has been switched, the rest of the
context push can run asynchronously with all other context pushes.

Hence we can move the work to the CIL context so that we can run
multiple CIL pushes at the same time and spread the majority of
the work out over multiple CPUs. We can keep the per-cpu CIL commit
state on the CIL rather than the context, because the context is
pinned to the CIL until the switch is done and we aggregate and
drain the per-cpu state held on the CIL during the context switch.

However, because we no longer serialise the CIL work, we can have
effectively unlimited CIL pushes in progress. We don't want to do
this - not only does it create contention on the iclogs and the
state machine locks, we can run the log right out of space with
outstanding pushes. Instead, limit the work concurrency to 4
concurrent works being processed at a time. This is enough
concurrency to remove the CIL from being a CPU bound bottleneck but
not enough to create new contention points or unbound concurrency
issues.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>
parent 0020a190
...@@ -47,6 +47,34 @@ xlog_cil_ticket_alloc( ...@@ -47,6 +47,34 @@ xlog_cil_ticket_alloc(
return tic; return tic;
} }
/*
* Unavoidable forward declaration - xlog_cil_push_work() calls
* xlog_cil_ctx_alloc() itself.
*/
static void xlog_cil_push_work(struct work_struct *work);
/*
 * Allocate a new CIL context with kmem_zalloc(KM_NOFS) and set up the
 * members that need explicit initialisation: the committing and busy
 * extent list heads, and the push work item, which is bound to
 * xlog_cil_push_work().
 *
 * The returned context is not yet attached to any CIL.
 */
static struct xfs_cil_ctx *
xlog_cil_ctx_alloc(void)
{
	struct xfs_cil_ctx	*new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS);

	INIT_WORK(&new_ctx->push_work, xlog_cil_push_work);
	INIT_LIST_HEAD(&new_ctx->busy_extents);
	INIT_LIST_HEAD(&new_ctx->committing);

	return new_ctx;
}
/*
 * Install @ctx as the current context of @cil.
 *
 * Advances the CIL's current sequence number by one, stamps the new value
 * on @ctx, points the context back at its CIL and finally makes the CIL
 * point at the context.
 */
static void
xlog_cil_ctx_switch(
	struct xfs_cil		*cil,
	struct xfs_cil_ctx	*ctx)
{
	/* Bump the CIL sequence first; the context takes the bumped value. */
	cil->xc_current_sequence++;
	ctx->sequence = cil->xc_current_sequence;

	ctx->cil = cil;
	cil->xc_ctx = ctx;
}
/* /*
* After the first stage of log recovery is done, we know where the head and * After the first stage of log recovery is done, we know where the head and
* tail of the log are. We need this log initialisation done before we can * tail of the log are. We need this log initialisation done before we can
...@@ -824,11 +852,11 @@ static void ...@@ -824,11 +852,11 @@ static void
xlog_cil_push_work( xlog_cil_push_work(
struct work_struct *work) struct work_struct *work)
{ {
struct xfs_cil *cil = struct xfs_cil_ctx *ctx =
container_of(work, struct xfs_cil, xc_push_work); container_of(work, struct xfs_cil_ctx, push_work);
struct xfs_cil *cil = ctx->cil;
struct xlog *log = cil->xc_log; struct xlog *log = cil->xc_log;
struct xfs_log_vec *lv; struct xfs_log_vec *lv;
struct xfs_cil_ctx *ctx;
struct xfs_cil_ctx *new_ctx; struct xfs_cil_ctx *new_ctx;
struct xlog_ticket *tic; struct xlog_ticket *tic;
int num_iovecs; int num_iovecs;
...@@ -842,11 +870,10 @@ xlog_cil_push_work( ...@@ -842,11 +870,10 @@ xlog_cil_push_work(
DECLARE_COMPLETION_ONSTACK(bdev_flush); DECLARE_COMPLETION_ONSTACK(bdev_flush);
bool push_commit_stable; bool push_commit_stable;
new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); new_ctx = xlog_cil_ctx_alloc();
new_ctx->ticket = xlog_cil_ticket_alloc(log); new_ctx->ticket = xlog_cil_ticket_alloc(log);
down_write(&cil->xc_ctx_lock); down_write(&cil->xc_ctx_lock);
ctx = cil->xc_ctx;
spin_lock(&cil->xc_push_lock); spin_lock(&cil->xc_push_lock);
push_seq = cil->xc_push_seq; push_seq = cil->xc_push_seq;
...@@ -878,7 +905,7 @@ xlog_cil_push_work( ...@@ -878,7 +905,7 @@ xlog_cil_push_work(
/* check for a previously pushed sequence */ /* check for a previously pushed sequence */
if (push_seq < cil->xc_ctx->sequence) { if (push_seq < ctx->sequence) {
spin_unlock(&cil->xc_push_lock); spin_unlock(&cil->xc_push_lock);
goto out_skip; goto out_skip;
} }
...@@ -951,19 +978,7 @@ xlog_cil_push_work( ...@@ -951,19 +978,7 @@ xlog_cil_push_work(
} }
/* /*
* initialise the new context and attach it to the CIL. Then attach * Switch the contexts so we can drop the context lock and move out
* the current context to the CIL committing list so it can be found
* during log forces to extract the commit lsn of the sequence that
* needs to be forced.
*/
INIT_LIST_HEAD(&new_ctx->committing);
INIT_LIST_HEAD(&new_ctx->busy_extents);
new_ctx->sequence = ctx->sequence + 1;
new_ctx->cil = cil;
cil->xc_ctx = new_ctx;
/*
* The switch is now done, so we can drop the context lock and move out
* of a shared context. We can't just go straight to the commit record, * of a shared context. We can't just go straight to the commit record,
* though - we need to synchronise with previous and future commits so * though - we need to synchronise with previous and future commits so
* that the commit records are correctly ordered in the log to ensure * that the commit records are correctly ordered in the log to ensure
...@@ -988,7 +1003,7 @@ xlog_cil_push_work( ...@@ -988,7 +1003,7 @@ xlog_cil_push_work(
* dereferencing a freed context pointer. * dereferencing a freed context pointer.
*/ */
spin_lock(&cil->xc_push_lock); spin_lock(&cil->xc_push_lock);
cil->xc_current_sequence = new_ctx->sequence; xlog_cil_ctx_switch(cil, new_ctx);
spin_unlock(&cil->xc_push_lock); spin_unlock(&cil->xc_push_lock);
up_write(&cil->xc_ctx_lock); up_write(&cil->xc_ctx_lock);
...@@ -1136,7 +1151,7 @@ xlog_cil_push_background( ...@@ -1136,7 +1151,7 @@ xlog_cil_push_background(
spin_lock(&cil->xc_push_lock); spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) { if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence; cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work);
} }
/* /*
...@@ -1202,7 +1217,7 @@ xlog_cil_push_now( ...@@ -1202,7 +1217,7 @@ xlog_cil_push_now(
/* start on any pending background push to minimise wait time on it */ /* start on any pending background push to minimise wait time on it */
if (!async) if (!async)
flush_work(&cil->xc_push_work); flush_workqueue(log->l_mp->m_cil_workqueue);
/* /*
* If the CIL is empty or we've already pushed the sequence then * If the CIL is empty or we've already pushed the sequence then
...@@ -1216,7 +1231,7 @@ xlog_cil_push_now( ...@@ -1216,7 +1231,7 @@ xlog_cil_push_now(
cil->xc_push_seq = push_seq; cil->xc_push_seq = push_seq;
cil->xc_push_commit_stable = async; cil->xc_push_commit_stable = async;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); queue_work(log->l_mp->m_cil_workqueue, &cil->xc_ctx->push_work);
spin_unlock(&cil->xc_push_lock); spin_unlock(&cil->xc_push_lock);
} }
...@@ -1456,13 +1471,6 @@ xlog_cil_init( ...@@ -1456,13 +1471,6 @@ xlog_cil_init(
if (!cil) if (!cil)
return -ENOMEM; return -ENOMEM;
ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL);
if (!ctx) {
kmem_free(cil);
return -ENOMEM;
}
INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_cil);
INIT_LIST_HEAD(&cil->xc_committing); INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_cil_lock);
...@@ -1471,16 +1479,12 @@ xlog_cil_init( ...@@ -1471,16 +1479,12 @@ xlog_cil_init(
init_rwsem(&cil->xc_ctx_lock); init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_start_wait);
init_waitqueue_head(&cil->xc_commit_wait); init_waitqueue_head(&cil->xc_commit_wait);
INIT_LIST_HEAD(&ctx->committing);
INIT_LIST_HEAD(&ctx->busy_extents);
ctx->sequence = 1;
ctx->cil = cil;
cil->xc_ctx = ctx;
cil->xc_current_sequence = ctx->sequence;
cil->xc_log = log; cil->xc_log = log;
log->l_cilp = cil; log->l_cilp = cil;
ctx = xlog_cil_ctx_alloc();
xlog_cil_ctx_switch(cil, ctx);
return 0; return 0;
} }
......
...@@ -249,6 +249,7 @@ struct xfs_cil_ctx { ...@@ -249,6 +249,7 @@ struct xfs_cil_ctx {
struct list_head iclog_entry; struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */ struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work; struct work_struct discard_endio_work;
struct work_struct push_work;
}; };
/* /*
...@@ -282,7 +283,6 @@ struct xfs_cil { ...@@ -282,7 +283,6 @@ struct xfs_cil {
wait_queue_head_t xc_commit_wait; wait_queue_head_t xc_commit_wait;
wait_queue_head_t xc_start_wait; wait_queue_head_t xc_start_wait;
xfs_csn_t xc_current_sequence; xfs_csn_t xc_current_sequence;
struct work_struct xc_push_work;
wait_queue_head_t xc_push_wait; /* background push throttle */ wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
......
...@@ -518,9 +518,13 @@ xfs_init_mount_workqueues( ...@@ -518,9 +518,13 @@ xfs_init_mount_workqueues(
if (!mp->m_unwritten_workqueue) if (!mp->m_unwritten_workqueue)
goto out_destroy_buf; goto out_destroy_buf;
/*
* Limit the CIL pipeline depth to 4 concurrent works to bound the
* concurrency the log spinlocks will be exposed to.
*/
mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s",
XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND),
0, mp->m_super->s_id); 4, mp->m_super->s_id);
if (!mp->m_cil_workqueue) if (!mp->m_cil_workqueue)
goto out_destroy_unwritten; goto out_destroy_unwritten;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment