Commit dd81dc05 authored by Darrick J. Wong

Merge tag 'xfs-cil-scale-5.20' of...

Merge tag 'xfs-cil-scale-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs into xfs-5.20-mergeA

xfs: improve CIL scalability

This series aims to improve the scalability of XFS transaction
commits on large CPU count machines. My 32p machine hits contention
limits in xlog_cil_commit() at about 700,000 transaction commits a
second. It hits this at 16 thread workloads, and 32 thread
workloads go no faster and just burn CPU on the CIL spinlocks.

This patchset gets rid of spinlocks and global serialisation points
in the xlog_cil_commit() path. It does this by moving to a
combination of per-cpu counters, unordered per-cpu lists and
post-ordered per-cpu lists.

This results in transaction commit rates exceeding 1.4 million
commits/s under certain unlink workloads, and while the log lock
contention is largely gone there is still significant lock
contention in the VFS (dentry cache, inode cache and security layers)
at >600,000 transactions/s that still limits scalability.

The changes to the CIL accounting and behaviour, combined with the
structural changes to xlog_write() in prior patchsets make the
per-cpu restructuring possible and sane. This allows us to move to
precalculated reservation requirements that allow for reservation
stealing to be accounted across multiple CPUs accurately.

That is, instead of trying to account for continuation log opheaders
on a "growth" basis, we pre-calculate how many iclogs we'll need to
write out a maximally sized CIL checkpoint and steal that reserved
space one commit at a time until the CIL has a full
reservation. If we ever run a commit when we are already at the hard
limit (because post-throttling) we simply take an extra reservation
from each commit that is run when over the limit. Hence we don't
need to do space usage math in the fast path and so never need to
sum the per-cpu counters in this fast path.

Similarly, per-cpu lists have the problem of ordering - we can't
remove an item from a per-cpu list if we want to move it forward in
the CIL. We solve this problem by using an atomic counter to give
every commit a sequence number that is copied into the log items in
that transaction. Hence relogging items just overwrites the sequence
number in the log item, and does not move it in the per-cpu lists.
Once we reaggregate the per-cpu lists back into a single list in the
CIL push work, we can run it through list-sort() and reorder it back
into a globally ordered list. This costs a bit of CPU time, but now
that the CIL can run multiple works and pipelines properly, this is
not a limiting factor for performance. It does increase fsync
latency when the CIL is full, but workloads issuing large numbers of
fsync()s or sync transactions end up with very small CILs and so the
latency impact of sorting is not measurable for such workloads.

Overall, this pushes the transaction commit bottleneck out to the
lockless reservation grant head updates. These atomic updates don't
start to be a limiting factor until > 1.5 million transactions/s are
being run, at which point the accounting functions start to show up
in profiles as the highest CPU users. Still, this series doubles
transaction throughput without increasing CPU usage before we get
to that cacheline contention breakdown point...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Darrick J. Wong <djwong@kernel.org>

* tag 'xfs-cil-scale-5.20' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: expanding delayed logging design with background material
  xfs: xlog_sync() manually adjusts grant head space
  xfs: avoid cil push lock if possible
  xfs: move CIL ordering to the logvec chain
  xfs: convert log vector chain to use list heads
  xfs: convert CIL to unordered per cpu lists
  xfs: Add order IDs to log items in CIL
  xfs: convert CIL busy extents to per-cpu
  xfs: track CIL ticket reservation in percpu structure
  xfs: implement percpu cil space used calculation
  xfs: introduce per-cpu CIL tracking structure
  xfs: rework per-iclog header CIL reservation
  xfs: lift init CIL reservation out of xc_cil_lock
  xfs: use the CIL space used counter for emptiness checks
parents 88084a3d 51a117ed
......@@ -57,7 +57,8 @@ xlog_grant_push_ail(
STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog);
struct xlog_in_core *iclog,
struct xlog_ticket *ticket);
#if defined(DEBUG)
STATIC void
xlog_verify_grant_tail(
......@@ -567,7 +568,8 @@ xlog_state_shutdown_callbacks(
int
xlog_state_release_iclog(
struct xlog *log,
struct xlog_in_core *iclog)
struct xlog_in_core *iclog,
struct xlog_ticket *ticket)
{
xfs_lsn_t tail_lsn;
bool last_ref;
......@@ -614,7 +616,7 @@ xlog_state_release_iclog(
trace_xlog_iclog_syncing(iclog, _RET_IP_);
spin_unlock(&log->l_icloglock);
xlog_sync(log, iclog);
xlog_sync(log, iclog, ticket);
spin_lock(&log->l_icloglock);
return 0;
}
......@@ -881,7 +883,7 @@ xlog_force_iclog(
iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
if (iclog->ic_state == XLOG_STATE_ACTIVE)
xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
return xlog_state_release_iclog(iclog->ic_log, iclog);
return xlog_state_release_iclog(iclog->ic_log, iclog, NULL);
}
/*
......@@ -944,6 +946,8 @@ xlog_write_unmount_record(
.lv_niovecs = 1,
.lv_iovecp = &reg,
};
LIST_HEAD(lv_chain);
list_add(&vec.lv_list, &lv_chain);
BUILD_BUG_ON((sizeof(struct xlog_op_header) +
sizeof(struct xfs_unmount_log_format)) !=
......@@ -952,7 +956,7 @@ xlog_write_unmount_record(
/* account for space used by record data */
ticket->t_curr_res -= sizeof(unmount_rec);
return xlog_write(log, NULL, &vec, ticket, reg.i_len);
return xlog_write(log, NULL, &lv_chain, ticket, reg.i_len);
}
/*
......@@ -2025,7 +2029,8 @@ xlog_calc_iclog_size(
STATIC void
xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
struct xlog_in_core *iclog,
struct xlog_ticket *ticket)
{
unsigned int count; /* byte count of bwrite */
unsigned int roundoff; /* roundoff to BB or stripe */
......@@ -2037,9 +2042,17 @@ xlog_sync(
count = xlog_calc_iclog_size(log, iclog, &roundoff);
/* move grant heads by roundoff in sync */
/*
* If we have a ticket, account for the roundoff via the ticket
* reservation to avoid touching the hot grant heads needlessly.
* Otherwise, we have to move grant heads directly.
*/
if (ticket) {
ticket->t_curr_res -= roundoff;
} else {
xlog_grant_add_space(log, &log->l_reserve_head.grant, roundoff);
xlog_grant_add_space(log, &log->l_write_head.grant, roundoff);
}
/* put cycle number in every block */
xlog_pack_data(log, iclog, roundoff);
......@@ -2275,7 +2288,7 @@ xlog_write_get_more_iclog_space(
spin_lock(&log->l_icloglock);
ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC);
xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt);
error = xlog_state_release_iclog(log, iclog);
error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
......@@ -2471,13 +2484,13 @@ int
xlog_write(
struct xlog *log,
struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector,
struct list_head *lv_chain,
struct xlog_ticket *ticket,
uint32_t len)
{
struct xlog_in_core *iclog = NULL;
struct xfs_log_vec *lv = log_vector;
struct xfs_log_vec *lv;
uint32_t record_cnt = 0;
uint32_t data_cnt = 0;
int error = 0;
......@@ -2505,7 +2518,7 @@ xlog_write(
if (ctx)
xlog_cil_set_ctx_write_state(ctx, iclog);
while (lv) {
list_for_each_entry(lv, lv_chain, lv_list) {
/*
* If the entire log vec does not fit in the iclog, punt it to
* the partial copy loop which can handle this case.
......@@ -2526,7 +2539,6 @@ xlog_write(
xlog_write_full(lv, ticket, iclog, &log_offset,
&len, &record_cnt, &data_cnt);
}
lv = lv->lv_next;
}
ASSERT(len == 0);
......@@ -2538,7 +2550,7 @@ xlog_write(
*/
spin_lock(&log->l_icloglock);
xlog_state_finish_copy(log, iclog, record_cnt, 0);
error = xlog_state_release_iclog(log, iclog);
error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
return error;
......@@ -2958,7 +2970,7 @@ xlog_state_get_iclog_space(
* reference to the iclog.
*/
if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
error = xlog_state_release_iclog(log, iclog);
error = xlog_state_release_iclog(log, iclog, ticket);
spin_unlock(&log->l_icloglock);
if (error)
return error;
......@@ -3406,7 +3418,8 @@ xfs_log_ticket_get(
static int
xlog_calc_unit_res(
struct xlog *log,
int unit_bytes)
int unit_bytes,
int *niclogs)
{
int iclog_space;
uint num_headers;
......@@ -3486,6 +3499,8 @@ xlog_calc_unit_res(
/* roundoff padding for transaction data and one for commit record */
unit_bytes += 2 * log->l_iclog_roundoff;
if (niclogs)
*niclogs = num_headers;
return unit_bytes;
}
......@@ -3494,7 +3509,7 @@ xfs_log_calc_unit_res(
struct xfs_mount *mp,
int unit_bytes)
{
return xlog_calc_unit_res(mp->m_log, unit_bytes);
return xlog_calc_unit_res(mp->m_log, unit_bytes, NULL);
}
/*
......@@ -3512,7 +3527,7 @@ xlog_ticket_alloc(
tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
unit_res = xlog_calc_unit_res(log, unit_bytes);
unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
atomic_set(&tic->t_ref, 1);
tic->t_task = current;
......
......@@ -9,7 +9,8 @@
struct xfs_cil_ctx;
struct xfs_log_vec {
struct xfs_log_vec *lv_next; /* next lv in build list */
struct list_head lv_list; /* CIL lv chain ptrs */
uint32_t lv_order_id; /* chain ordering info */
int lv_niovecs; /* number of iovecs in lv */
struct xfs_log_iovec *lv_iovecp; /* iovec array */
struct xfs_log_item *lv_item; /* owner */
......
This diff is collapsed.
......@@ -145,13 +145,14 @@ enum xlog_iclog_state {
typedef struct xlog_ticket {
struct list_head t_queue; /* reserve/write queue */
struct task_struct *t_task; /* task that owns this ticket */
xlog_tid_t t_tid; /* transaction identifier : 4 */
atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
int t_unit_res; /* unit reservation in bytes : 4 */
char t_ocnt; /* original count : 1 */
char t_cnt; /* current count : 1 */
uint8_t t_flags; /* properties of reservation : 1 */
xlog_tid_t t_tid; /* transaction identifier */
atomic_t t_ref; /* ticket reference count */
int t_curr_res; /* current reservation */
int t_unit_res; /* unit reservation */
char t_ocnt; /* original unit count */
char t_cnt; /* current unit count */
uint8_t t_flags; /* properties of reservation */
int t_iclog_hdrs; /* iclog hdrs in t_curr_res */
} xlog_ticket_t;
/*
......@@ -221,13 +222,25 @@ struct xfs_cil_ctx {
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_in_core *commit_iclog;
struct xlog_ticket *ticket; /* chkpt ticket */
int space_used; /* aggregate size of regions */
atomic_t space_used; /* aggregate size of regions */
struct list_head busy_extents; /* busy extents in chkpt */
struct xfs_log_vec *lv_chain; /* logvecs being pushed */
struct list_head log_items; /* log items in chkpt */
struct list_head lv_chain; /* logvecs being pushed */
struct list_head iclog_entry;
struct list_head committing; /* ctx committing list */
struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
};
/*
* Per-cpu CIL tracking items
*
* Groups the CIL state that is tracked per CPU rather than globally:
* two counters and two list heads.
* NOTE(review): presumably one instance per CPU, reached through the
* percpu xc_pcp pointer in struct xfs_cil - confirm at the allocation site.
*/
struct xlog_cil_pcp {
/* presumably this CPU's contribution to CIL space used - TODO confirm */
int32_t space_used;
/* presumably log reservation tracked on this CPU - TODO confirm */
uint32_t space_reserved;
/* list head; presumably busy extents queued from this CPU - confirm */
struct list_head busy_extents;
/* list head; presumably log items committed from this CPU - confirm */
struct list_head log_items;
};
/*
......@@ -248,8 +261,8 @@ struct xfs_cil_ctx {
*/
struct xfs_cil {
struct xlog *xc_log;
struct list_head xc_cil;
spinlock_t xc_cil_lock;
unsigned long xc_flags;
atomic_t xc_iclog_hdrs;
struct workqueue_struct *xc_push_wq;
struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp;
......@@ -263,8 +276,17 @@ struct xfs_cil {
wait_queue_head_t xc_start_wait;
xfs_csn_t xc_current_sequence;
wait_queue_head_t xc_push_wait; /* background push throttle */
void __percpu *xc_pcp; /* percpu CIL structures */
#ifdef CONFIG_HOTPLUG_CPU
struct list_head xc_pcp_list;
#endif
} ____cacheline_aligned_in_smp;
/* xc_flags bit values */
#define XLOG_CIL_EMPTY 1
#define XLOG_CIL_PCP_SPACE 2
/*
* The amount of log space we allow the CIL to aggregate is difficult to size.
* Whatever we choose, we have to make sure we can get a reservation for the
......@@ -486,14 +508,15 @@ struct xlog_ticket *xlog_ticket_alloc(struct xlog *log, int unit_bytes,
void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
void xlog_print_trans(struct xfs_trans *);
int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx,
struct xfs_log_vec *log_vector, struct xlog_ticket *tic,
struct list_head *lv_chain, struct xlog_ticket *tic,
uint32_t len);
void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket);
void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket);
void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
int eventual_size);
int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
struct xlog_ticket *ticket);
/*
* When we crack an atomic LSN, we sample it first so that the value will not
......@@ -682,4 +705,9 @@ xlog_kvmalloc(
return p;
}
/*
* CIL CPU dead notifier
*/
void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
#endif /* __XFS_LOG_PRIV_H__ */
......@@ -2213,6 +2213,7 @@ xfs_cpu_dead(
list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
spin_unlock(&xfs_mount_list_lock);
xfs_inodegc_cpu_dead(mp, cpu);
xlog_cil_pcp_dead(mp->m_log, cpu);
spin_lock(&xfs_mount_list_lock);
}
spin_unlock(&xfs_mount_list_lock);
......
......@@ -760,7 +760,7 @@ xfs_log_item_batch_insert(
void
xfs_trans_committed_bulk(
struct xfs_ail *ailp,
struct xfs_log_vec *log_vector,
struct list_head *lv_chain,
xfs_lsn_t commit_lsn,
bool aborted)
{
......@@ -775,7 +775,7 @@ xfs_trans_committed_bulk(
spin_unlock(&ailp->ail_lock);
/* unpin all the log items */
for (lv = log_vector; lv; lv = lv->lv_next ) {
list_for_each_entry(lv, lv_chain, lv_list) {
struct xfs_log_item *lip = lv->lv_item;
xfs_lsn_t item_lsn;
......
......@@ -45,6 +45,7 @@ struct xfs_log_item {
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
xfs_csn_t li_seq; /* CIL commit seq */
uint32_t li_order_id; /* CIL commit order */
};
/*
......
......@@ -19,7 +19,8 @@ void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
void xfs_trans_del_item(struct xfs_log_item *);
void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv,
void xfs_trans_committed_bulk(struct xfs_ail *ailp,
struct list_head *lv_chain,
xfs_lsn_t commit_lsn, bool aborted);
/*
* AIL traversal cursor.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment