Commit 8d280b98, authored by David Chinner and committed by Nathan Scott

[XFS] On machines with more than 8 cpus, when running parallel I/O

threads, the incore superblock lock becomes the limiting factor for
buffered write throughput. Make the contended fields in the incore
superblock use per-cpu counters so that there is no global lock to limit
scalability.

SGI-PV: 946630
SGI-Modid: xfs-linux-melb:xfs-kern:25106a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Nathan Scott <nathans@sgi.com>
parent 9f4cbecd
......@@ -100,6 +100,11 @@
*/
#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
#define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */
/*
 * Per-cpu superblock counters are only enabled for SMP builds.
 *
 * Test with #ifdef rather than #if: "#if CONFIG_SMP" silently evaluates an
 * undefined macro as 0 and warns under -Wundef when the symbol is absent.
 * NOTE(review): assumes CONFIG_SMP is either defined or not defined at all
 * (never defined as 0) - confirm against the kernel configuration headers.
 */
#ifdef CONFIG_SMP
#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
#else
#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
#endif
/*
* State flag for unwritten extent buffers.
......
......@@ -462,6 +462,7 @@ xfs_fs_counts(
{
unsigned long s;
xfs_icsb_sync_counters_lazy(mp);
s = XFS_SB_LOCK(mp);
cnt->freedata = mp->m_sb.sb_fdblocks;
cnt->freertx = mp->m_sb.sb_frextents;
......
......@@ -51,6 +51,26 @@ STATIC int xfs_uuid_mount(xfs_mount_t *);
STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
STATIC void xfs_unmountfs_wait(xfs_mount_t *);
/*
 * Forward declarations for the file-local per-cpu superblock counter helpers
 * that are defined further down in this file.
 *
 * When HAVE_PERCPU_SB is not defined, every helper collapses to an empty
 * statement.  The stubs for the int-returning helpers therefore cannot be used
 * as expressions; the callers in this file that consume their return value sit
 * inside their own "#ifdef HAVE_PERCPU_SB" sections.
 */
#ifdef HAVE_PERCPU_SB
STATIC void xfs_icsb_destroy_counters(xfs_mount_t *);
STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
STATIC void xfs_icsb_sync_counters(xfs_mount_t *);
STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
int, int);
STATIC int xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
int, int);
#else
/* no per-cpu counters: all helpers are no-ops */
#define xfs_icsb_destroy_counters(mp) do { } while (0)
#define xfs_icsb_balance_counter(mp, a, b) do { } while (0)
#define xfs_icsb_sync_counters(mp) do { } while (0)
#define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0)
#define xfs_icsb_modify_counters_locked(mp, a, b, c) do { } while (0)
#endif
static const struct {
short offset;
short type; /* 0 = integer
......@@ -113,7 +133,11 @@ xfs_mount_init(void)
{
xfs_mount_t *mp;
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
if (xfs_icsb_init_counters(mp)) {
mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
}
AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
spinlock_init(&mp->m_sb_lock, "xfs_sb");
......@@ -177,6 +201,7 @@ xfs_mount_free(
VFS_REMOVEBHV(vfsp, &mp->m_bhv);
}
xfs_icsb_destroy_counters(mp);
kmem_free(mp, sizeof(xfs_mount_t));
}
......@@ -527,6 +552,10 @@ xfs_readsb(xfs_mount_t *mp)
ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
}
xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
mp->m_sb_bp = bp;
xfs_buf_relse(bp);
ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
......@@ -1154,6 +1183,9 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
sbp = xfs_getsb(mp, 0);
if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
XFS_FORCED_SHUTDOWN(mp))) {
xfs_icsb_sync_counters(mp);
/*
* mark shared-readonly if desired
*/
......@@ -1227,7 +1259,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
xfs_trans_log_buf(tp, bp, first, last);
}
/*
* xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
* a delta to a specified field in the in-core superblock. Simply
......@@ -1237,7 +1268,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
*
* The SB_LOCK must be held when this routine is called.
*/
STATIC int
int
xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
int delta, int rsvd)
{
......@@ -1406,9 +1437,26 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_sb_field_t field, int delta, int rsvd)
unsigned long s;
int status;
/* check for per-cpu counters */
switch (field) {
#ifdef HAVE_PERCPU_SB
case XFS_SBS_ICOUNT:
case XFS_SBS_IFREE:
case XFS_SBS_FDBLOCKS:
if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
status = xfs_icsb_modify_counters(mp, field,
delta, rsvd);
break;
}
/* FALLTHROUGH */
#endif
default:
s = XFS_SB_LOCK(mp);
status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
XFS_SB_UNLOCK(mp, s);
break;
}
return status;
}
......@@ -1445,8 +1493,26 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
* from the loop so we'll fall into the undo loop
* below.
*/
status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
case XFS_SBS_ICOUNT:
case XFS_SBS_IFREE:
case XFS_SBS_FDBLOCKS:
if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
status = xfs_icsb_modify_counters_locked(mp,
msbp->msb_field,
msbp->msb_delta, rsvd);
break;
}
/* FALLTHROUGH */
#endif
default:
status = xfs_mod_incore_sb_unlocked(mp,
msbp->msb_field,
msbp->msb_delta, rsvd);
break;
}
if (status != 0) {
break;
}
......@@ -1463,8 +1529,28 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp, xfs_mod_sb_t *msb, uint nmsb, int rsvd)
if (status != 0) {
msbp--;
while (msbp >= msb) {
switch (msbp->msb_field) {
#ifdef HAVE_PERCPU_SB
case XFS_SBS_ICOUNT:
case XFS_SBS_IFREE:
case XFS_SBS_FDBLOCKS:
if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
status =
xfs_icsb_modify_counters_locked(mp,
msbp->msb_field,
-(msbp->msb_delta),
rsvd);
break;
}
/* FALLTHROUGH */
#endif
default:
status = xfs_mod_incore_sb_unlocked(mp,
msbp->msb_field, -(msbp->msb_delta), rsvd);
msbp->msb_field,
-(msbp->msb_delta),
rsvd);
break;
}
ASSERT(status == 0);
msbp--;
}
......@@ -1577,3 +1663,445 @@ xfs_mount_log_sbunit(
xfs_mod_sb(tp, fields);
xfs_trans_commit(tp, 0, NULL);
}
#ifdef HAVE_PERCPU_SB
/*
* Per-cpu incore superblock counters
*
* Simple concept, difficult implementation
*
* Basically, replace the incore superblock counters with a distributed per cpu
* counter for contended fields (e.g. free block count).
*
* Difficulties arise in that the incore sb is used for ENOSPC checking, and
* hence needs to be accurately read when we are running low on space. Hence
* there is a method to enable and disable the per-cpu counters based on how
* much "stuff" is available in them.
*
* Basically, a counter is enabled if there is enough free resource to justify
* running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
* ENOSPC), then we disable the counters to synchronise all callers and
* re-distribute the available resources.
*
* If, once we redistributed the available resources, we still get a failure,
* we disable the per-cpu counter and go through the slow path.
*
* The slow path is the current xfs_mod_incore_sb() function. This means that
* when we disable a per-cpu counter, we need to drain its resources back to
* the global superblock. We do this after disabling the counter to prevent
* more threads from queueing up on the counter.
*
* Essentially, this means that we still need a lock in the fast path to enable
* synchronisation between the global counters and the per-cpu counters. This
* is not a problem because the lock will be local to a CPU almost all the time
* and have little contention except when we get to ENOSPC conditions.
*
* Basically, this lock becomes a barrier that enables us to lock out the fast
* path while we do things like enabling and disabling counters and
* synchronising the counters.
*
* Locking rules:
*
* 1. XFS_SB_LOCK() before picking up per-cpu locks
* 2. per-cpu locks always picked up via for_each_online_cpu() order
* 3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
* 4. modifying per-cpu counters requires holding per-cpu lock
* 5. modifying global counters requires holding XFS_SB_LOCK
* 6. enabling or disabling a counter requires holding the XFS_SB_LOCK
* and _none_ of the per-cpu locks.
*
* Disabled counters are only ever re-enabled by a balance operation
* that results in more free resources per CPU than a given threshold.
* To ensure counters don't remain disabled, they are rebalanced when
* the global resource goes above a higher threshold (i.e. some hysteresis
* is present to prevent thrashing).
*
* Note: hotplug CPUs not yet supported
*/
/*
 * Allocate the per-cpu counter structures for a mount and initialise the
 * spinlock embedded in each online CPU's copy.
 *
 * Returns 0 on success, or -ENOMEM if the per-cpu allocation fails (in which
 * case mp->m_sb_cnts is left NULL).
 */
int
xfs_icsb_init_counters(
	xfs_mount_t	*mp)
{
	int		cpu;

	mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
	if (!mp->m_sb_cnts)
		return -ENOMEM;

	for_each_online_cpu(cpu) {
		xfs_icsb_cnts_t	*pcp;

		pcp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
		spin_lock_init(&pcp->icsb_lock);
	}

	/*
	 * All bits set: every counter begins life disabled, and it is the
	 * first balance operation that switches the counters on.
	 */
	mp->m_icsb_counters = -1;
	return 0;
}
/*
 * Release the per-cpu counter structures of a mount, if any were allocated.
 */
STATIC void
xfs_icsb_destroy_counters(
	xfs_mount_t	*mp)
{
	/* nothing to free when the per-cpu allocation never happened */
	if (mp->m_sb_cnts == NULL)
		return;

	free_percpu(mp->m_sb_cnts);
}
/*
 * Take the per-cpu counter lock of every online CPU, walking the CPUs in
 * for_each_online_cpu() order.
 */
STATIC inline void
xfs_icsb_lock_all_counters(
	xfs_mount_t	*mp)
{
	int		cpu;

	for_each_online_cpu(cpu) {
		xfs_icsb_cnts_t	*pcp;

		pcp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
		spin_lock(&pcp->icsb_lock);
	}
}
/*
 * Drop the per-cpu counter lock of every online CPU, walking the CPUs in
 * for_each_online_cpu() order.
 */
STATIC inline void
xfs_icsb_unlock_all_counters(
	xfs_mount_t	*mp)
{
	int		cpu;

	for_each_online_cpu(cpu) {
		xfs_icsb_cnts_t	*pcp;

		pcp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
		spin_unlock(&pcp->icsb_lock);
	}
}
/*
 * Sum the per-cpu counters of all online CPUs into *cnt.
 *
 * *cnt is zeroed first.  Unless XFS_ICSB_LAZY_COUNT is set in flags, all
 * per-cpu locks are held across the summation; with the flag set the
 * counters are read without taking any per-cpu lock here.
 */
STATIC void
xfs_icsb_count(
	xfs_mount_t	*mp,
	xfs_icsb_cnts_t	*cnt,
	int		flags)
{
	int		take_locks = !(flags & XFS_ICSB_LAZY_COUNT);
	int		cpu;

	memset(cnt, 0, sizeof(xfs_icsb_cnts_t));

	if (take_locks)
		xfs_icsb_lock_all_counters(mp);

	for_each_online_cpu(cpu) {
		xfs_icsb_cnts_t	*pcp;

		pcp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
		cnt->icsb_icount += pcp->icsb_icount;
		cnt->icsb_ifree += pcp->icsb_ifree;
		cnt->icsb_fdblocks += pcp->icsb_fdblocks;
	}

	if (take_locks)
		xfs_icsb_unlock_all_counters(mp);
}
/*
 * Report whether the per-cpu fast path for the given superblock field is
 * currently disabled, i.e. whether its bit is set in mp->m_icsb_counters.
 *
 * Returns non-zero when disabled, 0 when enabled.
 */
STATIC int
xfs_icsb_counter_disabled(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field)
{
	int		disabled;

	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

	disabled = test_bit(field, &mp->m_icsb_counters);
	return disabled;
}
/*
 * Disable the per-cpu fast path for one superblock field.
 *
 * With every per-cpu lock held, the field's bit in mp->m_icsb_counters is
 * set.  If the counter was enabled until now, the per-cpu values are summed
 * and the total is written to the matching mp->m_sb field, so the in-core
 * superblock carries the full value while the counter is disabled.
 *
 * Always returns 0.
 */
STATIC int
xfs_icsb_disable_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field)
{
	xfs_icsb_cnts_t	total;

	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

	xfs_icsb_lock_all_counters(mp);

	/* already disabled: the superblock value is current, nothing to drain */
	if (test_and_set_bit(field, &mp->m_icsb_counters))
		goto out_unlock;

	/*
	 * Drain back to the superblock.  The lazy flag is required here: the
	 * per-cpu locks are already held, and a non-lazy count would try to
	 * take them again.
	 */
	xfs_icsb_count(mp, &total, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
	switch (field) {
	case XFS_SBS_ICOUNT:
		mp->m_sb.sb_icount = total.icsb_icount;
		break;
	case XFS_SBS_IFREE:
		mp->m_sb.sb_ifree = total.icsb_ifree;
		break;
	case XFS_SBS_FDBLOCKS:
		mp->m_sb.sb_fdblocks = total.icsb_fdblocks;
		break;
	default:
		BUG();
	}

out_unlock:
	xfs_icsb_unlock_all_counters(mp);
	return 0;
}
/*
 * Load one field of every online CPU's counter and enable its fast path.
 *
 * Each CPU receives "count"; the first CPU visited additionally receives
 * "resid".  Once all CPUs are loaded, the field's bit in
 * mp->m_icsb_counters is cleared.  All per-cpu locks are held throughout.
 */
STATIC void
xfs_icsb_enable_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	uint64_t	count,
	uint64_t	resid)
{
	uint64_t	share = count + resid;	/* first CPU takes the remainder */
	int		cpu;

	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));

	xfs_icsb_lock_all_counters(mp);
	for_each_online_cpu(cpu) {
		xfs_icsb_cnts_t	*pcp = per_cpu_ptr(mp->m_sb_cnts, cpu);

		switch (field) {
		case XFS_SBS_ICOUNT:
			pcp->icsb_icount = share;
			break;
		case XFS_SBS_IFREE:
			pcp->icsb_ifree = share;
			break;
		case XFS_SBS_FDBLOCKS:
			pcp->icsb_fdblocks = share;
			break;
		default:
			BUG();
			break;
		}

		/* every CPU after the first gets the plain per-cpu share */
		share = count;
	}
	clear_bit(field, &mp->m_icsb_counters);
	xfs_icsb_unlock_all_counters(mp);
}
/*
 * Fold the per-cpu counters into the in-core superblock.
 *
 * flags:
 *   XFS_ICSB_SB_LOCKED  - the caller already holds the superblock lock, so
 *                         it is neither taken nor released here.
 *   XFS_ICSB_LAZY_COUNT - passed through to xfs_icsb_count(), which then
 *                         sums the counters without taking the per-cpu
 *                         locks.
 *
 * Only fields whose per-cpu counter is currently enabled are written; for a
 * disabled counter mp->m_sb is left untouched.
 *
 * Nothing is done when the mount is flagged XFS_MOUNT_NO_PERCPU_SB:
 * xfs_mount_init() sets that flag when the per-cpu allocation failed, in
 * which case mp->m_sb_cnts is NULL and must not be dereferenced.
 */
STATIC void
xfs_icsb_sync_counters_int(
	xfs_mount_t	*mp,
	int		flags)
{
	xfs_icsb_cnts_t	cnt;
	/* unsigned long, matching the other XFS_SB_LOCK users in this file */
	unsigned long	s = 0;
	int		need_sb_lock = !(flags & XFS_ICSB_SB_LOCKED);

	if (mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)
		return;

	/* Step 1: take the superblock lock unless the caller holds it */
	if (need_sb_lock)
		s = XFS_SB_LOCK(mp);

	/* Step 2: sum the per-cpu counters */
	xfs_icsb_count(mp, &cnt, flags);

	/* Step 3: update mp->m_sb fields of the enabled counters */
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
		mp->m_sb.sb_icount = cnt.icsb_icount;
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
		mp->m_sb.sb_ifree = cnt.icsb_ifree;
	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
		mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;

	if (need_sb_lock)
		XFS_SB_UNLOCK(mp, s);
}
/*
 * Accurate sync of the per-cpu counters to the incore superblock: no flags
 * are passed, so the superblock lock and the per-cpu locks are taken by the
 * callee.
 */
STATIC void
xfs_icsb_sync_counters(
	xfs_mount_t	*mp)
{
	const int	accurate = 0;	/* neither SB_LOCKED nor LAZY_COUNT */

	xfs_icsb_sync_counters_int(mp, accurate);
}
/*
 * Lazy sync of the per-cpu counters to the incore superblock, for callers
 * that do not need an exact answer (things like df, background sb syncs,
 * etc).  The counters are summed with XFS_ICSB_LAZY_COUNT set.
 */
void
xfs_icsb_sync_counters_lazy(
	xfs_mount_t	*mp)
{
	const int	lazy = XFS_ICSB_LAZY_COUNT;

	xfs_icsb_sync_counters_int(mp, lazy);
}
/*
 * Balance and enable/disable counters as necessary.
 *
 * The counter for "field" is first disabled, which drains the per-cpu values
 * back into mp->m_sb.  The superblock value is then divided evenly between
 * the online CPUs (the first CPU also receives the remainder) and the
 * counter is re-enabled, but only if the per-cpu share reaches the
 * re-enable threshold.  Otherwise the counter stays disabled.
 *
 * Thresholds for re-enabling counters are somewhat magic.  Inode counts are
 * chosen to be the same number as a single on-disk allocation chunk per CPU,
 * and free blocks is something far enough from zero that we aren't going to
 * thrash when we get near ENOSPC.
 *
 * flags: XFS_ICSB_SB_LOCKED means the caller already holds the superblock
 * lock; otherwise it is taken and released here.
 *
 * Nothing is done when the mount is flagged XFS_MOUNT_NO_PERCPU_SB:
 * xfs_mount_init() sets that flag when the per-cpu allocation failed, in
 * which case mp->m_sb_cnts is NULL and must not be dereferenced.
 */
#define XFS_ICSB_INO_CNTR_REENABLE 64
#define XFS_ICSB_FDBLK_CNTR_REENABLE 512
STATIC void
xfs_icsb_balance_counter(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		flags)
{
	uint64_t	count = 0, resid, threshold = 0;
	int		weight = num_online_cpus();
	/* unsigned long, matching the other XFS_SB_LOCK users in this file */
	unsigned long	s = 0;
	int		need_sb_lock = !(flags & XFS_ICSB_SB_LOCKED);

	if (mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)
		return;

	if (need_sb_lock)
		s = XFS_SB_LOCK(mp);

	/* disable counter and sync counter */
	xfs_icsb_disable_counter(mp, field);

	/* pick up the drained total and the threshold that applies to it */
	switch (field) {
	case XFS_SBS_ICOUNT:
		count = mp->m_sb.sb_icount;
		threshold = XFS_ICSB_INO_CNTR_REENABLE;
		break;
	case XFS_SBS_IFREE:
		count = mp->m_sb.sb_ifree;
		threshold = XFS_ICSB_INO_CNTR_REENABLE;
		break;
	case XFS_SBS_FDBLOCKS:
		count = mp->m_sb.sb_fdblocks;
		threshold = XFS_ICSB_FDBLK_CNTR_REENABLE;
		break;
	default:
		BUG();
		break;
	}

	/* do_div() leaves the quotient in count and returns the remainder */
	resid = do_div(count, weight);

	/* update counters - first CPU gets residual */
	if (count >= threshold)
		xfs_icsb_enable_counter(mp, field, count, resid);

	if (need_sb_lock)
		XFS_SB_UNLOCK(mp, s);
}
/*
 * Apply "delta" to one per-cpu superblock counter.
 *
 * Fast path: with preemption disabled and the local CPU's counter lock held,
 * the delta is applied to this CPU's share of the field.  This succeeds as
 * long as the counter is enabled and the result does not go negative.
 *
 * Slow path: taken when the counter is disabled or the local share is too
 * small.  It runs with the superblock lock held (taken here unless flags
 * contains XFS_ICSB_SB_LOCKED).  The first time through, the counter is
 * rebalanced and the fast path is retried.  If that retry fails as well, the
 * counter is disabled - draining it into the in-core superblock - and the
 * delta is applied through xfs_mod_incore_sb_unlocked().
 *
 * Returns 0 when the fast path succeeds, otherwise the return value of
 * xfs_mod_incore_sb_unlocked().
 */
STATIC int
xfs_icsb_modify_counters_int(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		delta,
	int		rsvd,
	int		flags)
{
	xfs_icsb_cnts_t	*icsbp;
	uint64_t	*counterp = NULL; /* this CPU's share of "field" */
	long long	lcounter;	/* long counter for 64 bit fields */
	/* unsigned long, matching the other XFS_SB_LOCK users in this file */
	unsigned long	s = 0;
	int		cpu, locked = 0;
	int		ret = 0, balance_done = 0;

again:
	cpu = get_cpu();
	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
	spin_lock(&icsbp->icsb_lock);
	if (unlikely(xfs_icsb_counter_disabled(mp, field)))
		goto slow_path;

	switch (field) {
	case XFS_SBS_ICOUNT:
		counterp = &icsbp->icsb_icount;
		break;
	case XFS_SBS_IFREE:
		counterp = &icsbp->icsb_ifree;
		break;
	case XFS_SBS_FDBLOCKS:
		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
		counterp = &icsbp->icsb_fdblocks;
		break;
	default:
		BUG();
		break;
	}

	/* a negative result means the local share cannot cover the request */
	lcounter = *counterp;
	lcounter += delta;
	if (unlikely(lcounter < 0))
		goto slow_path;
	*counterp = lcounter;

	spin_unlock(&icsbp->icsb_lock);
	put_cpu();
	if (locked)
		XFS_SB_UNLOCK(mp, s);
	return 0;

	/*
	 * The slow path needs to be run with the SBLOCK
	 * held so that we prevent other threads from
	 * attempting to run this path at the same time.
	 * This provides exclusion for the balancing code,
	 * and exclusive fallback if the balance does not
	 * provide enough resources to continue in an unlocked
	 * manner.
	 */
slow_path:
	spin_unlock(&icsbp->icsb_lock);
	put_cpu();

	/*
	 * Need to hold the superblock lock in case we need to disable a
	 * counter.  Remember that it is now held so the retry does not take
	 * it a second time and so that it is dropped again on the way out.
	 */
	if (!(flags & XFS_ICSB_SB_LOCKED)) {
		s = XFS_SB_LOCK(mp);
		locked = 1;
		flags |= XFS_ICSB_SB_LOCKED;
	}
	if (!balance_done) {
		xfs_icsb_balance_counter(mp, field, flags);
		balance_done = 1;
		goto again;
	} else {
		/*
		 * we might not have enough on this local
		 * cpu to allocate for a bulk request.
		 * We need to drain this field from all CPUs
		 * and disable the counter fastpath
		 */
		xfs_icsb_disable_counter(mp, field);
	}

	ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);

	if (locked)
		XFS_SB_UNLOCK(mp, s);
	return ret;
}
/*
 * Apply a delta to a per-cpu superblock counter.  No flags are passed, so
 * the callee takes the superblock lock itself if it needs it.
 *
 * Returns whatever xfs_icsb_modify_counters_int() returns.
 */
STATIC int
xfs_icsb_modify_counters(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		delta,
	int		rsvd)
{
	const int	no_flags = 0;

	return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, no_flags);
}
/*
 * Variant of xfs_icsb_modify_counters() for callers that already hold the
 * superblock lock: XFS_ICSB_SB_LOCKED is passed down so the lock is not
 * taken again.
 *
 * Returns whatever xfs_icsb_modify_counters_int() returns.
 */
STATIC int
xfs_icsb_modify_counters_locked(
	xfs_mount_t	*mp,
	xfs_sb_field_t	field,
	int		delta,
	int		rsvd)
{
	const int	sb_locked = XFS_ICSB_SB_LOCKED;

	return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, sb_locked);
}
#endif
......@@ -267,6 +267,32 @@ typedef struct xfs_ioops {
#define XFS_IODONE(vfsp) \
(*(mp)->m_io_ops.xfs_iodone)(vfsp)
#ifdef HAVE_PERCPU_SB
/*
* Valid per-cpu incore superblock counters. Note that if you add new counters,
* you may need to define new counter disabled bit field descriptors as there
* are more possible fields in the superblock than can fit in a bitfield on a
* 32 bit platform. The XFS_SBS_* values for the current counters just
* fit.
*/
typedef struct xfs_icsb_cnts {
uint64_t icsb_fdblocks; /* one CPU's share; summed into m_sb.sb_fdblocks */
uint64_t icsb_ifree; /* one CPU's share; summed into m_sb.sb_ifree */
uint64_t icsb_icount; /* one CPU's share; summed into m_sb.sb_icount */
spinlock_t icsb_lock; /* held while this CPU's counters are modified */
} xfs_icsb_cnts_t;
#define XFS_ICSB_SB_LOCKED (1 << 0) /* sb already locked */
#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */
extern int xfs_icsb_init_counters(struct xfs_mount *);
extern void xfs_icsb_sync_counters_lazy(struct xfs_mount *);
#else
#define xfs_icsb_init_counters(mp) (0)
#define xfs_icsb_sync_counters_lazy(mp) do { } while (0)
#endif
typedef struct xfs_mount {
bhv_desc_t m_bhv; /* vfs xfs behavior */
......@@ -372,6 +398,10 @@ typedef struct xfs_mount {
struct xfs_qmops m_qm_ops; /* vector of XQM ops */
struct xfs_ioops m_io_ops; /* vector of I/O ops */
atomic_t m_active_trans; /* number trans frozen */
#ifdef HAVE_PERCPU_SB
xfs_icsb_cnts_t *m_sb_cnts; /* per-cpu superblock counters */
unsigned long m_icsb_counters; /* disabled per-cpu counters */
#endif
} xfs_mount_t;
/*
......@@ -409,6 +439,8 @@ typedef struct xfs_mount {
#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */
#define XFS_MOUNT_COMPAT_IOSIZE (1ULL << 22) /* don't report large preferred
* I/O size in stat() */
#define XFS_MOUNT_NO_PERCPU_SB (1ULL << 23) /* don't use per-cpu superblock
counters */
/*
......@@ -546,6 +578,8 @@ extern void xfs_unmountfs_close(xfs_mount_t *, struct cred *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
extern int xfs_unmount_flush(xfs_mount_t *, int);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
int, int);
extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
uint, int);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
......
......@@ -807,6 +807,7 @@ xfs_statvfs(
statp->f_type = XFS_SB_MAGIC;
xfs_icsb_sync_counters_lazy(mp);
s = XFS_SB_LOCK(mp);
statp->f_bsize = sbp->sb_blocksize;
lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment