Commit a3f74ffb authored by David Chinner's avatar David Chinner Committed by Lachlan McIlroy

[XFS] Don't block pdflush when writing back inodes

When pdflush is writing back inodes, it can get stuck on inode cluster
buffers that are currently under I/O. This occurs when we write data to
multiple inodes in the same inode cluster at the same time.

Effectively, delayed allocation marks the inode dirty during the data
writeback. Hence if the inode cluster was flushed during the writeback of
the first inode, the writeback of the second inode will block waiting for
the inode cluster write to complete before writing it again for the newly
dirtied inode.

Basically, we want to avoid this from happening so we don't block pdflush
and slow down all of writeback. Hence we introduce a non-blocking async
inode flush flag that pdflush uses. If this flag is set, we use
non-blocking operations (e.g. try locks) whereever we can to avoid
blocking or extra I/O being issued.

SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30501a
Signed-off-by: default avatarDavid Chinner <dgc@sgi.com>
Signed-off-by: default avatarLachlan McIlroy <lachlan@sgi.com>
parent 4ae29b43
......@@ -896,7 +896,8 @@ xfs_fs_write_inode(
struct inode *inode,
int sync)
{
int error = 0, flags = FLUSH_INODE;
int error = 0;
int flags = 0;
xfs_itrace_entry(XFS_I(inode));
if (sync) {
......
......@@ -73,12 +73,9 @@ typedef enum bhv_vrwlock {
#define IO_INVIS 0x00020 /* don't update inode timestamps */
/*
* Flags for vop_iflush call
* Flags for xfs_inode_flush
*/
#define FLUSH_SYNC 1 /* wait for flush to complete */
#define FLUSH_INODE 2 /* flush the inode itself */
#define FLUSH_LOG 4 /* force the last log entry for
* this inode out to disk */
/*
* Flush/Invalidate options for vop_toss/flush/flushinval_pages.
......
......@@ -145,11 +145,16 @@ xfs_imap_to_bp(
xfs_buf_t *bp;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
(int)imap->im_len, XFS_BUF_LOCK, &bp);
(int)imap->im_len, buf_flags, &bp);
if (error) {
cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned "
if (error != EAGAIN) {
cmn_err(CE_WARN,
"xfs_imap_to_bp: xfs_trans_read_buf()returned "
"an error %d on %s. Returning error.",
error, mp->m_fsname);
} else {
ASSERT(buf_flags & XFS_BUF_TRYLOCK);
}
return error;
}
......@@ -274,7 +279,8 @@ xfs_itobp(
xfs_dinode_t **dipp,
xfs_buf_t **bpp,
xfs_daddr_t bno,
uint imap_flags)
uint imap_flags,
uint buf_flags)
{
xfs_imap_t imap;
xfs_buf_t *bp;
......@@ -305,10 +311,17 @@ xfs_itobp(
}
ASSERT(bno == 0 || bno == imap.im_blkno);
error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
if (error)
return error;
if (!bp) {
ASSERT(buf_flags & XFS_BUF_TRYLOCK);
ASSERT(tp == NULL);
*bpp = NULL;
return EAGAIN;
}
*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
*bpp = bp;
return 0;
......@@ -812,7 +825,7 @@ xfs_iread(
* return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
* know that this is a new incore inode.
*/
error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags);
error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
if (error) {
kmem_zone_free(xfs_inode_zone, ip);
return error;
......@@ -1901,7 +1914,7 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error)
return error;
......@@ -2009,7 +2022,7 @@ xfs_iunlink_remove(
* of dealing with the buffer when there is no need to
* change it.
*/
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
......@@ -2071,7 +2084,7 @@ xfs_iunlink_remove(
* Now last_ibp points to the buffer previous to us on
* the unlinked list. Pull us from the list.
*/
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0);
error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
......@@ -2334,7 +2347,7 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0);
error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
if (error)
return error;
......@@ -2777,38 +2790,41 @@ xfs_iunpin(
}
/*
* This is called to wait for the given inode to be unpinned.
* It will sleep until this happens. The caller must have the
* inode locked in at least shared mode so that the buffer cannot
* be subsequently pinned once someone is waiting for it to be
* unpinned.
* This is called to unpin an inode. It can be directed to wait or to return
* immediately without waiting for the inode to be unpinned. The caller must
* have the inode locked in at least shared mode so that the buffer cannot be
* subsequently pinned once someone is waiting for it to be unpinned.
*/
STATIC void
xfs_iunpin_wait(
xfs_inode_t *ip)
__xfs_iunpin_wait(
xfs_inode_t *ip,
int wait)
{
xfs_inode_log_item_t *iip;
xfs_lsn_t lsn;
xfs_inode_log_item_t *iip = ip->i_itemp;
ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
if (atomic_read(&ip->i_pincount) == 0) {
if (atomic_read(&ip->i_pincount) == 0)
return;
}
iip = ip->i_itemp;
if (iip && iip->ili_last_lsn) {
lsn = iip->ili_last_lsn;
} else {
lsn = (xfs_lsn_t)0;
}
/* Give the log a push to start the unpinning I/O */
xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
iip->ili_last_lsn : 0, XFS_LOG_FORCE);
if (wait)
wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
}
/*
* Give the log a push so we don't wait here too long.
*/
xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
static inline void
xfs_iunpin_wait(
xfs_inode_t *ip)
{
__xfs_iunpin_wait(ip, 1);
}
wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
static inline void
xfs_iunpin_nowait(
xfs_inode_t *ip)
{
__xfs_iunpin_wait(ip, 0);
}
......@@ -3003,6 +3019,7 @@ xfs_iflush(
int bufwasdelwri;
struct hlist_node *entry;
enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
XFS_STATS_INC(xs_iflush_count);
......@@ -3027,11 +3044,21 @@ xfs_iflush(
}
/*
* We can't flush the inode until it is unpinned, so
* wait for it. We know noone new can pin it, because
* we are holding the inode lock shared and you need
* to hold it exclusively to pin the inode.
* We can't flush the inode until it is unpinned, so wait for it if we
* are allowed to block. We know noone new can pin it, because we are
* holding the inode lock shared and you need to hold it exclusively to
* pin the inode.
*
* If we are not allowed to block, force the log out asynchronously so
* that when we come back the inode will be unpinned. If other inodes
* in the same cluster are dirty, they will probably write the inode
* out for us if they occur after the log force completes.
*/
if (noblock && xfs_ipincount(ip)) {
xfs_iunpin_nowait(ip);
xfs_ifunlock(ip);
return EAGAIN;
}
xfs_iunpin_wait(ip);
/*
......@@ -3047,15 +3074,6 @@ xfs_iflush(
return XFS_ERROR(EIO);
}
/*
* Get the buffer containing the on-disk inode.
*/
error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
if (error) {
xfs_ifunlock(ip);
return error;
}
/*
* Decide how buffer will be flushed out. This is done before
* the call to xfs_iflush_int because this field is zeroed by it.
......@@ -3072,6 +3090,7 @@ xfs_iflush(
case XFS_IFLUSH_DELWRI_ELSE_SYNC:
flags = 0;
break;
case XFS_IFLUSH_ASYNC_NOBLOCK:
case XFS_IFLUSH_ASYNC:
case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
flags = INT_ASYNC;
......@@ -3091,6 +3110,7 @@ xfs_iflush(
case XFS_IFLUSH_DELWRI:
flags = INT_DELWRI;
break;
case XFS_IFLUSH_ASYNC_NOBLOCK:
case XFS_IFLUSH_ASYNC:
flags = INT_ASYNC;
break;
......@@ -3104,6 +3124,16 @@ xfs_iflush(
}
}
/*
* Get the buffer containing the on-disk inode.
*/
error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
if (error || !bp) {
xfs_ifunlock(ip);
return error;
}
/*
* First flush out the inode that xfs_iflush was called with.
*/
......@@ -3112,6 +3142,13 @@ xfs_iflush(
goto corrupt_out;
}
/*
* If the buffer is pinned then push on the log now so we won't
* get stuck waiting in the write for too long.
*/
if (XFS_BUF_ISPINNED(bp))
xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
/*
* inode clustering:
* see if other inodes can be gathered into this write
......@@ -3181,14 +3218,6 @@ xfs_iflush(
XFS_STATS_ADD(xs_icluster_flushinode, clcount);
}
/*
* If the buffer is pinned then push on the log so we won't
* get stuck waiting in the write for too long.
*/
if (XFS_BUF_ISPINNED(bp)){
xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
}
if (flags & INT_DELWRI) {
xfs_bdwrite(mp, bp);
} else if (flags & INT_ASYNC) {
......
......@@ -457,6 +457,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
#define XFS_IFLUSH_SYNC 3
#define XFS_IFLUSH_ASYNC 4
#define XFS_IFLUSH_DELWRI 5
#define XFS_IFLUSH_ASYNC_NOBLOCK 6
/*
* Flags for xfs_itruncate_start().
......@@ -511,7 +512,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int);
*/
int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
xfs_daddr_t, uint);
xfs_daddr_t, uint, uint);
int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
xfs_inode_t **, xfs_daddr_t, uint);
int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
......
......@@ -614,7 +614,8 @@ xfs_bulkstat(
xfs_buf_relse(bp);
error = xfs_itobp(mp, NULL, ip,
&dip, &bp, bno,
XFS_IMAP_BULKSTAT);
XFS_IMAP_BULKSTAT,
XFS_BUF_LOCK);
if (!error)
clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
kmem_zone_free(xfs_inode_zone, ip);
......
......@@ -3214,7 +3214,8 @@ xlog_recover_process_iunlinks(
* next inode in the bucket.
*/
error = xfs_itobp(mp, NULL, ip, &dip,
&ibp, 0, 0);
&ibp, 0, 0,
XFS_BUF_LOCK);
ASSERT(error || (dip != NULL));
}
......
......@@ -304,7 +304,8 @@ xfs_trans_read_buf(
if (tp == NULL) {
bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
if (!bp)
return XFS_ERROR(ENOMEM);
return (flags & XFS_BUF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
xfs_ioerror_alert("xfs_trans_read_buf", mp,
......
......@@ -3468,29 +3468,6 @@ xfs_inode_flush(
((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
return 0;
if (flags & FLUSH_LOG) {
if (iip && iip->ili_last_lsn) {
xlog_t *log = mp->m_log;
xfs_lsn_t sync_lsn;
int log_flags = XFS_LOG_FORCE;
spin_lock(&log->l_grant_lock);
sync_lsn = log->l_last_sync_lsn;
spin_unlock(&log->l_grant_lock);
if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
if (flags & FLUSH_SYNC)
log_flags |= XFS_LOG_SYNC;
error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
if (error)
return error;
}
if (ip->i_update_core == 0)
return 0;
}
}
/*
* We make this non-blocking if the inode is contended,
* return EAGAIN to indicate to the caller that they
......@@ -3498,9 +3475,6 @@ xfs_inode_flush(
* blocking on inodes inside another operation right
* now, they get caught later by xfs_sync.
*/
if (flags & FLUSH_INODE) {
int flush_flags;
if (flags & FLUSH_SYNC) {
xfs_ilock(ip, XFS_ILOCK_SHARED);
xfs_iflock(ip);
......@@ -3513,14 +3487,9 @@ xfs_inode_flush(
return EAGAIN;
}
if (flags & FLUSH_SYNC)
flush_flags = XFS_IFLUSH_SYNC;
else
flush_flags = XFS_IFLUSH_ASYNC;
error = xfs_iflush(ip, flush_flags);
error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
: XFS_IFLUSH_ASYNC_NOBLOCK);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
}
return error;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment