Commit 661b99e9 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'xfs-for-linus-3.18-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs fixes from Dave Chinner:
 "This update fixes a warning in the new pagecache_isize_extended() and
  updates some related comments, another fix for zero-range
  misbehaviour, and an unforntuately large set of fixes for regressions
  in the bulkstat code.

  The bulkstat fixes are large but necessary.  I wouldn't normally push
  such a rework for a -rcX update, but right now xfsdump can silently
  create incomplete dumps on 3.17 and it's possible that even xfsrestore
  won't notice that the dumps were incomplete.  Hence we need to get
  this update into 3.17-stable kernels ASAP.

  In more detail, the refactoring work I committed in 3.17 has exposed a
  major hole in our QA coverage.  With both xfsdump (the major user of
  bulkstat) and xfsrestore silently ignoring missing files in the
  dump/restore process, incomplete dumps were going unnoticed if they
  were being triggered.  Many of the dump/restore filesets were so small
  that they didn't evenhave a chance of triggering the loop iteration
  bugs we introduced in 3.17, so we didn't exercise the code
  sufficiently, either.

  We have already taken steps to improve QA coverage in xfstests to
  avoid this happening again, and I've done a lot of manual verification
  of dump/restore on very large data sets (tens of millions of inodes)
  of the past week to verify this patch set results in bulkstat behaving
  the same way as it does on 3.16.

  Unfortunately, the fixes are not exactly simple - in tracking down the
  problem historic API warts were discovered (e.g xfsdump has been
  working around a 20 year old bug in the bulkstat API for the past 10
  years) and so that complicated the process of diagnosing and fixing
  the problems.  i.e. we had to fix bugs in the code as well as
  discover and re-introduce the userspace visible API bugs that we
  unwittingly "fixed" in 3.17 that xfsdump relied on to work correctly.

  Summary:

   - incorrect warnings about i_mutex locking in pagecache_isize_extended()
     and updates comments to match expected locking
   - another zero-range bug fix for stray file size updates
   - a bunch of fixes for regression in the bulkstat code introduced in
     3.17"

* tag 'xfs-for-linus-3.18-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs:
  xfs: track bulkstat progress by agino
  xfs: bulkstat error handling is broken
  xfs: bulkstat main loop logic is a mess
  xfs: bulkstat chunk-formatter has issues
  xfs: bulkstat chunk formatting cursor is broken
  xfs: bulkstat btree walk doesn't terminate
  mm: Fix comment before truncate_setsize()
  xfs: rework zero range to prevent invalid i_size updates
  mm: Remove false WARN_ON from pagecache_isize_extended()
  xfs: Check error during inode btree iteration in xfs_bulkstat()
  xfs: bulkstat doesn't release AGI buffer on error
parents 51f83ef0 00275899
...@@ -1338,7 +1338,10 @@ xfs_free_file_space( ...@@ -1338,7 +1338,10 @@ xfs_free_file_space(
goto out; goto out;
} }
/*
* Preallocate and zero a range of a file. This mechanism has the allocation
* semantics of fallocate and in addition converts data in the range to zeroes.
*/
int int
xfs_zero_file_space( xfs_zero_file_space(
struct xfs_inode *ip, struct xfs_inode *ip,
...@@ -1346,65 +1349,30 @@ xfs_zero_file_space( ...@@ -1346,65 +1349,30 @@ xfs_zero_file_space(
xfs_off_t len) xfs_off_t len)
{ {
struct xfs_mount *mp = ip->i_mount; struct xfs_mount *mp = ip->i_mount;
uint granularity; uint blksize;
xfs_off_t start_boundary;
xfs_off_t end_boundary;
int error; int error;
trace_xfs_zero_file_space(ip); trace_xfs_zero_file_space(ip);
granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); blksize = 1 << mp->m_sb.sb_blocklog;
/* /*
* Round the range of extents we are going to convert inwards. If the * Punch a hole and prealloc the range. We use hole punch rather than
* offset is aligned, then it doesn't get changed so we zero from the * unwritten extent conversion for two reasons:
* start of the block offset points to. *
* 1.) Hole punch handles partial block zeroing for us.
*
* 2.) If prealloc returns ENOSPC, the file range is still zero-valued
* by virtue of the hole punch.
*/ */
start_boundary = round_up(offset, granularity); error = xfs_free_file_space(ip, offset, len);
end_boundary = round_down(offset + len, granularity); if (error)
goto out;
ASSERT(start_boundary >= offset);
ASSERT(end_boundary <= offset + len);
if (start_boundary < end_boundary - 1) {
/*
* Writeback the range to ensure any inode size updates due to
* appending writes make it to disk (otherwise we could just
* punch out the delalloc blocks).
*/
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
start_boundary, end_boundary - 1);
if (error)
goto out;
truncate_pagecache_range(VFS_I(ip), start_boundary,
end_boundary - 1);
/* convert the blocks */
error = xfs_alloc_file_space(ip, start_boundary,
end_boundary - start_boundary - 1,
XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT);
if (error)
goto out;
/* We've handled the interior of the range, now for the edges */
if (start_boundary != offset) {
error = xfs_iozero(ip, offset, start_boundary - offset);
if (error)
goto out;
}
if (end_boundary != offset + len)
error = xfs_iozero(ip, end_boundary,
offset + len - end_boundary);
} else {
/*
* It's either a sub-granularity range or the range spanned lies
* partially across two adjacent blocks.
*/
error = xfs_iozero(ip, offset, len);
}
error = xfs_alloc_file_space(ip, round_down(offset, blksize),
round_up(offset + len, blksize) -
round_down(offset, blksize),
XFS_BMAPI_PREALLOC);
out: out:
return error; return error;
......
...@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk( ...@@ -236,8 +236,10 @@ xfs_bulkstat_grab_ichunk(
XFS_WANT_CORRUPTED_RETURN(stat == 1); XFS_WANT_CORRUPTED_RETURN(stat == 1);
/* Check if the record contains the inode in request */ /* Check if the record contains the inode in request */
if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) {
return -EINVAL; *icount = 0;
return 0;
}
idx = agino - irec->ir_startino + 1; idx = agino - irec->ir_startino + 1;
if (idx < XFS_INODES_PER_CHUNK && if (idx < XFS_INODES_PER_CHUNK &&
...@@ -262,75 +264,76 @@ xfs_bulkstat_grab_ichunk( ...@@ -262,75 +264,76 @@ xfs_bulkstat_grab_ichunk(
#define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size) #define XFS_BULKSTAT_UBLEFT(ubleft) ((ubleft) >= statstruct_size)
struct xfs_bulkstat_agichunk {
char __user **ac_ubuffer;/* pointer into user's buffer */
int ac_ubleft; /* bytes left in user's buffer */
int ac_ubelem; /* spaces used in user's buffer */
};
/* /*
* Process inodes in chunk with a pointer to a formatter function * Process inodes in chunk with a pointer to a formatter function
* that will iget the inode and fill in the appropriate structure. * that will iget the inode and fill in the appropriate structure.
*/ */
int static int
xfs_bulkstat_ag_ichunk( xfs_bulkstat_ag_ichunk(
struct xfs_mount *mp, struct xfs_mount *mp,
xfs_agnumber_t agno, xfs_agnumber_t agno,
struct xfs_inobt_rec_incore *irbp, struct xfs_inobt_rec_incore *irbp,
bulkstat_one_pf formatter, bulkstat_one_pf formatter,
size_t statstruct_size, size_t statstruct_size,
struct xfs_bulkstat_agichunk *acp) struct xfs_bulkstat_agichunk *acp,
xfs_agino_t *last_agino)
{ {
xfs_ino_t lastino = acp->ac_lastino;
char __user **ubufp = acp->ac_ubuffer; char __user **ubufp = acp->ac_ubuffer;
int ubleft = acp->ac_ubleft; int chunkidx;
int ubelem = acp->ac_ubelem;
int chunkidx, clustidx;
int error = 0; int error = 0;
xfs_agino_t agino; xfs_agino_t agino = irbp->ir_startino;
for (agino = irbp->ir_startino, chunkidx = clustidx = 0; for (chunkidx = 0; chunkidx < XFS_INODES_PER_CHUNK;
XFS_BULKSTAT_UBLEFT(ubleft) && chunkidx++, agino++) {
irbp->ir_freecount < XFS_INODES_PER_CHUNK; int fmterror;
chunkidx++, clustidx++, agino++) {
int fmterror; /* bulkstat formatter result */
int ubused; int ubused;
xfs_ino_t ino = XFS_AGINO_TO_INO(mp, agno, agino);
ASSERT(chunkidx < XFS_INODES_PER_CHUNK); /* inode won't fit in buffer, we are done */
if (acp->ac_ubleft < statstruct_size)
break;
/* Skip if this inode is free */ /* Skip if this inode is free */
if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free) { if (XFS_INOBT_MASK(chunkidx) & irbp->ir_free)
lastino = ino;
continue; continue;
}
/*
* Count used inodes as free so we can tell when the
* chunk is used up.
*/
irbp->ir_freecount++;
/* Get the inode and fill in a single buffer */ /* Get the inode and fill in a single buffer */
ubused = statstruct_size; ubused = statstruct_size;
error = formatter(mp, ino, *ubufp, ubleft, &ubused, &fmterror); error = formatter(mp, XFS_AGINO_TO_INO(mp, agno, agino),
if (fmterror == BULKSTAT_RV_NOTHING) { *ubufp, acp->ac_ubleft, &ubused, &fmterror);
if (error && error != -ENOENT && error != -EINVAL) {
ubleft = 0; if (fmterror == BULKSTAT_RV_GIVEUP ||
break; (error && error != -ENOENT && error != -EINVAL)) {
} acp->ac_ubleft = 0;
lastino = ino;
continue;
}
if (fmterror == BULKSTAT_RV_GIVEUP) {
ubleft = 0;
ASSERT(error); ASSERT(error);
break; break;
} }
if (*ubufp)
*ubufp += ubused; /* be careful not to leak error if at end of chunk */
ubleft -= ubused; if (fmterror == BULKSTAT_RV_NOTHING || error) {
ubelem++; error = 0;
lastino = ino; continue;
}
*ubufp += ubused;
acp->ac_ubleft -= ubused;
acp->ac_ubelem++;
} }
acp->ac_lastino = lastino; /*
acp->ac_ubleft = ubleft; * Post-update *last_agino. At this point, agino will always point one
acp->ac_ubelem = ubelem; * inode past the last inode we processed successfully. Hence we
* substract that inode when setting the *last_agino cursor so that we
* return the correct cookie to userspace. On the next bulkstat call,
* the inode under the lastino cookie will be skipped as we have already
* processed it here.
*/
*last_agino = agino - 1;
return error; return error;
} }
...@@ -353,45 +356,33 @@ xfs_bulkstat( ...@@ -353,45 +356,33 @@ xfs_bulkstat(
xfs_agino_t agino; /* inode # in allocation group */ xfs_agino_t agino; /* inode # in allocation group */
xfs_agnumber_t agno; /* allocation group number */ xfs_agnumber_t agno; /* allocation group number */
xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */
int end_of_ag; /* set if we've seen the ag end */
int error; /* error code */
int fmterror;/* bulkstat formatter result */
int i; /* loop index */
int icount; /* count of inodes good in irbuf */
size_t irbsize; /* size of irec buffer in bytes */ size_t irbsize; /* size of irec buffer in bytes */
xfs_ino_t ino; /* inode number (filesystem) */
xfs_inobt_rec_incore_t *irbp; /* current irec buffer pointer */
xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */
xfs_inobt_rec_incore_t *irbufend; /* end of good irec buffer entries */
xfs_ino_t lastino; /* last inode number returned */
int nirbuf; /* size of irbuf */ int nirbuf; /* size of irbuf */
int rval; /* return value error code */
int tmp; /* result value from btree calls */
int ubcount; /* size of user's buffer */ int ubcount; /* size of user's buffer */
int ubleft; /* bytes left in user's buffer */ struct xfs_bulkstat_agichunk ac;
char __user *ubufp; /* pointer into user's buffer */ int error = 0;
int ubelem; /* spaces used in user's buffer */
/* /*
* Get the last inode value, see if there's nothing to do. * Get the last inode value, see if there's nothing to do.
*/ */
ino = (xfs_ino_t)*lastinop; agno = XFS_INO_TO_AGNO(mp, *lastinop);
lastino = ino; agino = XFS_INO_TO_AGINO(mp, *lastinop);
agno = XFS_INO_TO_AGNO(mp, ino);
agino = XFS_INO_TO_AGINO(mp, ino);
if (agno >= mp->m_sb.sb_agcount || if (agno >= mp->m_sb.sb_agcount ||
ino != XFS_AGINO_TO_INO(mp, agno, agino)) { *lastinop != XFS_AGINO_TO_INO(mp, agno, agino)) {
*done = 1; *done = 1;
*ubcountp = 0; *ubcountp = 0;
return 0; return 0;
} }
ubcount = *ubcountp; /* statstruct's */ ubcount = *ubcountp; /* statstruct's */
ubleft = ubcount * statstruct_size; /* bytes */ ac.ac_ubuffer = &ubuffer;
*ubcountp = ubelem = 0; ac.ac_ubleft = ubcount * statstruct_size; /* bytes */;
ac.ac_ubelem = 0;
*ubcountp = 0;
*done = 0; *done = 0;
fmterror = 0;
ubufp = ubuffer;
irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4);
if (!irbuf) if (!irbuf)
return -ENOMEM; return -ENOMEM;
...@@ -402,9 +393,13 @@ xfs_bulkstat( ...@@ -402,9 +393,13 @@ xfs_bulkstat(
* Loop over the allocation groups, starting from the last * Loop over the allocation groups, starting from the last
* inode returned; 0 means start of the allocation group. * inode returned; 0 means start of the allocation group.
*/ */
rval = 0; while (agno < mp->m_sb.sb_agcount) {
while (XFS_BULKSTAT_UBLEFT(ubleft) && agno < mp->m_sb.sb_agcount) { struct xfs_inobt_rec_incore *irbp = irbuf;
cond_resched(); struct xfs_inobt_rec_incore *irbufend = irbuf + nirbuf;
bool end_of_ag = false;
int icount = 0;
int stat;
error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
if (error) if (error)
break; break;
...@@ -414,10 +409,6 @@ xfs_bulkstat( ...@@ -414,10 +409,6 @@ xfs_bulkstat(
*/ */
cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno,
XFS_BTNUM_INO); XFS_BTNUM_INO);
irbp = irbuf;
irbufend = irbuf + nirbuf;
end_of_ag = 0;
icount = 0;
if (agino > 0) { if (agino > 0) {
/* /*
* In the middle of an allocation group, we need to get * In the middle of an allocation group, we need to get
...@@ -427,22 +418,23 @@ xfs_bulkstat( ...@@ -427,22 +418,23 @@ xfs_bulkstat(
error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r); error = xfs_bulkstat_grab_ichunk(cur, agino, &icount, &r);
if (error) if (error)
break; goto del_cursor;
if (icount) { if (icount) {
irbp->ir_startino = r.ir_startino; irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount; irbp->ir_freecount = r.ir_freecount;
irbp->ir_free = r.ir_free; irbp->ir_free = r.ir_free;
irbp++; irbp++;
agino = r.ir_startino + XFS_INODES_PER_CHUNK;
} }
/* Increment to the next record */ /* Increment to the next record */
error = xfs_btree_increment(cur, 0, &tmp); error = xfs_btree_increment(cur, 0, &stat);
} else { } else {
/* Start of ag. Lookup the first inode chunk */ /* Start of ag. Lookup the first inode chunk */
error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &stat);
}
if (error || stat == 0) {
end_of_ag = true;
goto del_cursor;
} }
if (error)
break;
/* /*
* Loop through inode btree records in this ag, * Loop through inode btree records in this ag,
...@@ -451,10 +443,10 @@ xfs_bulkstat( ...@@ -451,10 +443,10 @@ xfs_bulkstat(
while (irbp < irbufend && icount < ubcount) { while (irbp < irbufend && icount < ubcount) {
struct xfs_inobt_rec_incore r; struct xfs_inobt_rec_incore r;
error = xfs_inobt_get_rec(cur, &r, &i); error = xfs_inobt_get_rec(cur, &r, &stat);
if (error || i == 0) { if (error || stat == 0) {
end_of_ag = 1; end_of_ag = true;
break; goto del_cursor;
} }
/* /*
...@@ -469,77 +461,79 @@ xfs_bulkstat( ...@@ -469,77 +461,79 @@ xfs_bulkstat(
irbp++; irbp++;
icount += XFS_INODES_PER_CHUNK - r.ir_freecount; icount += XFS_INODES_PER_CHUNK - r.ir_freecount;
} }
/* error = xfs_btree_increment(cur, 0, &stat);
* Set agino to after this chunk and bump the cursor. if (error || stat == 0) {
*/ end_of_ag = true;
agino = r.ir_startino + XFS_INODES_PER_CHUNK; goto del_cursor;
error = xfs_btree_increment(cur, 0, &tmp); }
cond_resched(); cond_resched();
} }
/* /*
* Drop the btree buffers and the agi buffer. * Drop the btree buffers and the agi buffer as we can't hold any
* We can't hold any of the locks these represent * of the locks these represent when calling iget. If there is a
* when calling iget. * pending error, then we are done.
*/ */
del_cursor:
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
xfs_buf_relse(agbp); xfs_buf_relse(agbp);
if (error)
break;
/* /*
* Now format all the good inodes into the user's buffer. * Now format all the good inodes into the user's buffer. The
* call to xfs_bulkstat_ag_ichunk() sets up the agino pointer
* for the next loop iteration.
*/ */
irbufend = irbp; irbufend = irbp;
for (irbp = irbuf; for (irbp = irbuf;
irbp < irbufend && XFS_BULKSTAT_UBLEFT(ubleft); irbp++) { irbp < irbufend && ac.ac_ubleft >= statstruct_size;
struct xfs_bulkstat_agichunk ac; irbp++) {
ac.ac_lastino = lastino;
ac.ac_ubuffer = &ubuffer;
ac.ac_ubleft = ubleft;
ac.ac_ubelem = ubelem;
error = xfs_bulkstat_ag_ichunk(mp, agno, irbp, error = xfs_bulkstat_ag_ichunk(mp, agno, irbp,
formatter, statstruct_size, &ac); formatter, statstruct_size, &ac,
&agino);
if (error) if (error)
rval = error; break;
lastino = ac.ac_lastino;
ubleft = ac.ac_ubleft;
ubelem = ac.ac_ubelem;
cond_resched(); cond_resched();
} }
/* /*
* Set up for the next loop iteration. * If we've run out of space or had a formatting error, we
* are now done
*/ */
if (XFS_BULKSTAT_UBLEFT(ubleft)) { if (ac.ac_ubleft < statstruct_size || error)
if (end_of_ag) {
agno++;
agino = 0;
} else
agino = XFS_INO_TO_AGINO(mp, lastino);
} else
break; break;
if (end_of_ag) {
agno++;
agino = 0;
}
} }
/* /*
* Done, we're either out of filesystem or space to put the data. * Done, we're either out of filesystem or space to put the data.
*/ */
kmem_free(irbuf); kmem_free(irbuf);
*ubcountp = ubelem; *ubcountp = ac.ac_ubelem;
/* /*
* Found some inodes, return them now and return the error next time. * We found some inodes, so clear the error status and return them.
* The lastino pointer will point directly at the inode that triggered
* any error that occurred, so on the next call the error will be
* triggered again and propagated to userspace as there will be no
* formatted inodes in the buffer.
*/ */
if (ubelem) if (ac.ac_ubelem)
rval = 0; error = 0;
if (agno >= mp->m_sb.sb_agcount) {
/* /*
* If we ran out of filesystem, mark lastino as off * If we ran out of filesystem, lastino will point off the end of
* the end of the filesystem, so the next call * the filesystem so the next call will return immediately.
* will return immediately. */
*/ *lastinop = XFS_AGINO_TO_INO(mp, agno, agino);
*lastinop = (xfs_ino_t)XFS_AGINO_TO_INO(mp, agno, 0); if (agno >= mp->m_sb.sb_agcount)
*done = 1; *done = 1;
} else
*lastinop = (xfs_ino_t)lastino;
return rval; return error;
} }
int int
......
...@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp, ...@@ -30,22 +30,6 @@ typedef int (*bulkstat_one_pf)(struct xfs_mount *mp,
int *ubused, int *ubused,
int *stat); int *stat);
struct xfs_bulkstat_agichunk {
xfs_ino_t ac_lastino; /* last inode returned */
char __user **ac_ubuffer;/* pointer into user's buffer */
int ac_ubleft; /* bytes left in user's buffer */
int ac_ubelem; /* spaces used in user's buffer */
};
int
xfs_bulkstat_ag_ichunk(
struct xfs_mount *mp,
xfs_agnumber_t agno,
struct xfs_inobt_rec_incore *irbp,
bulkstat_one_pf formatter,
size_t statstruct_size,
struct xfs_bulkstat_agichunk *acp);
/* /*
* Values for stat return value. * Values for stat return value.
*/ */
......
...@@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache); ...@@ -715,8 +715,9 @@ EXPORT_SYMBOL(truncate_pagecache);
* necessary) to @newsize. It will be typically be called from the filesystem's * necessary) to @newsize. It will be typically be called from the filesystem's
* setattr function when ATTR_SIZE is passed in. * setattr function when ATTR_SIZE is passed in.
* *
* Must be called with inode_mutex held and before all filesystem specific * Must be called with a lock serializing truncates and writes (generally
* block truncation has been performed. * i_mutex but e.g. xfs uses a different lock) and before all filesystem
* specific block truncation has been performed.
*/ */
void truncate_setsize(struct inode *inode, loff_t newsize) void truncate_setsize(struct inode *inode, loff_t newsize)
{ {
...@@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) ...@@ -755,7 +756,6 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
struct page *page; struct page *page;
pgoff_t index; pgoff_t index;
WARN_ON(!mutex_is_locked(&inode->i_mutex));
WARN_ON(to > inode->i_size); WARN_ON(to > inode->i_size);
if (from >= to || bsize == PAGE_CACHE_SIZE) if (from >= to || bsize == PAGE_CACHE_SIZE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment