Commit f5749432 authored by Linus Torvalds

Merge tag 'xfs-4.18-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull xfs fixes from Darrick Wong:
 "Here are some patches for 4.18 to fix regressions, accounting
  problems, overflow problems, and to strengthen metadata validation to
  prevent corruption.

  This series has been run through a full xfstests run over the weekend
  and through a quick xfstests run against this morning's master, with
  no major failures reported.

  Changes since last update:

   - more metadata validation strengthening to prevent crashes.

   - fix extent offset overflow problem when insert_range on a 512b
     block fs

   - fix some off-by-one errors in the realtime fsmap code

   - fix some math errors in the default resblks calculation when free
     space is low

   - fix a problem where stale page contents are exposed via mmap read
     after a zero_range at eof

   - fix accounting problems with per-ag reservations causing statfs
     reports to vary incorrectly"

* tag 'xfs-4.18-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux:
  xfs: fix fdblocks accounting w/ RMAPBT per-AG reservation
  xfs: ensure post-EOF zeroing happens after zeroing part of a file
  xfs: fix off-by-one error in xfs_rtalloc_query_range
  xfs: fix uninitialized field in rtbitmap fsmap backend
  xfs: recheck reflink state after grabbing ILOCK_SHARED for a write
  xfs: don't allow insert-range to shift extents past the maximum offset
  xfs: don't trip over negative free space in xfs_reserve_blocks
  xfs: allow empty transactions while frozen
  xfs: xfs_iflush_abort() can be called twice on cluster writeback failure
  xfs: More robust inode extent count validation
  xfs: simplify xfs_bmap_punch_delalloc_range
parents 0e49740c d8cb5e42
......@@ -157,6 +157,7 @@ __xfs_ag_resv_free(
error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
resv->ar_reserved = 0;
resv->ar_asked = 0;
resv->ar_orig_reserved = 0;
if (error)
trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
......@@ -189,13 +190,34 @@ __xfs_ag_resv_init(
struct xfs_mount *mp = pag->pag_mount;
struct xfs_ag_resv *resv;
int error;
xfs_extlen_t reserved;
xfs_extlen_t hidden_space;
if (used > ask)
ask = used;
reserved = ask - used;
error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true);
switch (type) {
case XFS_AG_RESV_RMAPBT:
/*
* Space taken by the rmapbt is not subtracted from fdblocks
* because the rmapbt lives in the free space. Here we must
* subtract the entire reservation from fdblocks so that we
* always have blocks available for rmapbt expansion.
*/
hidden_space = ask;
break;
case XFS_AG_RESV_METADATA:
/*
* Space taken by all other metadata btrees is accounted
* on-disk as used space. We therefore only hide the space
* that is reserved but not used by the trees.
*/
hidden_space = ask - used;
break;
default:
ASSERT(0);
return -EINVAL;
}
error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
if (error) {
trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
error, _RET_IP_);
......@@ -216,7 +238,8 @@ __xfs_ag_resv_init(
resv = xfs_perag_resv(pag, type);
resv->ar_asked = ask;
resv->ar_reserved = resv->ar_orig_reserved = reserved;
resv->ar_orig_reserved = hidden_space;
resv->ar_reserved = ask - used;
trace_xfs_ag_resv_init(pag, type, ask);
return 0;
......
......@@ -5780,6 +5780,32 @@ xfs_bmap_collapse_extents(
return error;
}
/*
 * Check that right-shifting extents by @shift cannot move the last data fork
 * extent beyond the largest start offset that BMBT_STARTOFF_MASK allows.
 *
 * ip:    inode whose data fork is examined; the caller must already hold
 *        XFS_IOLOCK_EXCL (asserted below)
 * off:   only a last extent starting at or after this offset is checked
 * shift: distance the extents would be shifted by
 *
 * Takes and drops XFS_ILOCK_EXCL around the lookup.  Returns 0 when the
 * shift is acceptable (or the fork has no extents), -EIO when
 * XFS_FORCED_SHUTDOWN() is true for the mount, -EINVAL when the masked,
 * shifted start offset wraps below the current one, or the error reported
 * by xfs_bmap_last_extent().
 */
int
xfs_bmap_can_insert_extents(
	struct xfs_inode	*ip,
	xfs_fileoff_t		off,
	xfs_fileoff_t		shift)
{
	struct xfs_bmbt_irec	last;
	int			empty;
	int			error;

	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	xfs_ilock(ip, XFS_ILOCK_EXCL);

	error = xfs_bmap_last_extent(NULL, ip, XFS_DATA_FORK, &last, &empty);
	if (error)
		goto out_unlock;

	/* Nothing to shift, or the last extent sits below the shift point. */
	if (empty || last.br_startoff < off)
		goto out_unlock;

	/*
	 * If adding the shift and truncating to the start offset field width
	 * yields a smaller value, the new offset does not fit.
	 */
	if (((last.br_startoff + shift) & BMBT_STARTOFF_MASK) <
	    last.br_startoff)
		error = -EINVAL;

out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
int
xfs_bmap_insert_extents(
struct xfs_trans *tp,
......
......@@ -227,6 +227,8 @@ int xfs_bmap_collapse_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fsblock_t *firstblock,
struct xfs_defer_ops *dfops);
int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
xfs_fileoff_t shift);
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock,
......
......@@ -962,6 +962,9 @@ typedef enum xfs_dinode_fmt {
XFS_DFORK_DSIZE(dip, mp) : \
XFS_DFORK_ASIZE(dip, mp))
/*
 * Number of whole struct xfs_bmbt_rec entries that fit in the size that
 * XFS_DFORK_SIZE() reports for fork w of inode dip.
 */
#define XFS_DFORK_MAXEXT(dip, mp, w) \
(XFS_DFORK_SIZE(dip, mp, w) / sizeof(struct xfs_bmbt_rec))
/*
* Return pointers to the data or attribute forks.
*/
......@@ -1526,6 +1529,8 @@ typedef struct xfs_bmdr_block {
#define BMBT_STARTBLOCK_BITLEN 52
#define BMBT_BLOCKCOUNT_BITLEN 21
/* Mask with the low BMBT_STARTOFF_BITLEN bits set. */
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
/*
 * Extent record held as two big-endian 64-bit words.
 * NOTE(review): presumably the BMBT_*_BITLEN constants describe how the
 * fields are packed into l0/l1 -- confirm against the pack/unpack helpers.
 */
typedef struct xfs_bmbt_rec {
__be64 l0, l1;
} xfs_bmbt_rec_t;
......
......@@ -374,6 +374,47 @@ xfs_log_dinode_to_disk(
}
}
/*
 * Validate the format and extent count of one fork of a struct xfs_dinode.
 *
 * dip:       inode being checked
 * mp:        mount, handed through to the fork size macros
 * whichfork: fork selector; XFS_DATA_FORK and XFS_ATTR_FORK are the values
 *            tested for explicitly
 *
 * Returns NULL when every check passes, otherwise __this_address taken at
 * the failing check.  Each failure keeps its own return statement so that
 * the reported address maps to exactly one check.
 */
static xfs_failaddr_t
xfs_dinode_verify_fork(
	struct xfs_dinode	*dip,
	struct xfs_mount	*mp,
	int			whichfork)
{
	uint32_t		nextents = XFS_DFORK_NEXTENTS(dip, whichfork);

	switch (XFS_DFORK_FORMAT(dip, whichfork)) {
	case XFS_DINODE_FMT_LOCAL:
		if (whichfork == XFS_DATA_FORK) {
			/* Regular files are never stored in local format. */
			if (S_ISREG(be16_to_cpu(dip->di_mode)))
				return __this_address;
			/* The recorded size must fit in the fork area. */
			if (be64_to_cpu(dip->di_size) >
			    XFS_DFORK_SIZE(dip, mp, whichfork))
				return __this_address;
		}
		/* A local format fork must not claim any extents. */
		if (nextents != 0)
			return __this_address;
		return NULL;

	case XFS_DINODE_FMT_EXTENTS:
		/* Cannot hold more records than fit in the fork area. */
		if (nextents > XFS_DFORK_MAXEXT(dip, mp, whichfork))
			return __this_address;
		return NULL;

	case XFS_DINODE_FMT_BTREE:
		/* The attr fork and the other forks have different limits. */
		if (whichfork == XFS_ATTR_FORK) {
			if (nextents > MAXAEXTNUM)
				return __this_address;
		} else {
			if (nextents > MAXEXTNUM)
				return __this_address;
		}
		return NULL;

	default:
		/* Any other fork format is rejected. */
		return __this_address;
	}
}
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
......@@ -441,24 +482,9 @@ xfs_dinode_verify(
case S_IFREG:
case S_IFLNK:
case S_IFDIR:
switch (dip->di_format) {
case XFS_DINODE_FMT_LOCAL:
/*
* no local regular files yet
*/
if (S_ISREG(mode))
return __this_address;
if (di_size > XFS_DFORK_DSIZE(dip, mp))
return __this_address;
if (dip->di_nextents)
return __this_address;
/* fall through */
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
break;
default:
return __this_address;
}
fa = xfs_dinode_verify_fork(dip, mp, XFS_DATA_FORK);
if (fa)
return fa;
break;
case 0:
/* Uninitialized inode ok. */
......@@ -468,17 +494,9 @@ xfs_dinode_verify(
}
if (XFS_DFORK_Q(dip)) {
switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
if (dip->di_anextents)
return __this_address;
/* fall through */
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
break;
default:
return __this_address;
}
fa = xfs_dinode_verify_fork(dip, mp, XFS_ATTR_FORK);
if (fa)
return fa;
} else {
/*
* If there is no fork offset, this may be a freshly-made inode
......
......@@ -1029,8 +1029,8 @@ xfs_rtalloc_query_range(
if (low_rec->ar_startext >= mp->m_sb.sb_rextents ||
low_rec->ar_startext == high_rec->ar_startext)
return 0;
if (high_rec->ar_startext >= mp->m_sb.sb_rextents)
high_rec->ar_startext = mp->m_sb.sb_rextents - 1;
if (high_rec->ar_startext > mp->m_sb.sb_rextents)
high_rec->ar_startext = mp->m_sb.sb_rextents;
/* Iterate the bitmap, looking for discrepancies. */
rtstart = low_rec->ar_startext;
......
......@@ -685,12 +685,10 @@ xfs_getbmap(
}
/*
* dead simple method of punching delayed allocation blocks from a range in
* the inode. Walks a block at a time so will be slow, but is only executed in
* rare error cases so the overhead is not critical. This will always punch out
* both the start and end blocks, even if the ranges only partially overlap
* them, so it is up to the caller to ensure that partial blocks are not
* passed in.
* Dead simple method of punching delayed allocation blocks from a range in
* the inode. This will always punch out both the start and end blocks, even
* if the ranges only partially overlap them, so it is up to the caller to
* ensure that partial blocks are not passed in.
*/
int
xfs_bmap_punch_delalloc_range(
......@@ -698,63 +696,44 @@ xfs_bmap_punch_delalloc_range(
xfs_fileoff_t start_fsb,
xfs_fileoff_t length)
{
xfs_fileoff_t remaining = length;
struct xfs_ifork *ifp = &ip->i_df;
xfs_fileoff_t end_fsb = start_fsb + length;
struct xfs_bmbt_irec got, del;
struct xfs_iext_cursor icur;
int error = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
do {
int done;
xfs_bmbt_irec_t imap;
int nimaps = 1;
xfs_fsblock_t firstblock;
struct xfs_defer_ops dfops;
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
if (error)
return error;
}
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
return 0;
while (got.br_startoff + got.br_blockcount > start_fsb) {
del = got;
xfs_trim_extent(&del, start_fsb, length);
/*
* Map the range first and check that it is a delalloc extent
* before trying to unmap the range. Otherwise we will be
* trying to remove a real extent (which requires a
* transaction) or a hole, which is probably a bad idea...
* A delete can push the cursor forward. Step back to the
* previous extent on non-delalloc or extents outside the
* target range.
*/
error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
XFS_BMAPI_ENTIRE);
if (error) {
/* something screwed, just bail */
if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
xfs_alert(ip->i_mount,
"Failed delalloc mapping lookup ino %lld fsb %lld.",
ip->i_ino, start_fsb);
}
if (!del.br_blockcount ||
!isnullstartblock(del.br_startblock)) {
if (!xfs_iext_prev_extent(ifp, &icur, &got))
break;
continue;
}
if (!nimaps) {
/* nothing there */
goto next_block;
}
if (imap.br_startblock != DELAYSTARTBLOCK) {
/* been converted, ignore */
goto next_block;
}
WARN_ON(imap.br_blockcount == 0);
/*
* Note: while we initialise the firstblock/dfops pair, they
* should never be used because blocks should never be
* allocated or freed for a delalloc extent and hence we don't need to
* cancel or finish them after the xfs_bunmapi() call.
*/
xfs_defer_init(&dfops, &firstblock);
error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
&dfops, &done);
if (error)
error = xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur,
&got, &del);
if (error || !xfs_iext_get_extent(ifp, &icur, &got))
break;
ASSERT(!xfs_defer_has_unfinished_work(&dfops));
next_block:
start_fsb++;
remaining--;
} while(remaining > 0);
}
return error;
}
......@@ -1208,7 +1187,22 @@ xfs_free_file_space(
return 0;
if (offset + len > XFS_ISIZE(ip))
len = XFS_ISIZE(ip) - offset;
return iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
error = iomap_zero_range(VFS_I(ip), offset, len, NULL, &xfs_iomap_ops);
if (error)
return error;
/*
* If we zeroed right up to EOF and EOF straddles a page boundary we
* must make sure that the post-EOF area is also zeroed because the
* page could be mmap'd and iomap_zero_range doesn't do that for us.
* Writeback of the eof page will do this, albeit clumsily.
*/
if (offset + len >= XFS_ISIZE(ip) && ((offset + len) & PAGE_MASK)) {
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
(offset + len) & ~PAGE_MASK, LLONG_MAX);
}
return error;
}
/*
......@@ -1404,6 +1398,10 @@ xfs_insert_file_space(
trace_xfs_insert_file_space(ip);
error = xfs_bmap_can_insert_extents(ip, stop_fsb, shift_fsb);
if (error)
return error;
error = xfs_prepare_shift(ip, offset);
if (error)
return error;
......
......@@ -513,8 +513,8 @@ xfs_getfsmap_rtdev_rtbitmap_query(
struct xfs_trans *tp,
struct xfs_getfsmap_info *info)
{
struct xfs_rtalloc_rec alow;
struct xfs_rtalloc_rec ahigh;
struct xfs_rtalloc_rec alow = { 0 };
struct xfs_rtalloc_rec ahigh = { 0 };
int error;
xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
......
......@@ -387,7 +387,7 @@ xfs_reserve_blocks(
do {
free = percpu_counter_sum(&mp->m_fdblocks) -
mp->m_alloc_set_aside;
if (!free)
if (free <= 0)
break;
delta = request - mp->m_resblks;
......
......@@ -3236,7 +3236,6 @@ xfs_iflush_cluster(
struct xfs_inode *cip;
int nr_found;
int clcount = 0;
int bufwasdelwri;
int i;
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
......@@ -3360,37 +3359,22 @@ xfs_iflush_cluster(
* inode buffer and shut down the filesystem.
*/
rcu_read_unlock();
/*
* Clean up the buffer. If it was delwri, just release it --
* brelse can handle it with no problems. If not, shut down the
* filesystem before releasing the buffer.
*/
bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
if (bufwasdelwri)
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
if (!bufwasdelwri) {
/*
* Just like incore_relse: if we have b_iodone functions,
* mark the buffer as an error and call them. Otherwise
* mark it as stale and brelse.
* We'll always have an inode attached to the buffer for completion
* processing by the time we are called from xfs_iflush(). Hence we
* always need to do IO completion processing to abort the inodes
* attached to the buffer. Handle them just like the shutdown case in
* xfs_buf_submit().
*/
if (bp->b_iodone) {
ASSERT(bp->b_iodone);
bp->b_flags &= ~XBF_DONE;
xfs_buf_stale(bp);
xfs_buf_ioerror(bp, -EIO);
xfs_buf_ioend(bp);
} else {
xfs_buf_stale(bp);
xfs_buf_relse(bp);
}
}
/*
* Unlocks the flush lock
*/
/* abort the corrupt inode, as it was not attached to the buffer */
xfs_iflush_abort(cip, false);
kmem_free(cilist);
xfs_perag_put(pag);
......@@ -3486,12 +3470,17 @@ xfs_iflush(
xfs_log_force(mp, 0);
/*
* inode clustering:
* see if other inodes can be gathered into this write
* inode clustering: try to gather other inodes into this write
*
* Note: Any error during clustering will result in the filesystem
* being shut down and completion callbacks run on the cluster buffer.
* As we have already flushed and attached this inode to the buffer,
* it has already been aborted and released by xfs_iflush_cluster() and
* so we have no further error handling to do here.
*/
error = xfs_iflush_cluster(ip, bp);
if (error)
goto cluster_corrupt_out;
return error;
*bpp = bp;
return 0;
......@@ -3500,12 +3489,8 @@ xfs_iflush(
if (bp)
xfs_buf_relse(bp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
cluster_corrupt_out:
error = -EFSCORRUPTED;
abort_out:
/*
* Unlocks the flush lock
*/
/* abort the corrupt inode, as it was not attached to the buffer */
xfs_iflush_abort(ip, false);
return error;
}
......
......@@ -963,12 +963,13 @@ xfs_ilock_for_iomap(
unsigned *lockmode)
{
unsigned mode = XFS_ILOCK_SHARED;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/*
* COW writes may allocate delalloc space or convert unwritten COW
* extents, so we need to make sure to take the lock exclusively here.
*/
if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) {
if (xfs_is_reflink_inode(ip) && is_write) {
/*
* FIXME: It could still overwrite on unshared extents and not
* need allocation.
......@@ -989,6 +990,7 @@ xfs_ilock_for_iomap(
mode = XFS_ILOCK_EXCL;
}
relock:
if (flags & IOMAP_NOWAIT) {
if (!xfs_ilock_nowait(ip, mode))
return -EAGAIN;
......@@ -996,6 +998,17 @@ xfs_ilock_for_iomap(
xfs_ilock(ip, mode);
}
/*
* The reflink iflag could have changed since the earlier unlocked
* check, so if we got ILOCK_SHARED for a write but we're now a
* reflink inode we have to switch to ILOCK_EXCL and relock.
*/
if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
xfs_iunlock(ip, mode);
mode = XFS_ILOCK_EXCL;
goto relock;
}
*lockmode = mode;
return 0;
}
......
......@@ -258,7 +258,12 @@ xfs_trans_alloc(
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
WARN_ON(mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
* they're allowed to run while we're frozen.
*/
WARN_ON(resp->tr_logres > 0 &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);
tp = kmem_zone_zalloc(xfs_trans_zone,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment