Commit 264e89ad authored by Dave Chinner

Merge branch 'xfs-dax-updates' into for-next

parents 2da5c4b0 13ad4fe3
@@ -29,6 +29,11 @@
 #include <linux/uio.h>
 #include <linux/vmstat.h>
 
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence this means the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
 int dax_clear_blocks(struct inode *inode, sector_t block, long size)
 {
         struct block_device *bdev = inode->i_sb->s_bdev;
...
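A note on the GFP_NOFS comment above: XFS calls dax_clear_blocks() with a transaction held, so any memory allocation on this call chain must not recurse into the filesystem through direct reclaim. A minimal sketch of what honouring that contract looks like, using a hypothetical helper that is not part of this commit:

    #include <linux/slab.h>

    /* Hypothetical helper under dax_clear_blocks(): any allocation on
     * this path must use GFP_NOFS rather than GFP_KERNEL, so direct
     * reclaim cannot re-enter XFS while the caller's transaction is
     * still open. */
    static void *dax_zero_scratch(size_t len)
    {
            return kzalloc(len, GFP_NOFS);
    }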
@@ -2509,7 +2509,7 @@ xfs_alloc_vextent(
                 * Try near allocation first, then anywhere-in-ag after
                 * the first a.g. fails.
                 */
-               if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+               if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
                    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
                        args->fsbno = XFS_AGB_TO_FSB(mp,
                                ((mp->m_agfrotor / rotorstep) %
@@ -2640,6 +2640,14 @@ xfs_alloc_vextent(
                XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
                        args->len);
 #endif
+
+               /* Zero the extent if we were asked to do so */
+               if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+                       error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+                       if (error)
+                               goto error0;
+               }
+
        }
        xfs_perag_put(args->pag);
        return 0;
...
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
        struct xfs_mount *mp;           /* file system mount point */
        struct xfs_buf  *agbp;          /* buffer for a.g. freelist header */
        struct xfs_perag *pag;          /* per-ag struct for this agno */
+       struct xfs_inode *ip;           /* for userdata zeroing method */
        xfs_fsblock_t   fsbno;          /* file system block number */
        xfs_agnumber_t  agno;           /* allocation group number */
        xfs_agblock_t   agbno;          /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
        char            wasdel;         /* set if allocation was prev delayed */
        char            wasfromfl;      /* set if allocation is from freelist */
        char            isfl;           /* set if is freelist blocks - !acctg */
-       char            userdata;       /* set if this is user data */
+       char            userdata;       /* mask defining userdata treatment */
        xfs_fsblock_t   firstblock;     /* io first block allocated */
 } xfs_alloc_arg_t;
 
 /*
  * Defines for userdata
  */
-#define XFS_ALLOC_USERDATA             1       /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA    2       /* special case start of file */
+#define XFS_ALLOC_USERDATA             (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA    (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO        (1 << 2)/* zero extent on allocation */
 
 xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
                struct xfs_perag *pag, xfs_extlen_t need);
...
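Turning these defines into distinct bits is what forces the `==` to `&` change in xfs_alloc_vextent() above: once XFS_ALLOC_USERDATA_ZERO can be OR'd into args->userdata, an equality test against XFS_ALLOC_INITIAL_USER_DATA no longer matches. A standalone userspace sketch of the difference, with the flag values copied from this header:

    #include <stdbool.h>
    #include <stdio.h>

    #define XFS_ALLOC_USERDATA             (1 << 0)
    #define XFS_ALLOC_INITIAL_USER_DATA    (1 << 1)
    #define XFS_ALLOC_USERDATA_ZERO        (1 << 2)

    int main(void)
    {
            /* initial user data that also wants zeroing, as set up in
             * xfs_bmapi_allocate() later in this commit */
            char userdata = XFS_ALLOC_INITIAL_USER_DATA |
                            XFS_ALLOC_USERDATA_ZERO;

            /* old test: false as soon as a second flag is present */
            bool old_test = (userdata == XFS_ALLOC_INITIAL_USER_DATA);
            /* new test: still true */
            bool new_test = (userdata & XFS_ALLOC_INITIAL_USER_DATA) != 0;

            printf("==: %d  &: %d\n", old_test, new_test);
            return 0;
    }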
@@ -3802,8 +3802,13 @@ xfs_bmap_btalloc(
        args.wasdel = ap->wasdel;
        args.isfl = 0;
        args.userdata = ap->userdata;
-       if ((error = xfs_alloc_vextent(&args)))
+       if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+               args.ip = ap->ip;
+
+       error = xfs_alloc_vextent(&args);
+       if (error)
                return error;
+
        if (tryagain && args.fsbno == NULLFSBLOCK) {
                /*
                 * Exact allocation failed. Now try with alignment
@@ -4302,11 +4307,14 @@ xfs_bmapi_allocate(
        /*
         * Indicate if this is the first user data in the file, or just any
-        * user data.
+        * user data. And if it is userdata, indicate whether it needs to
+        * be initialised to zero during allocation.
         */
        if (!(bma->flags & XFS_BMAPI_METADATA)) {
                bma->userdata = (bma->offset == 0) ?
                        XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+               if (bma->flags & XFS_BMAPI_ZERO)
+                       bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
        }
 
        bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4421,6 +4429,17 @@ xfs_bmapi_convert_unwritten(
        mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
                                ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
 
+       /*
+        * Before insertion into the bmbt, zero the range being converted
+        * if required.
+        */
+       if (flags & XFS_BMAPI_ZERO) {
+               error = xfs_zero_extent(bma->ip, mval->br_startblock,
+                                       mval->br_blockcount);
+               if (error)
+                       return error;
+       }
+
        error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
                        &bma->cur, mval, bma->firstblock, bma->flist,
                        &tmp_logflags);
@@ -4514,6 +4533,18 @@ xfs_bmapi_write(
        ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 
+       /* zeroing is currently only for data extents, not metadata */
+       ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+                       (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+       /*
+        * We can allocate unwritten extents or pre-zero allocated blocks,
+        * but it makes no sense to do both at once. This would result in
+        * zeroing the unwritten extent twice while still leaving it an
+        * unwritten extent.
+        */
+       ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+                       (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
        if (unlikely(XFS_TEST_ERROR(
                (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
                 XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
...
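Read together with the XFS_BMAPI_ZERO definition in the next file, the two assertions added to xfs_bmapi_write() pin down which flag combinations are meaningful:

    XFS_BMAPI_ZERO alone                  valid - zero newly allocated data extents
    XFS_BMAPI_ZERO | XFS_BMAPI_CONVERT    valid - zero unwritten extents and convert them to written
    XFS_BMAPI_ZERO | XFS_BMAPI_METADATA   asserts - zeroing is only for data extents
    XFS_BMAPI_ZERO | XFS_BMAPI_PREALLOC   asserts - the extent would stay unwritten, so the zeroing would be wasted work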
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
        xfs_extlen_t            minleft; /* amount must be left after alloc */
        bool                    eof;    /* set if allocating past last extent */
        bool                    wasdel; /* replacing a delayed allocation */
-       bool                    userdata;/* set if is user data */
        bool                    aeof;   /* allocated space at eof */
        bool                    conv;   /* overwriting unwritten extents */
+       char                    userdata;/* userdata mask */
        int                     flags;
 };
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
  */
 #define XFS_BMAPI_CONVERT      0x040
 
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * within the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO         0x080
+
 #define XFS_BMAPI_FLAGS \
        { XFS_BMAPI_ENTIRE,     "ENTIRE" }, \
        { XFS_BMAPI_METADATA,   "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
        { XFS_BMAPI_PREALLOC,   "PREALLOC" }, \
        { XFS_BMAPI_IGSTATE,    "IGSTATE" }, \
        { XFS_BMAPI_CONTIG,     "CONTIG" }, \
-       { XFS_BMAPI_CONVERT,    "CONVERT" }
+       { XFS_BMAPI_CONVERT,    "CONVERT" }, \
+       { XFS_BMAPI_ZERO,       "ZERO" }
 
 static inline int xfs_bmapi_aflag(int w)
...
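For a concrete caller of the new flag, the xfs_iomap_write_direct() hunk later in this commit selects it for DAX inodes; condensed from that hunk:

    int bmapi_flags = XFS_BMAPI_PREALLOC;

    /* DAX: zero at allocation time and convert unwritten extents to
     * written, so no unwritten-extent completion callback is needed;
     * dip into the reserve pool if the conversion needs space */
    if (IS_DAX(VFS_I(ip))) {
            bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
            tp->t_flags |= XFS_TRANS_RESERVE;
    }
    error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
                            bmapi_flags, &firstfsb, resblks, imap,
                            &nimaps, &free_list);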
@@ -1259,13 +1259,28 @@ xfs_vm_releasepage(
  * the DIO. There is only going to be one reference to the ioend and its life
  * cycle is constrained by the DIO completion code. hence we don't need
  * reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
  */
 static void
 xfs_map_direct(
        struct inode            *inode,
        struct buffer_head      *bh_result,
        struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset)
+       xfs_off_t               offset,
+       bool                    dax_fault)
 {
        struct xfs_ioend        *ioend;
        xfs_off_t               size = bh_result->b_size;
@@ -1278,6 +1293,13 @@ xfs_map_direct(
        trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
 
+       if (dax_fault) {
+               ASSERT(type == XFS_IO_OVERWRITE);
+               trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+                                           imap);
+               return;
+       }
+
        if (bh_result->b_private) {
                ioend = bh_result->b_private;
                ASSERT(ioend->io_size > 0);
@@ -1292,7 +1314,8 @@ xfs_map_direct(
                                      ioend->io_size, ioend->io_type,
                                      imap);
        } else if (type == XFS_IO_UNWRITTEN ||
-                  offset + size > i_size_read(inode)) {
+                  offset + size > i_size_read(inode) ||
+                  offset + size < 0) {
                ioend = xfs_alloc_ioend(inode, type);
                ioend->io_offset = offset;
                ioend->io_size = size;
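The new `offset + size < 0` test above is the overflow case described in the comment on xfs_map_direct(): an IO ending at the highest supported file offset wraps the signed 64 bit sum negative, and the wrap is taken to mean "possibly size-extending". A standalone sketch with illustrative values (the kernel builds with wrapping signed arithmetic; the cast below keeps the userspace version well defined):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* illustrative: an IO whose end passes 2^63 - 1 bytes */
            int64_t offset = INT64_MAX - 511;
            int64_t size = 4096;

            /* mirrors the check in xfs_map_direct() */
            if ((int64_t)((uint64_t)offset + (uint64_t)size) < 0)
                    printf("overflow: assume a size-extending IO\n");
            return 0;
    }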
@@ -1354,7 +1377,8 @@ __xfs_get_blocks(
        sector_t                iblock,
        struct buffer_head      *bh_result,
        int                     create,
-       bool                    direct)
+       bool                    direct,
+       bool                    dax_fault)
 {
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
@@ -1402,10 +1426,12 @@ __xfs_get_blocks(
        if (error)
                goto out_unlock;
 
+       /* for DAX, we convert unwritten extents directly */
        if (create &&
            (!nimaps ||
             (imap.br_startblock == HOLESTARTBLOCK ||
-             imap.br_startblock == DELAYSTARTBLOCK))) {
+             imap.br_startblock == DELAYSTARTBLOCK) ||
+            (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
                if (direct || xfs_get_extsz_hint(ip)) {
                        /*
                         * xfs_iomap_write_direct() expects the shared lock. It
@@ -1450,6 +1476,12 @@ __xfs_get_blocks(
                goto out_unlock;
        }
 
+       if (IS_DAX(inode) && create) {
+               ASSERT(!ISUNWRITTEN(&imap));
+               /* zeroing is not needed at a higher layer */
+               new = 0;
+       }
+
        /* trim mapping down to size requested */
        if (direct || size > (1 << inode->i_blkbits))
                xfs_map_trim_size(inode, iblock, bh_result,
@@ -1467,7 +1499,8 @@ __xfs_get_blocks(
                        set_buffer_unwritten(bh_result);
                /* direct IO needs special help */
                if (create && direct)
-                       xfs_map_direct(inode, bh_result, &imap, offset);
+                       xfs_map_direct(inode, bh_result, &imap, offset,
+                                      dax_fault);
        }
 
        /*
@@ -1514,7 +1547,7 @@ xfs_get_blocks(
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
 }
 
 int
@@ -1524,7 +1557,17 @@ xfs_get_blocks_direct(
        struct buffer_head      *bh_result,
        int                     create)
 {
-       return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       int                     create)
+{
+       return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
 }
 
 static void
@@ -1623,45 +1666,6 @@ xfs_end_io_direct_write(
        __xfs_end_io_direct_write(inode, ioend, offset, size);
 }
 
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
-       struct buffer_head      *bh,
-       int                     uptodate)
-{
-       struct xfs_ioend        *ioend = bh->b_private;
-       struct inode            *inode = ioend->io_inode;
-       ssize_t                 size = ioend->io_size;
-
-       ASSERT(IS_DAX(ioend->io_inode));
-
-       /* if there was an error zeroing, then don't convert it */
-       if (!uptodate)
-               ioend->io_error = -EIO;
-
-       /*
-        * Trim update to EOF, so we don't extend EOF during unwritten extent
-        * conversion of partial EOF blocks.
-        */
-       spin_lock(&XFS_I(inode)->i_flags_lock);
-       if (ioend->io_offset + size > i_size_read(inode))
-               size = i_size_read(inode) - ioend->io_offset;
-       spin_unlock(&XFS_I(inode)->i_flags_lock);
-
-       __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
 static inline ssize_t
 xfs_vm_do_dio(
        struct inode            *inode,
...
@@ -58,7 +58,8 @@
 int    xfs_get_blocks(struct inode *inode, sector_t offset,
                       struct buffer_head *map_bh, int create);
 int    xfs_get_blocks_direct(struct inode *inode, sector_t offset,
                              struct buffer_head *map_bh, int create);
-void   xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int    xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+                                struct buffer_head *map_bh, int create);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
...
@@ -56,6 +56,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
                XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
 }
 
+/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+       struct xfs_inode *ip,
+       xfs_fsblock_t   start_fsb,
+       xfs_off_t       count_fsb)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       xfs_daddr_t     sector = xfs_fsb_to_db(ip, start_fsb);
+       sector_t        block = XFS_BB_TO_FSBT(mp, sector);
+       ssize_t         size = XFS_FSB_TO_B(mp, count_fsb);
+
+       if (IS_DAX(VFS_I(ip)))
+               return dax_clear_blocks(VFS_I(ip), block, size);
+
+       /*
+        * let the block layer decide on the fastest method of
+        * implementing the zeroing.
+        */
+       return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+}
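The unit conversions in xfs_zero_extent() are easy to misread. Assuming 4096-byte filesystem blocks over 512-byte basic blocks (an illustration, not values from this commit), they line up as:

    start_fsb = 100 fs blocks (sparse xfs encoding, resolved by xfs_fsb_to_db())
    sector    = 100 * (4096 / 512) = 800 daddrs (512-byte basic blocks)
    block     = 800 / 8 = 100, back in linear fs-block units for the VFS helpers
    size      = count_fsb * 4096 bytes for dax_clear_blocks(), while
                sb_issue_zeroout() takes count_fsb in fs blocks directly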
 /*
  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
  * caller. Frees all the extents that need freeing, which must be done
@@ -229,6 +258,13 @@ xfs_bmap_rtalloc(
                xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
                        ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
                                        XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+               /* Zero the extent if we were asked to do so */
+               if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+                       error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+                       if (error)
+                               return error;
+               }
        } else {
                ap->length = 0;
        }
...
@@ -1493,7 +1493,7 @@ xfs_file_llseek(
  *
  * mmap_sem (MM)
  *   sb_start_pagefault(vfs, freeze)
- *     i_mmap_lock (XFS - truncate serialisation)
+ *     i_mmaplock (XFS - truncate serialisation)
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
@@ -1519,8 +1519,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else {
                ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
                ret = block_page_mkwrite_return(ret);
@@ -1554,7 +1553,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1562,6 +1561,13 @@ xfs_filemap_fault(
        return ret;
 }
 
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is
+ * no ->pmd_mkwrite callout for huge pages, so we have a single function here
+ * to handle both. @flags carries the information on the type of fault
+ * occurring.
+ */
 STATIC int
 xfs_filemap_pmd_fault(
        struct vm_area_struct   *vma,
@@ -1578,15 +1584,54 @@ xfs_filemap_pmd_fault(
        trace_xfs_filemap_pmd_fault(ip);
 
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-                             xfs_end_io_dax_write);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
+
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
 
        return ret;
 }
 
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+
+       trace_xfs_filemap_pfn_mkwrite(ip);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+
+       /* check if the faulting page hasn't raced with truncate */
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+       return ret;
+}
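For comparison, the generic dax_pfn_mkwrite() being open-coded here amounted, in this era, to roughly the following (a sketch, not part of this diff); the XFS version above adds the XFS_MMAPLOCK_SHARED cycle and the truncate race check on top:

    int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct super_block *sb = file_inode(vma->vm_file)->i_sb;

            sb_start_pagefault(sb);
            file_update_time(vma->vm_file);
            sb_end_pagefault(sb);
            return VM_FAULT_NOPAGE;
    }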
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1594,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
        .pmd_fault      = xfs_filemap_pmd_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
 };
 
 STATIC int
...
@@ -132,6 +132,7 @@ xfs_iomap_write_direct(
        int             committed;
        int             error;
        int             lockmode;
+       int             bmapi_flags = XFS_BMAPI_PREALLOC;
 
        rt = XFS_IS_REALTIME_INODE(ip);
        extsz = xfs_get_extsz_hint(ip);
@@ -195,6 +196,23 @@
         * Allocate and setup the transaction
         */
        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+       /*
+        * For DAX, we do not allocate unwritten extents, but instead we zero
+        * the block before we commit the transaction. Ideally we'd like to do
+        * this outside the transaction context, but if we commit and then crash
+        * we may not have zeroed the blocks and this will be exposed on
+        * recovery of the allocation. Hence we must zero before commit.
+        * Further, if we are mapping unwritten extents here, we need to zero
+        * and convert them to written so that we don't need an unwritten extent
+        * callback for DAX. This also means that we need to be able to dip into
+        * the reserve block pool if there is no space left but we need to do
+        * unwritten extent conversion.
+        */
+       if (IS_DAX(VFS_I(ip))) {
+               bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+               tp->t_flags |= XFS_TRANS_RESERVE;
+       }
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
                                  resblks, resrtextents);
        /*
@@ -221,7 +239,7 @@ xfs_iomap_write_direct(
        xfs_bmap_init(&free_list, &firstfsb);
        nimaps = 1;
        error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
-                               XFS_BMAPI_PREALLOC, &firstfsb, resblks, imap,
+                               bmapi_flags, &firstfsb, resblks, imap,
                                &nimaps, &free_list);
        if (error)
                goto out_bmap_cancel;
@@ -232,6 +250,7 @@ xfs_iomap_write_direct(
        error = xfs_bmap_finish(&tp, &free_list, &committed);
        if (error)
                goto out_bmap_cancel;
+
        error = xfs_trans_commit(tp);
        if (error)
                goto out_unlock;
...
@@ -338,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
 extern void    xfs_set_low_space_thresholds(struct xfs_mount *);
 
+int    xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+               xfs_off_t count_fsb);
+
 #endif /* __XFS_MOUNT_H__ */
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 DEFINE_INODE_EVENT(xfs_filemap_fault);
 DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
 DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
...