Commit f21bdbba authored by Theodore Ts'o's avatar Theodore Ts'o

Merge branch 'iomap-for-next' into mb/dio

parents 0d0a60c9 a9010042
......@@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
struct iomap *iomap, struct iomap *srcmap)
{
struct block_device *bdev = iomap->bdev;
struct dax_device *dax_dev = iomap->dax_dev;
......@@ -1247,7 +1247,8 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
struct iomap iomap = { 0 };
struct iomap iomap = { .type = IOMAP_HOLE };
struct iomap srcmap = { .type = IOMAP_HOLE };
unsigned flags = IOMAP_FAULT;
int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
......@@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
* the file system block size to be equal the page size, which means
* that we never have to deal with more than a single extent here.
*/
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap);
if (iomap_errp)
*iomap_errp = error;
if (error) {
......@@ -1471,7 +1472,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host;
vm_fault_t result = VM_FAULT_FALLBACK;
struct iomap iomap = { 0 };
struct iomap iomap = { .type = IOMAP_HOLE };
struct iomap srcmap = { .type = IOMAP_HOLE };
pgoff_t max_pgoff;
void *entry;
loff_t pos;
......@@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
* to look up our filesystem block.
*/
pos = (loff_t)xas.xa_index << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap,
&srcmap);
if (error)
goto unlock_entry;
......
......@@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock,
#ifdef CONFIG_FS_DAX
static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
unsigned int blkbits = inode->i_blkbits;
unsigned long first_block = offset >> blkbits;
......
......@@ -3449,7 +3449,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
}
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap)
unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int blkbits = inode->i_blkbits;
......
......@@ -1149,7 +1149,8 @@ static inline bool gfs2_iomap_need_write_lock(unsigned flags)
}
static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap)
unsigned flags, struct iomap *iomap,
struct iomap *srcmap)
{
struct gfs2_inode *ip = GFS2_I(inode);
struct metapath mp = { .mp_aheight = 1, };
......
......@@ -732,7 +732,8 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto out_uninit;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL);
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
is_sync_kiocb(iocb));
gfs2_glock_dq(&gh);
out_uninit:
......@@ -767,7 +768,8 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (offset + len > i_size_read(&ip->i_inode))
goto out;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL);
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
is_sync_kiocb(iocb));
out:
gfs2_glock_dq(&gh);
......
......@@ -3,13 +3,15 @@
# Copyright (c) 2019 Oracle.
# All Rights Reserved.
#
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += \
apply.o \
buffered-io.o \
direct-io.o \
fiemap.o \
seek.o
ccflags-y += -I $(srctree)/$(src) # needed for trace events
obj-$(CONFIG_FS_IOMAP) += iomap.o
iomap-y += trace.o \
apply.o \
buffered-io.o \
direct-io.o \
fiemap.o \
seek.o
iomap-$(CONFIG_SWAP) += swapfile.o
......@@ -23,8 +23,10 @@ loff_t
iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
const struct iomap_ops *ops, void *data, iomap_actor_t actor)
{
struct iomap iomap = { 0 };
struct iomap iomap = { .type = IOMAP_HOLE };
struct iomap srcmap = { .type = IOMAP_HOLE };
loff_t written = 0, ret;
u64 end;
/*
* Need to map a range from start position for length bytes. This can
......@@ -38,7 +40,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* expose transient stale data. If the reserve fails, we can safely
* back out at this point as there is nothing to undo.
*/
ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
if (ret)
return ret;
if (WARN_ON(iomap.offset > pos))
......@@ -50,15 +52,26 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
* Cut down the length to the one actually provided by the filesystem,
* as it might not be able to give us the whole size that we requested.
*/
if (iomap.offset + iomap.length < pos + length)
length = iomap.offset + iomap.length - pos;
end = iomap.offset + iomap.length;
if (srcmap.type != IOMAP_HOLE)
end = min(end, srcmap.offset + srcmap.length);
if (pos + length > end)
length = end - pos;
/*
* Now that we have guaranteed that the space allocation will succeed.
* Now that we have guaranteed that the space allocation will succeed,
* we can do the copy-in page by page without having to worry about
* failures exposing transient data.
*
* To support COW operations, we read in data for partially blocks from
* the srcmap if the file system filled it in. In that case we the
* length needs to be limited to the earlier of the ends of the iomaps.
* If the file system did not provide a srcmap we pass in the normal
* iomap into the actors so that they don't need to have special
* handling for the two cases.
*/
written = actor(inode, pos, length, data, &iomap);
written = actor(inode, pos, length, data, &iomap,
srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
/*
* Now the data has been copied, commit the range we've copied. This
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
* Copyright (c) 2016-2018 Christoph Hellwig.
* Copyright (C) 2016-2019 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
......@@ -12,13 +12,34 @@
#include <linux/buffer_head.h>
#include <linux/dax.h>
#include <linux/writeback.h>
#include <linux/list_sort.h>
#include <linux/swap.h>
#include <linux/bio.h>
#include <linux/sched/signal.h>
#include <linux/migrate.h>
#include "trace.h"
#include "../internal.h"
/*
* Structure allocated for each page when block size < PAGE_SIZE to track
* sub-page uptodate status and I/O completions.
*/
struct iomap_page {
atomic_t read_count;
atomic_t write_count;
DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
};
static inline struct iomap_page *to_iomap_page(struct page *page)
{
if (page_has_private(page))
return (struct iomap_page *)page_private(page);
return NULL;
}
static struct bio_set iomap_ioend_bioset;
static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
......@@ -203,9 +224,17 @@ iomap_read_inline_data(struct inode *inode, struct page *page,
SetPageUptodate(page);
}
static inline bool iomap_block_needs_zeroing(struct inode *inode,
struct iomap *iomap, loff_t pos)
{
return iomap->type != IOMAP_MAPPED ||
(iomap->flags & IOMAP_F_NEW) ||
pos >= i_size_read(inode);
}
static loff_t
iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
struct page *page = ctx->cur_page;
......@@ -226,7 +255,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (plen == 0)
goto done;
if (iomap->type != IOMAP_MAPPED || pos >= i_size_read(inode)) {
if (iomap_block_needs_zeroing(inode, iomap, pos)) {
zero_user(page, poff, plen);
iomap_set_range_uptodate(page, poff, plen);
goto done;
......@@ -293,6 +322,8 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops)
unsigned poff;
loff_t ret;
trace_iomap_readpage(page->mapping->host, 1);
for (poff = 0; poff < PAGE_SIZE; poff += ret) {
ret = iomap_apply(inode, page_offset(page) + poff,
PAGE_SIZE - poff, 0, ops, &ctx,
......@@ -351,7 +382,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos,
static loff_t
iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_readpage_ctx *ctx = data;
loff_t done, ret;
......@@ -371,7 +402,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length,
ctx->cur_page_in_bio = false;
}
ret = iomap_readpage_actor(inode, pos + done, length - done,
ctx, iomap);
ctx, iomap, srcmap);
}
return done;
......@@ -389,6 +420,8 @@ iomap_readpages(struct address_space *mapping, struct list_head *pages,
loff_t last = page_offset(list_entry(pages->next, struct page, lru));
loff_t length = last - pos + PAGE_SIZE, ret = 0;
trace_iomap_readpages(mapping->host, nr_pages);
while (length > 0) {
ret = iomap_apply(mapping->host, pos, length, 0, ops,
&ctx, iomap_readpages_actor);
......@@ -455,6 +488,8 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
int
iomap_releasepage(struct page *page, gfp_t gfp_mask)
{
trace_iomap_releasepage(page->mapping->host, page, 0, 0);
/*
* mm accommodates an old ext3 case where clean pages might not have had
* the dirty bit cleared. Thus, it can send actual dirty pages to
......@@ -470,6 +505,8 @@ EXPORT_SYMBOL_GPL(iomap_releasepage);
void
iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len)
{
trace_iomap_invalidatepage(page->mapping->host, page, offset, len);
/*
* If we are invalidating the entire page, clear the dirty state from it
* and release it to avoid unnecessary buildup of the LRU.
......@@ -511,6 +548,10 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage,
EXPORT_SYMBOL_GPL(iomap_migrate_page);
#endif /* CONFIG_MIGRATION */
enum {
IOMAP_WRITE_F_UNSHARE = (1 << 0),
};
static void
iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
{
......@@ -525,19 +566,12 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
}
static int
iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
unsigned poff, unsigned plen, unsigned from, unsigned to,
struct iomap *iomap)
iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff,
unsigned plen, struct iomap *iomap)
{
struct bio_vec bvec;
struct bio bio;
if (iomap->type != IOMAP_MAPPED || block_start >= i_size_read(inode)) {
zero_user_segments(page, poff, from, to, poff + plen);
iomap_set_range_uptodate(page, poff, plen);
return 0;
}
bio_init(&bio, &bvec, 1);
bio.bi_opf = REQ_OP_READ;
bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
......@@ -547,15 +581,15 @@ iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page *page,
}
static int
__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
struct page *page, struct iomap *iomap)
__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags,
struct page *page, struct iomap *srcmap)
{
struct iomap_page *iop = iomap_page_create(inode, page);
loff_t block_size = i_blocksize(inode);
loff_t block_start = pos & ~(block_size - 1);
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
unsigned from = offset_in_page(pos), to = from + len, poff, plen;
int status = 0;
int status;
if (PageUptodate(page))
return 0;
......@@ -566,29 +600,39 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
if (plen == 0)
break;
if ((from > poff && from < poff + plen) ||
(to > poff && to < poff + plen)) {
status = iomap_read_page_sync(inode, block_start, page,
poff, plen, from, to, iomap);
if (status)
break;
if (!(flags & IOMAP_WRITE_F_UNSHARE) &&
(from <= poff || from >= poff + plen) &&
(to <= poff || to >= poff + plen))
continue;
if (iomap_block_needs_zeroing(inode, srcmap, block_start)) {
if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE))
return -EIO;
zero_user_segments(page, poff, from, to, poff + plen);
iomap_set_range_uptodate(page, poff, plen);
continue;
}
status = iomap_read_page_sync(block_start, page, poff, plen,
srcmap);
if (status)
return status;
} while ((block_start += plen) < block_end);
return status;
return 0;
}
static int
iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
struct page **pagep, struct iomap *iomap)
struct page **pagep, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
pgoff_t index = pos >> PAGE_SHIFT;
struct page *page;
int status = 0;
BUG_ON(pos + len > iomap->offset + iomap->length);
if (srcmap != iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length);
if (fatal_signal_pending(current))
return -EINTR;
......@@ -599,18 +643,20 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
return status;
}
page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT,
AOP_FLAG_NOFS);
if (!page) {
status = -ENOMEM;
goto out_no_page;
}
if (iomap->type == IOMAP_INLINE)
iomap_read_inline_data(inode, page, iomap);
if (srcmap->type == IOMAP_INLINE)
iomap_read_inline_data(inode, page, srcmap);
else if (iomap->flags & IOMAP_F_BUFFER_HEAD)
status = __block_write_begin_int(page, pos, len, NULL, iomap);
status = __block_write_begin_int(page, pos, len, NULL, srcmap);
else
status = __iomap_write_begin(inode, pos, len, page, iomap);
status = __iomap_write_begin(inode, pos, len, flags, page,
srcmap);
if (unlikely(status))
goto out_unlock;
......@@ -656,7 +702,7 @@ EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
static int
__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page, struct iomap *iomap)
unsigned copied, struct page *page)
{
flush_dcache_page(page);
......@@ -696,20 +742,20 @@ iomap_write_end_inline(struct inode *inode, struct page *page,
}
static int
iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page, struct iomap *iomap)
iomap_write_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied,
struct page *page, struct iomap *iomap, struct iomap *srcmap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
loff_t old_size = inode->i_size;
int ret;
if (iomap->type == IOMAP_INLINE) {
if (srcmap->type == IOMAP_INLINE) {
ret = iomap_write_end_inline(inode, page, iomap, pos, copied);
} else if (iomap->flags & IOMAP_F_BUFFER_HEAD) {
} else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) {
ret = block_write_end(NULL, inode->i_mapping, pos, len, copied,
page, NULL);
} else {
ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
ret = __iomap_write_end(inode, pos, len, copied, page);
}
/*
......@@ -736,12 +782,11 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
static loff_t
iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
struct iomap *iomap, struct iomap *srcmap)
{
struct iov_iter *i = data;
long status = 0;
ssize_t written = 0;
unsigned int flags = AOP_FLAG_NOFS;
do {
struct page *page;
......@@ -771,8 +816,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
break;
}
status = iomap_write_begin(inode, pos, bytes, flags, &page,
iomap);
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap,
srcmap);
if (unlikely(status))
break;
......@@ -783,8 +828,8 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
flush_dcache_page(page);
status = iomap_write_end(inode, pos, bytes, copied, page,
iomap);
status = iomap_write_end(inode, pos, bytes, copied, page, iomap,
srcmap);
if (unlikely(status < 0))
break;
copied = status;
......@@ -835,50 +880,32 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
}
EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
static struct page *
__iomap_read_page(struct inode *inode, loff_t offset)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
if (IS_ERR(page))
return page;
if (!PageUptodate(page)) {
put_page(page);
return ERR_PTR(-EIO);
}
return page;
}
static loff_t
iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap, struct iomap *srcmap)
{
long status = 0;
ssize_t written = 0;
do {
struct page *page, *rpage;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
offset = offset_in_page(pos);
bytes = min_t(loff_t, PAGE_SIZE - offset, length);
/* don't bother with blocks that are not shared to start with */
if (!(iomap->flags & IOMAP_F_SHARED))
return length;
/* don't bother with holes or unwritten extents */
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return length;
rpage = __iomap_read_page(inode, pos);
if (IS_ERR(rpage))
return PTR_ERR(rpage);
do {
unsigned long offset = offset_in_page(pos);
unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length);
struct page *page;
status = iomap_write_begin(inode, pos, bytes,
AOP_FLAG_NOFS, &page, iomap);
put_page(rpage);
IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap);
if (unlikely(status))
return status;
WARN_ON_ONCE(!PageUptodate(page));
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap);
status = iomap_write_end(inode, pos, bytes, bytes, page, iomap,
srcmap);
if (unlikely(status <= 0)) {
if (WARN_ON_ONCE(status == 0))
return -EIO;
......@@ -898,14 +925,14 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
}
int
iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops)
{
loff_t ret;
while (len) {
ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
iomap_dirty_actor);
iomap_unshare_actor);
if (ret <= 0)
return ret;
pos += ret;
......@@ -914,23 +941,22 @@ iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
return 0;
}
EXPORT_SYMBOL_GPL(iomap_file_dirty);
EXPORT_SYMBOL_GPL(iomap_file_unshare);
static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
unsigned bytes, struct iomap *iomap)
unsigned bytes, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page;
int status;
status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
iomap);
status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap);
if (status)
return status;
zero_user(page, offset, bytes);
mark_page_accessed(page);
return iomap_write_end(inode, pos, bytes, bytes, page, iomap);
return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap);
}
static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
......@@ -942,14 +968,14 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
static loff_t
iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
bool *did_zero = data;
loff_t written = 0;
int status;
/* already zeroed? we're done. */
if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
return count;
do {
......@@ -961,7 +987,8 @@ iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
if (IS_DAX(inode))
status = iomap_dax_zero(pos, offset, bytes, iomap);
else
status = iomap_zero(inode, pos, offset, bytes, iomap);
status = iomap_zero(inode, pos, offset, bytes, iomap,
srcmap);
if (status < 0)
return status;
......@@ -1011,7 +1038,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page);
static loff_t
iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct page *page = data;
int ret;
......@@ -1071,3 +1098,551 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
return block_page_mkwrite_return(ret);
}
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void
iomap_finish_page_writeback(struct inode *inode, struct page *page,
int error)
{
struct iomap_page *iop = to_iomap_page(page);
if (error) {
SetPageError(page);
mapping_set_error(inode->i_mapping, -EIO);
}
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
WARN_ON_ONCE(iop && atomic_read(&iop->write_count) <= 0);
if (!iop || atomic_dec_and_test(&iop->write_count))
end_page_writeback(page);
}
/*
* We're now finished for good with this ioend structure. Update the page
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
static void
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_inline_bio;
struct bio *last = ioend->io_bio, *next;
u64 start = bio->bi_iter.bi_sector;
bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bv;
struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
* need to explicitly end the iteration here.
*/
if (bio == last)
next = NULL;
else
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
bio_for_each_segment_all(bv, bio, iter_all)
iomap_finish_page_writeback(inode, bv->bv_page, error);
bio_put(bio);
}
if (unlikely(error && !quiet)) {
printk_ratelimited(KERN_ERR
"%s: writeback error on inode %lu, offset %lld, sector %llu",
inode->i_sb->s_id, inode->i_ino, ioend->io_offset,
start);
}
}
void
iomap_finish_ioends(struct iomap_ioend *ioend, int error)
{
struct list_head tmp;
list_replace_init(&ioend->io_list, &tmp);
iomap_finish_ioend(ioend, error);
while (!list_empty(&tmp)) {
ioend = list_first_entry(&tmp, struct iomap_ioend, io_list);
list_del_init(&ioend->io_list);
iomap_finish_ioend(ioend, error);
}
}
EXPORT_SYMBOL_GPL(iomap_finish_ioends);
/*
* We can merge two adjacent ioends if they have the same set of work to do.
*/
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
if (ioend->io_bio->bi_status != next->io_bio->bi_status)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
return false;
if ((ioend->io_type == IOMAP_UNWRITTEN) ^
(next->io_type == IOMAP_UNWRITTEN))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
return true;
}
void
iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends,
void (*merge_private)(struct iomap_ioend *ioend,
struct iomap_ioend *next))
{
struct iomap_ioend *next;
INIT_LIST_HEAD(&ioend->io_list);
while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend,
io_list))) {
if (!iomap_ioend_can_merge(ioend, next))
break;
list_move_tail(&next->io_list, &ioend->io_list);
ioend->io_size += next->io_size;
if (next->io_private && merge_private)
merge_private(ioend, next);
}
}
EXPORT_SYMBOL_GPL(iomap_ioend_try_merge);
static int
iomap_ioend_compare(void *priv, struct list_head *a, struct list_head *b)
{
struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list);
struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
void
iomap_sort_ioends(struct list_head *ioend_list)
{
list_sort(NULL, ioend_list, iomap_ioend_compare);
}
EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
struct iomap_ioend *ioend = bio->bi_private;
iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
}
/*
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the bio instead of
* submitting it. This typically only happens on a filesystem shutdown.
*/
static int
iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
int error)
{
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
if (wpc->ops->prepare_ioend)
error = wpc->ops->prepare_ioend(ioend, error);
if (error) {
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
* as there is only one reference to the ioend at this point in
* time.
*/
ioend->io_bio->bi_status = errno_to_blk_status(error);
bio_endio(ioend->io_bio);
return error;
}
submit_bio(ioend->io_bio);
return 0;
}
static struct iomap_ioend *
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
loff_t offset, sector_t sector, struct writeback_control *wbc)
{
struct iomap_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &iomap_ioend_bioset);
bio_set_dev(bio, wpc->iomap.bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
ioend->io_private = NULL;
ioend->io_bio = bio;
return ioend;
}
/*
* Allocate a new bio, and chain the old bio to the new one.
*
* Note that we have to do perform the chaining in this unintuitive order
* so that the bi_private linkage is set up in the right direction for the
* traversal in iomap_finish_ioend().
*/
static struct bio *
iomap_chain_bio(struct bio *prev)
{
struct bio *new;
new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
bio_copy_dev(new, prev);/* also copies over blkcg information */
new->bi_iter.bi_sector = bio_end_sector(prev);
new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
bio_get(prev); /* for iomap_finish_ioend */
submit_bio(prev);
return new;
}
static bool
iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
sector_t sector)
{
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
return false;
if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
if (sector != bio_end_sector(wpc->ioend->io_bio))
return false;
return true;
}
/*
* Test to see if we have an existing ioend structure that we could append to
* first, otherwise finish off the current ioend and start another.
*/
static void
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page,
struct iomap_page *iop, struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct list_head *iolist)
{
sector_t sector = iomap_sector(&wpc->iomap, offset);
unsigned len = i_blocksize(inode);
unsigned poff = offset & (PAGE_SIZE - 1);
bool merged, same_page = false;
if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc);
}
merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
&same_page);
if (iop && !same_page)
atomic_inc(&iop->write_count);
if (!merged) {
if (bio_full(wpc->ioend->io_bio, len)) {
wpc->ioend->io_bio =
iomap_chain_bio(wpc->ioend->io_bio);
}
bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, page, len);
}
/*
* We implement an immediate ioend submission policy here to avoid needing to
* chain multiple ioends and hence nest mempool allocations which can violate
* forward progress guarantees we need to provide. The current ioend we are
* adding blocks to is cached on the writepage context, and if the new block
* does not append to the cached ioend it will create a new ioend and cache that
* instead.
*
* If a new ioend is created and cached, the old ioend is returned and queued
* locally for submission once the entire page is processed or an error has been
* detected. While ioends are submitted immediately after they are completed,
* batching optimisations are provided by higher level block plugging.
*
* At the end of a writeback pass, there will be a cached ioend remaining on the
* writepage context that the caller will need to submit.
*/
static int
iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode,
struct page *page, u64 end_offset)
{
struct iomap_page *iop = to_iomap_page(page);
struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode);
u64 file_offset; /* file offset of page */
int error = 0, count = 0, i;
LIST_HEAD(submit_list);
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE && !iop);
WARN_ON_ONCE(iop && atomic_read(&iop->write_count) != 0);
/*
* Walk through the page to find areas to write back. If we run off the
* end of the current map or find the current map invalid, grab a new
* one.
*/
for (i = 0, file_offset = page_offset(page);
i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
i++, file_offset += len) {
if (iop && !test_bit(i, iop->uptodate))
continue;
error = wpc->ops->map_blocks(wpc, inode, file_offset);
if (error)
break;
if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
continue;
if (wpc->iomap.type == IOMAP_HOLE)
continue;
iomap_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
count++;
}
WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
WARN_ON_ONCE(!PageLocked(page));
WARN_ON_ONCE(PageWriteback(page));
/*
* We cannot cancel the ioend directly here on error. We may have
* already set other pages under writeback and hence we have to run I/O
* completion to mark the error state of the pages under writeback
* appropriately.
*/
if (unlikely(error)) {
if (!count) {
/*
* If the current page hasn't been added to ioend, it
* won't be affected by I/O completions and we must
* discard and unlock it right here.
*/
if (wpc->ops->discard_page)
wpc->ops->discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
goto done;
}
/*
* If the page was not fully cleaned, we need to ensure that the
* higher layers come back to it correctly. That means we need
* to keep the page dirty, and for WB_SYNC_ALL writeback we need
* to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
* so another attempt to write this page in this writeback sweep
* will be made.
*/
set_page_writeback_keepwrite(page);
} else {
clear_page_dirty_for_io(page);
set_page_writeback(page);
}
unlock_page(page);
/*
* Preserve the original error if there was one, otherwise catch
* submission errors here and propagate into subsequent ioend
* submissions.
*/
list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
int error2;
list_del_init(&ioend->io_list);
error2 = iomap_submit_ioend(wpc, ioend, error);
if (error2 && !error)
error = error2;
}
/*
* We can end up here with no error and nothing to write only if we race
* with a partial page truncate on a sub-page block sized filesystem.
*/
if (!count)
end_page_writeback(page);
done:
mapping_set_error(page->mapping, error);
return error;
}
/*
* Write out a dirty page.
*
* For delalloc space on the page we need to allocate space and flush it.
* For unwritten space on the page we need to start the conversion to
* regular allocated space.
*/
static int
iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data)
{
struct iomap_writepage_ctx *wpc = data;
struct inode *inode = page->mapping->host;
pgoff_t end_index;
u64 end_offset;
loff_t offset;
trace_iomap_writepage(inode, page, 0, 0);
/*
* Refuse to write the page out if we are called from reclaim context.
*
* This avoids stack overflows when called from deeply used stacks in
* random callers for direct reclaim or memcg reclaim. We explicitly
* allow reclaim from kswapd as the stack usage there is relatively low.
*
* This should never happen except in the case of a VM regression so
* warn about it.
*/
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
PF_MEMALLOC))
goto redirty;
/*
* Given that we do not allow direct reclaim to call us, we should
* never be called in a recursive filesystem reclaim context.
*/
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
* Is this page beyond the end of the file?
*
* The page index is less than the end_index, adjust the end_offset
* to the highest offset that this page should represent.
* -----------------------------------------------------
* | file mapping | <EOF> |
* -----------------------------------------------------
* | Page ... | Page N-2 | Page N-1 | Page N | |
* ^--------------------------------^----------|--------
* | desired writeback range | see else |
* ---------------------------------^------------------|
*/
offset = i_size_read(inode);
end_index = offset >> PAGE_SHIFT;
if (page->index < end_index)
end_offset = (loff_t)(page->index + 1) << PAGE_SHIFT;
else {
/*
* Check whether the page to write out is beyond or straddles
* i_size or not.
* -------------------------------------------------------
* | file mapping | <EOF> |
* -------------------------------------------------------
* | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
* ^--------------------------------^-----------|---------
* | | Straddles |
* ---------------------------------^-----------|--------|
*/
unsigned offset_into_page = offset & (PAGE_SIZE - 1);
/*
* Skip the page if it is fully outside i_size, e.g. due to a
* truncate operation that is in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
* iomap_vm_releasepage() is called on it and gets confused.
*
* Note that the end_index is unsigned long, it would overflow
* if the given offset is greater than 16TB on 32-bit system
* and if we do check the page is fully outside i_size or not
* via "if (page->index >= end_index + 1)" as "end_index + 1"
* will be evaluated to 0. Hence this page will be redirtied
* and be written out repeatedly which would result in an
* infinite loop, the user program that perform this operation
* will hang. Instead, we can verify this situation by checking
* if the page to write is totally beyond the i_size or if it's
* offset is just equal to the EOF.
*/
if (page->index > end_index ||
(page->index == end_index && offset_into_page == 0))
goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
* that is not a multiple of the page size, the remaining
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
zero_user_segment(page, offset_into_page, PAGE_SIZE);
/* Adjust the end_offset to the end of file */
end_offset = offset;
}
return iomap_writepage_map(wpc, wbc, inode, page, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
int
iomap_writepage(struct page *page, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops)
{
int ret;
wpc->ops = ops;
ret = iomap_do_writepage(page, wbc, wpc);
if (!wpc->ioend)
return ret;
return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepage);
int
iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops)
{
int ret;
wpc->ops = ops;
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
if (!wpc->ioend)
return ret;
return iomap_submit_ioend(wpc, wpc->ioend, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct iomap_ioend, io_inline_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
......@@ -358,7 +358,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length,
static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
struct iomap_dio *dio = data;
......@@ -392,7 +392,8 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
*/
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
......@@ -400,7 +401,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos, start = pos;
loff_t end = iocb->ki_pos + count - 1, ret = 0;
unsigned int flags = IOMAP_DIRECT;
bool wait_for_completion = is_sync_kiocb(iocb);
struct blk_plug plug;
struct iomap_dio *dio;
......@@ -409,6 +409,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!count)
return 0;
if (WARN_ON(is_sync_kiocb(iocb) && !wait_for_completion))
return -EIO;
dio = kmalloc(sizeof(*dio), GFP_KERNEL);
if (!dio)
return -ENOMEM;
......@@ -430,7 +433,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (pos >= dio->i_size)
goto out_free_dio;
if (iter_is_iovec(iter) && iov_iter_rw(iter) == READ)
if (iter_is_iovec(iter))
dio->flags |= IOMAP_DIO_DIRTY;
} else {
flags |= IOMAP_WRITE;
......
......@@ -44,7 +44,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi,
static loff_t
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap)
struct iomap *iomap, struct iomap *srcmap)
{
struct fiemap_ctx *ctx = data;
loff_t ret = length;
......@@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(iomap_fiemap);
static loff_t
iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
sector_t *bno = data, addr;
......
......@@ -119,7 +119,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
static loff_t
iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_UNWRITTEN:
......@@ -165,7 +165,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole);
static loff_t
iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
void *data, struct iomap *iomap)
void *data, struct iomap *iomap, struct iomap *srcmap)
{
switch (iomap->type) {
case IOMAP_HOLE:
......
......@@ -76,7 +76,8 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
* distinction between written and unwritten extents.
*/
static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
loff_t count, void *data, struct iomap *iomap)
loff_t count, void *data, struct iomap *iomap,
struct iomap *srcmap)
{
struct iomap_swapfile_info *isi = data;
int error;
......
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2019 Christoph Hellwig
*/
#include <linux/iomap.h>
/*
* We include this last to have the helpers above available for the trace
* event implementations.
*/
#define CREATE_TRACE_POINTS
#include "trace.h"
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (c) 2009-2019 Christoph Hellwig
*
* NOTE: none of these tracepoints shall be consider a stable kernel ABI
* as they can change at any time.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM iomap
#if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _IOMAP_TRACE_H
#include <linux/tracepoint.h>
struct inode;
DECLARE_EVENT_CLASS(iomap_readpage_class,
TP_PROTO(struct inode *inode, int nr_pages),
TP_ARGS(inode, nr_pages),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, ino)
__field(int, nr_pages)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->nr_pages = nr_pages;
),
TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->nr_pages)
)
#define DEFINE_READPAGE_EVENT(name) \
DEFINE_EVENT(iomap_readpage_class, name, \
TP_PROTO(struct inode *inode, int nr_pages), \
TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(iomap_readpage);
DEFINE_READPAGE_EVENT(iomap_readpages);
DECLARE_EVENT_CLASS(iomap_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
unsigned int len),
TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(u64, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
__field(unsigned int, length)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
__entry->length = len;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
"length %x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
__entry->length)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(iomap_page_class, name, \
TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
unsigned int len), \
TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(iomap_writepage);
DEFINE_PAGE_EVENT(iomap_releasepage);
DEFINE_PAGE_EVENT(iomap_invalidatepage);
#endif /* _IOMAP_TRACE_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
#include <trace/define_trace.h>
......@@ -34,6 +34,7 @@
#include "xfs_ag_resv.h"
#include "xfs_refcount.h"
#include "xfs_icache.h"
#include "xfs_iomap.h"
kmem_zone_t *xfs_bmap_free_item_zone;
......@@ -4456,16 +4457,21 @@ int
xfs_bmapi_convert_delalloc(
struct xfs_inode *ip,
int whichfork,
xfs_fileoff_t offset_fsb,
struct xfs_bmbt_irec *imap,
xfs_off_t offset,
struct iomap *iomap,
unsigned int *seq)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
struct xfs_bmalloca bma = { NULL };
u16 flags = 0;
struct xfs_trans *tp;
int error;
if (whichfork == XFS_COW_FORK)
flags |= IOMAP_F_SHARED;
/*
* Space for the extent and indirect blocks was reserved when the
* delalloc extent was created so there's no need to do so here.
......@@ -4495,7 +4501,7 @@ xfs_bmapi_convert_delalloc(
* the extent. Just return the real extent at this offset.
*/
if (!isnullstartblock(bma.got.br_startblock)) {
*imap = bma.got;
xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
goto out_trans_cancel;
}
......@@ -4528,7 +4534,7 @@ xfs_bmapi_convert_delalloc(
XFS_STATS_INC(mp, xs_xstrat_quick);
ASSERT(!isnullstartblock(bma.got.br_startblock));
*imap = bma.got;
xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
*seq = READ_ONCE(ifp->if_seq);
if (whichfork == XFS_COW_FORK)
......
......@@ -228,8 +228,7 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
int eof);
int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap,
unsigned int *seq);
xfs_off_t offset, struct iomap *iomap, unsigned int *seq);
int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp,
struct xfs_inode *ip, int whichfork,
struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
......
......@@ -18,17 +18,18 @@
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"
/*
* structure owned by writepages passed to individual writepage calls
*/
struct xfs_writepage_ctx {
struct xfs_bmbt_irec imap;
int fork;
struct iomap_writepage_ctx ctx;
unsigned int data_seq;
unsigned int cow_seq;
struct xfs_ioend *ioend;
};
static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
return container_of(ctx, struct xfs_writepage_ctx, ctx);
}
struct block_device *
xfs_find_bdev_for_inode(
struct inode *inode)
......@@ -55,71 +56,10 @@ xfs_find_daxdev_for_inode(
return mp->m_ddev_targp->bt_daxdev;
}
static void
xfs_finish_page_writeback(
struct inode *inode,
struct bio_vec *bvec,
int error)
{
struct iomap_page *iop = to_iomap_page(bvec->bv_page);
if (error) {
SetPageError(bvec->bv_page);
mapping_set_error(inode->i_mapping, -EIO);
}
ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
ASSERT(!iop || atomic_read(&iop->write_count) > 0);
if (!iop || atomic_dec_and_test(&iop->write_count))
end_page_writeback(bvec->bv_page);
}
/*
* We're now finished for good with this ioend structure. Update the page
* state, release holds on bios, and finally free up memory. Do not use the
* ioend after this.
*/
STATIC void
xfs_destroy_ioend(
struct xfs_ioend *ioend,
int error)
{
struct inode *inode = ioend->io_inode;
struct bio *bio = &ioend->io_inline_bio;
struct bio *last = ioend->io_bio, *next;
u64 start = bio->bi_iter.bi_sector;
bool quiet = bio_flagged(bio, BIO_QUIET);
for (bio = &ioend->io_inline_bio; bio; bio = next) {
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
/*
* For the last bio, bi_private points to the ioend, so we
* need to explicitly end the iteration here.
*/
if (bio == last)
next = NULL;
else
next = bio->bi_private;
/* walk each page on bio, ending page IO on them */
bio_for_each_segment_all(bvec, bio, iter_all)
xfs_finish_page_writeback(inode, bvec, error);
bio_put(bio);
}
if (unlikely(error && !quiet)) {
xfs_err_ratelimited(XFS_I(inode)->i_mount,
"writeback error on sector %llu", start);
}
}
/*
* Fast and loose check if this write could update the on-disk inode size.
*/
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
return ioend->io_offset + ioend->io_size >
XFS_I(ioend->io_inode)->i_d.di_size;
......@@ -127,7 +67,7 @@ static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
STATIC int
xfs_setfilesize_trans_alloc(
struct xfs_ioend *ioend)
struct iomap_ioend *ioend)
{
struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
struct xfs_trans *tp;
......@@ -137,7 +77,7 @@ xfs_setfilesize_trans_alloc(
if (error)
return error;
ioend->io_append_trans = tp;
ioend->io_private = tp;
/*
* We may pass freeze protection with a transaction. So tell lockdep
......@@ -200,11 +140,11 @@ xfs_setfilesize(
STATIC int
xfs_setfilesize_ioend(
struct xfs_ioend *ioend,
struct iomap_ioend *ioend,
int error)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_trans *tp = ioend->io_append_trans;
struct xfs_trans *tp = ioend->io_private;
/*
* The transaction may have been allocated in the I/O submission thread,
......@@ -228,9 +168,8 @@ xfs_setfilesize_ioend(
*/
STATIC void
xfs_end_ioend(
struct xfs_ioend *ioend)
struct iomap_ioend *ioend)
{
struct list_head ioend_list;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
......@@ -257,7 +196,7 @@ xfs_end_ioend(
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
if (ioend->io_fork == XFS_COW_FORK)
if (ioend->io_flags & IOMAP_F_SHARED)
xfs_reflink_cancel_cow_range(ip, offset, size, true);
goto done;
}
......@@ -265,49 +204,20 @@ xfs_end_ioend(
/*
* Success: commit the COW or unwritten blocks if needed.
*/
if (ioend->io_fork == XFS_COW_FORK)
if (ioend->io_flags & IOMAP_F_SHARED)
error = xfs_reflink_end_cow(ip, offset, size);
else if (ioend->io_state == XFS_EXT_UNWRITTEN)
else if (ioend->io_type == IOMAP_UNWRITTEN)
error = xfs_iomap_write_unwritten(ip, offset, size, false);
else
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
done:
if (ioend->io_append_trans)
if (ioend->io_private)
error = xfs_setfilesize_ioend(ioend, error);
list_replace_init(&ioend->io_list, &ioend_list);
xfs_destroy_ioend(ioend, error);
while (!list_empty(&ioend_list)) {
ioend = list_first_entry(&ioend_list, struct xfs_ioend,
io_list);
list_del_init(&ioend->io_list);
xfs_destroy_ioend(ioend, error);
}
iomap_finish_ioends(ioend, error);
memalloc_nofs_restore(nofs_flag);
}
/*
* We can merge two adjacent ioends if they have the same set of work to do.
*/
static bool
xfs_ioend_can_merge(
struct xfs_ioend *ioend,
struct xfs_ioend *next)
{
if (ioend->io_bio->bi_status != next->io_bio->bi_status)
return false;
if ((ioend->io_fork == XFS_COW_FORK) ^ (next->io_fork == XFS_COW_FORK))
return false;
if ((ioend->io_state == XFS_EXT_UNWRITTEN) ^
(next->io_state == XFS_EXT_UNWRITTEN))
return false;
if (ioend->io_offset + ioend->io_size != next->io_offset)
return false;
return true;
}
/*
* If the to be merged ioend has a preallocated transaction for file
* size updates we need to ensure the ioend it is merged into also
......@@ -315,104 +225,65 @@ xfs_ioend_can_merge(
* as it is guaranteed to be clean.
*/
static void
xfs_ioend_merge_append_transactions(
struct xfs_ioend *ioend,
struct xfs_ioend *next)
xfs_ioend_merge_private(
struct iomap_ioend *ioend,
struct iomap_ioend *next)
{
if (!ioend->io_append_trans) {
ioend->io_append_trans = next->io_append_trans;
next->io_append_trans = NULL;
if (!ioend->io_private) {
ioend->io_private = next->io_private;
next->io_private = NULL;
} else {
xfs_setfilesize_ioend(next, -ECANCELED);
}
}
/* Try to merge adjacent completions. */
STATIC void
xfs_ioend_try_merge(
struct xfs_ioend *ioend,
struct list_head *more_ioends)
{
struct xfs_ioend *next_ioend;
while (!list_empty(more_ioends)) {
next_ioend = list_first_entry(more_ioends, struct xfs_ioend,
io_list);
if (!xfs_ioend_can_merge(ioend, next_ioend))
break;
list_move_tail(&next_ioend->io_list, &ioend->io_list);
ioend->io_size += next_ioend->io_size;
if (next_ioend->io_append_trans)
xfs_ioend_merge_append_transactions(ioend, next_ioend);
}
}
/* list_sort compare function for ioends */
static int
xfs_ioend_compare(
void *priv,
struct list_head *a,
struct list_head *b)
{
struct xfs_ioend *ia;
struct xfs_ioend *ib;
ia = container_of(a, struct xfs_ioend, io_list);
ib = container_of(b, struct xfs_ioend, io_list);
if (ia->io_offset < ib->io_offset)
return -1;
else if (ia->io_offset > ib->io_offset)
return 1;
return 0;
}
/* Finish all pending io completions. */
void
xfs_end_io(
struct work_struct *work)
{
struct xfs_inode *ip;
struct xfs_ioend *ioend;
struct list_head completion_list;
struct xfs_inode *ip =
container_of(work, struct xfs_inode, i_ioend_work);
struct iomap_ioend *ioend;
struct list_head tmp;
unsigned long flags;
ip = container_of(work, struct xfs_inode, i_ioend_work);
spin_lock_irqsave(&ip->i_ioend_lock, flags);
list_replace_init(&ip->i_ioend_list, &completion_list);
list_replace_init(&ip->i_ioend_list, &tmp);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
list_sort(NULL, &completion_list, xfs_ioend_compare);
while (!list_empty(&completion_list)) {
ioend = list_first_entry(&completion_list, struct xfs_ioend,
io_list);
iomap_sort_ioends(&tmp);
while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
io_list))) {
list_del_init(&ioend->io_list);
xfs_ioend_try_merge(ioend, &completion_list);
iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
xfs_end_ioend(ioend);
}
}
static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
{
return ioend->io_private ||
ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED);
}
STATIC void
xfs_end_bio(
struct bio *bio)
{
struct xfs_ioend *ioend = bio->bi_private;
struct iomap_ioend *ioend = bio->bi_private;
struct xfs_inode *ip = XFS_I(ioend->io_inode);
struct xfs_mount *mp = ip->i_mount;
unsigned long flags;
if (ioend->io_fork == XFS_COW_FORK ||
ioend->io_state == XFS_EXT_UNWRITTEN ||
ioend->io_append_trans != NULL) {
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
} else
xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
ASSERT(xfs_ioend_needs_workqueue(ioend));
spin_lock_irqsave(&ip->i_ioend_lock, flags);
if (list_empty(&ip->i_ioend_list))
WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
&ip->i_ioend_work));
list_add_tail(&ioend->io_list, &ip->i_ioend_list);
spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}
/*
......@@ -421,19 +292,19 @@ xfs_end_bio(
*/
static bool
xfs_imap_valid(
struct xfs_writepage_ctx *wpc,
struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb)
loff_t offset)
{
if (offset_fsb < wpc->imap.br_startoff ||
offset_fsb >= wpc->imap.br_startoff + wpc->imap.br_blockcount)
if (offset < wpc->iomap.offset ||
offset >= wpc->iomap.offset + wpc->iomap.length)
return false;
/*
* If this is a COW mapping, it is sufficient to check that the mapping
* covers the offset. Be careful to check this first because the caller
* can revalidate a COW mapping without updating the data seqno.
*/
if (wpc->fork == XFS_COW_FORK)
if (wpc->iomap.flags & IOMAP_F_SHARED)
return true;
/*
......@@ -443,17 +314,17 @@ xfs_imap_valid(
* checked (and found nothing at this offset) could have added
* overlapping blocks.
*/
if (wpc->data_seq != READ_ONCE(ip->i_df.if_seq))
if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
return false;
if (xfs_inode_has_cow_data(ip) &&
wpc->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
return false;
return true;
}
/*
* Pass in a dellalloc extent and convert it to real extents, return the real
* extent that maps offset_fsb in wpc->imap.
* extent that maps offset_fsb in wpc->iomap.
*
* The current page is held locked so nothing could have removed the block
* backing offset_fsb, although it could have moved from the COW to the data
......@@ -461,32 +332,38 @@ xfs_imap_valid(
*/
static int
xfs_convert_blocks(
struct xfs_writepage_ctx *wpc,
struct iomap_writepage_ctx *wpc,
struct xfs_inode *ip,
xfs_fileoff_t offset_fsb)
int whichfork,
loff_t offset)
{
int error;
unsigned *seq;
if (whichfork == XFS_COW_FORK)
seq = &XFS_WPC(wpc)->cow_seq;
else
seq = &XFS_WPC(wpc)->data_seq;
/*
* Attempt to allocate whatever delalloc extent currently backs
* offset_fsb and put the result into wpc->imap. Allocate in a loop
* because it may take several attempts to allocate real blocks for a
* contiguous delalloc extent if free space is sufficiently fragmented.
* Attempt to allocate whatever delalloc extent currently backs offset
* and put the result into wpc->iomap. Allocate in a loop because it
* may take several attempts to allocate real blocks for a contiguous
* delalloc extent if free space is sufficiently fragmented.
*/
do {
error = xfs_bmapi_convert_delalloc(ip, wpc->fork, offset_fsb,
&wpc->imap, wpc->fork == XFS_COW_FORK ?
&wpc->cow_seq : &wpc->data_seq);
error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
&wpc->iomap, seq);
if (error)
return error;
} while (wpc->imap.br_startoff + wpc->imap.br_blockcount <= offset_fsb);
} while (wpc->iomap.offset + wpc->iomap.length <= offset);
return 0;
}
STATIC int
static int
xfs_map_blocks(
struct xfs_writepage_ctx *wpc,
struct iomap_writepage_ctx *wpc,
struct inode *inode,
loff_t offset)
{
......@@ -496,6 +373,7 @@ xfs_map_blocks(
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
xfs_fileoff_t cow_fsb = NULLFILEOFF;
int whichfork = XFS_DATA_FORK;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
......@@ -519,7 +397,7 @@ xfs_map_blocks(
* against concurrent updates and provides a memory barrier on the way
* out that ensures that we always see the current value.
*/
if (xfs_imap_valid(wpc, ip, offset_fsb))
if (xfs_imap_valid(wpc, ip, offset))
return 0;
/*
......@@ -541,10 +419,10 @@ xfs_map_blocks(
xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
cow_fsb = imap.br_startoff;
if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
wpc->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
wpc->fork = XFS_COW_FORK;
whichfork = XFS_COW_FORK;
goto allocate_blocks;
}
......@@ -552,7 +430,7 @@ xfs_map_blocks(
* No COW extent overlap. Revalidate now that we may have updated
* ->cow_seq. If the data mapping is still valid, we're done.
*/
if (xfs_imap_valid(wpc, ip, offset_fsb)) {
if (xfs_imap_valid(wpc, ip, offset)) {
xfs_iunlock(ip, XFS_ILOCK_SHARED);
return 0;
}
......@@ -564,11 +442,9 @@ xfs_map_blocks(
*/
if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
imap.br_startoff = end_fsb; /* fake a hole past EOF */
wpc->data_seq = READ_ONCE(ip->i_df.if_seq);
XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
xfs_iunlock(ip, XFS_ILOCK_SHARED);
wpc->fork = XFS_DATA_FORK;
/* landed in a hole or beyond EOF? */
if (imap.br_startoff > offset_fsb) {
imap.br_blockcount = imap.br_startoff - offset_fsb;
......@@ -592,11 +468,11 @@ xfs_map_blocks(
isnullstartblock(imap.br_startblock))
goto allocate_blocks;
wpc->imap = imap;
trace_xfs_map_blocks_found(ip, offset, count, wpc->fork, &imap);
xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
return 0;
allocate_blocks:
error = xfs_convert_blocks(wpc, ip, offset_fsb);
error = xfs_convert_blocks(wpc, ip, whichfork, offset);
if (error) {
/*
* If we failed to find the extent in the COW fork we might have
......@@ -605,7 +481,7 @@ xfs_map_blocks(
* the former case, but prevent additional retries to avoid
* looping forever for the latter case.
*/
if (error == -EAGAIN && wpc->fork == XFS_COW_FORK && !retries++)
if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
goto retry;
ASSERT(error != -EAGAIN);
return error;
......@@ -616,34 +492,22 @@ xfs_map_blocks(
* original delalloc one. Trim the return extent to the next COW
* boundary again to force a re-lookup.
*/
if (wpc->fork != XFS_COW_FORK && cow_fsb != NULLFILEOFF &&
cow_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount)
wpc->imap.br_blockcount = cow_fsb - wpc->imap.br_startoff;
if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
wpc->iomap.length = cow_offset - wpc->iomap.offset;
}
ASSERT(wpc->imap.br_startoff <= offset_fsb);
ASSERT(wpc->imap.br_startoff + wpc->imap.br_blockcount > offset_fsb);
trace_xfs_map_blocks_alloc(ip, offset, count, wpc->fork, &imap);
ASSERT(wpc->iomap.offset <= offset);
ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
return 0;
}
/*
* Submit the bio for an ioend. We are passed an ioend with a bio attached to
* it, and we submit that bio. The ioend may be used for multiple bio
* submissions, so we only want to allocate an append transaction for the ioend
* once. In the case of multiple bio submission, each bio will take an IO
* reference to the ioend to ensure that the ioend completion is only done once
* all bios have been submitted and the ioend is really done.
*
* If @status is non-zero, it means that we have a situation where some part of
* the submission process has failed after we have marked paged for writeback
* and unlocked them. In this situation, we need to fail the bio and ioend
* rather than submit it to IO. This typically only happens on a filesystem
* shutdown.
*/
STATIC int
xfs_submit_ioend(
struct writeback_control *wbc,
struct xfs_ioend *ioend,
static int
xfs_prepare_ioend(
struct iomap_ioend *ioend,
int status)
{
unsigned int nofs_flag;
......@@ -656,157 +520,24 @@ xfs_submit_ioend(
nofs_flag = memalloc_nofs_save();
/* Convert CoW extents to regular */
if (!status && ioend->io_fork == XFS_COW_FORK) {
if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
ioend->io_offset, ioend->io_size);
}
/* Reserve log space if we might write beyond the on-disk inode size. */
if (!status &&
(ioend->io_fork == XFS_COW_FORK ||
ioend->io_state != XFS_EXT_UNWRITTEN) &&
((ioend->io_flags & IOMAP_F_SHARED) ||
ioend->io_type != IOMAP_UNWRITTEN) &&
xfs_ioend_is_append(ioend) &&
!ioend->io_append_trans)
!ioend->io_private)
status = xfs_setfilesize_trans_alloc(ioend);
memalloc_nofs_restore(nofs_flag);
ioend->io_bio->bi_private = ioend;
ioend->io_bio->bi_end_io = xfs_end_bio;
/*
* If we are failing the IO now, just mark the ioend with an
* error and finish it. This will run IO completion immediately
* as there is only one reference to the ioend at this point in
* time.
*/
if (status) {
ioend->io_bio->bi_status = errno_to_blk_status(status);
bio_endio(ioend->io_bio);
return status;
}
submit_bio(ioend->io_bio);
return 0;
}
static struct xfs_ioend *
xfs_alloc_ioend(
struct inode *inode,
int fork,
xfs_exntst_t state,
xfs_off_t offset,
struct block_device *bdev,
sector_t sector,
struct writeback_control *wbc)
{
struct xfs_ioend *ioend;
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = sector;
bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
bio->bi_write_hint = inode->i_write_hint;
wbc_init_bio(wbc, bio);
ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_fork = fork;
ioend->io_state = state;
ioend->io_inode = inode;
ioend->io_size = 0;
ioend->io_offset = offset;
ioend->io_append_trans = NULL;
ioend->io_bio = bio;
return ioend;
}
/*
* Allocate a new bio, and chain the old bio to the new one.
*
* Note that we have to do perform the chaining in this unintuitive order
* so that the bi_private linkage is set up in the right direction for the
* traversal in xfs_destroy_ioend().
*/
static struct bio *
xfs_chain_bio(
struct bio *prev)
{
struct bio *new;
new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
bio_copy_dev(new, prev);/* also copies over blkcg information */
new->bi_iter.bi_sector = bio_end_sector(prev);
new->bi_opf = prev->bi_opf;
new->bi_write_hint = prev->bi_write_hint;
bio_chain(prev, new);
bio_get(prev); /* for xfs_destroy_ioend */
submit_bio(prev);
return new;
}
/*
* Test to see if we have an existing ioend structure that we could append to
* first, otherwise finish off the current ioend and start another.
*/
STATIC void
xfs_add_to_ioend(
struct inode *inode,
xfs_off_t offset,
struct page *page,
struct iomap_page *iop,
struct xfs_writepage_ctx *wpc,
struct writeback_control *wbc,
struct list_head *iolist)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
struct block_device *bdev = xfs_find_bdev_for_inode(inode);
unsigned len = i_blocksize(inode);
unsigned poff = offset & (PAGE_SIZE - 1);
bool merged, same_page = false;
sector_t sector;
sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
if (!wpc->ioend ||
wpc->fork != wpc->ioend->io_fork ||
wpc->imap.br_state != wpc->ioend->io_state ||
sector != bio_end_sector(wpc->ioend->io_bio) ||
offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
if (wpc->ioend)
list_add(&wpc->ioend->io_list, iolist);
wpc->ioend = xfs_alloc_ioend(inode, wpc->fork,
wpc->imap.br_state, offset, bdev, sector, wbc);
}
merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff,
&same_page);
if (iop && !same_page)
atomic_inc(&iop->write_count);
if (!merged) {
if (bio_full(wpc->ioend->io_bio, len))
wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);
bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, page, len);
}
STATIC void
xfs_vm_invalidatepage(
struct page *page,
unsigned int offset,
unsigned int length)
{
trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
iomap_invalidatepage(page, offset, length);
if (xfs_ioend_needs_workqueue(ioend))
ioend->io_bio->bi_end_io = xfs_end_bio;
return status;
}
/*
......@@ -820,8 +551,8 @@ xfs_vm_invalidatepage(
* transaction as there is no space left for block reservation (typically why we
* see a ENOSPC in writeback).
*/
STATIC void
xfs_aops_discard_page(
static void
xfs_discard_page(
struct page *page)
{
struct inode *inode = page->mapping->host;
......@@ -843,246 +574,14 @@ xfs_aops_discard_page(
if (error && !XFS_FORCED_SHUTDOWN(mp))
xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
iomap_invalidatepage(page, 0, PAGE_SIZE);
}
/*
* We implement an immediate ioend submission policy here to avoid needing to
* chain multiple ioends and hence nest mempool allocations which can violate
* forward progress guarantees we need to provide. The current ioend we are
* adding blocks to is cached on the writepage context, and if the new block
* does not append to the cached ioend it will create a new ioend and cache that
* instead.
*
* If a new ioend is created and cached, the old ioend is returned and queued
* locally for submission once the entire page is processed or an error has been
* detected. While ioends are submitted immediately after they are completed,
* batching optimisations are provided by higher level block plugging.
*
* At the end of a writeback pass, there will be a cached ioend remaining on the
* writepage context that the caller will need to submit.
*/
static int
xfs_writepage_map(
struct xfs_writepage_ctx *wpc,
struct writeback_control *wbc,
struct inode *inode,
struct page *page,
uint64_t end_offset)
{
LIST_HEAD(submit_list);
struct iomap_page *iop = to_iomap_page(page);
unsigned len = i_blocksize(inode);
struct xfs_ioend *ioend, *next;
uint64_t file_offset; /* file offset of page */
int error = 0, count = 0, i;
ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
ASSERT(!iop || atomic_read(&iop->write_count) == 0);
/*
* Walk through the page to find areas to write back. If we run off the
* end of the current map or find the current map invalid, grab a new
* one.
*/
for (i = 0, file_offset = page_offset(page);
i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
i++, file_offset += len) {
if (iop && !test_bit(i, iop->uptodate))
continue;
error = xfs_map_blocks(wpc, inode, file_offset);
if (error)
break;
if (wpc->imap.br_startblock == HOLESTARTBLOCK)
continue;
xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
&submit_list);
count++;
}
ASSERT(wpc->ioend || list_empty(&submit_list));
ASSERT(PageLocked(page));
ASSERT(!PageWriteback(page));
/*
* On error, we have to fail the ioend here because we may have set
* pages under writeback, we have to make sure we run IO completion to
* mark the error state of the IO appropriately, so we can't cancel the
* ioend directly here. That means we have to mark this page as under
* writeback if we included any blocks from it in the ioend chain so
* that completion treats it correctly.
*
* If we didn't include the page in the ioend, the on error we can
* simply discard and unlock it as there are no other users of the page
* now. The caller will still need to trigger submission of outstanding
* ioends on the writepage context so they are treated correctly on
* error.
*/
if (unlikely(error)) {
if (!count) {
xfs_aops_discard_page(page);
ClearPageUptodate(page);
unlock_page(page);
goto done;
}
/*
* If the page was not fully cleaned, we need to ensure that the
* higher layers come back to it correctly. That means we need
* to keep the page dirty, and for WB_SYNC_ALL writeback we need
* to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
* so another attempt to write this page in this writeback sweep
* will be made.
*/
set_page_writeback_keepwrite(page);
} else {
clear_page_dirty_for_io(page);
set_page_writeback(page);
}
unlock_page(page);
/*
* Preserve the original error if there was one, otherwise catch
* submission errors here and propagate into subsequent ioend
* submissions.
*/
list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
int error2;
list_del_init(&ioend->io_list);
error2 = xfs_submit_ioend(wbc, ioend, error);
if (error2 && !error)
error = error2;
}
/*
* We can end up here with no error and nothing to write only if we race
* with a partial page truncate on a sub-page block sized filesystem.
*/
if (!count)
end_page_writeback(page);
done:
mapping_set_error(page->mapping, error);
return error;
}
/*
* Write out a dirty page.
*
* For delalloc space on the page we need to allocate space and flush it.
* For unwritten space on the page we need to start the conversion to
* regular allocated space.
*/
STATIC int
xfs_do_writepage(
struct page *page,
struct writeback_control *wbc,
void *data)
{
struct xfs_writepage_ctx *wpc = data;
struct inode *inode = page->mapping->host;
loff_t offset;
uint64_t end_offset;
pgoff_t end_index;
trace_xfs_writepage(inode, page, 0, 0);
/*
* Refuse to write the page out if we are called from reclaim context.
*
* This avoids stack overflows when called from deeply used stacks in
* random callers for direct reclaim or memcg reclaim. We explicitly
* allow reclaim from kswapd as the stack usage there is relatively low.
*
* This should never happen except in the case of a VM regression so
* warn about it.
*/
if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
PF_MEMALLOC))
goto redirty;
/*
* Given that we do not allow direct reclaim to call us, we should
* never be called while in a filesystem transaction.
*/
if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
goto redirty;
/*
* Is this page beyond the end of the file?
*
* The page index is less than the end_index, adjust the end_offset
* to the highest offset that this page should represent.
* -----------------------------------------------------
* | file mapping | <EOF> |
* -----------------------------------------------------
* | Page ... | Page N-2 | Page N-1 | Page N | |
* ^--------------------------------^----------|--------
* | desired writeback range | see else |
* ---------------------------------^------------------|
*/
offset = i_size_read(inode);
end_index = offset >> PAGE_SHIFT;
if (page->index < end_index)
end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
else {
/*
* Check whether the page to write out is beyond or straddles
* i_size or not.
* -------------------------------------------------------
* | file mapping | <EOF> |
* -------------------------------------------------------
* | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
* ^--------------------------------^-----------|---------
* | | Straddles |
* ---------------------------------^-----------|--------|
*/
unsigned offset_into_page = offset & (PAGE_SIZE - 1);
/*
* Skip the page if it is fully outside i_size, e.g. due to a
* truncate operation that is in progress. We must redirty the
* page so that reclaim stops reclaiming it. Otherwise
* xfs_vm_releasepage() is called on it and gets confused.
*
* Note that the end_index is unsigned long, it would overflow
* if the given offset is greater than 16TB on 32-bit system
* and if we do check the page is fully outside i_size or not
* via "if (page->index >= end_index + 1)" as "end_index + 1"
* will be evaluated to 0. Hence this page will be redirtied
* and be written out repeatedly which would result in an
* infinite loop, the user program that perform this operation
* will hang. Instead, we can verify this situation by checking
* if the page to write is totally beyond the i_size or if it's
* offset is just equal to the EOF.
*/
if (page->index > end_index ||
(page->index == end_index && offset_into_page == 0))
goto redirty;
/*
* The page straddles i_size. It must be zeroed out on each
* and every writepage invocation because it may be mmapped.
* "A file is mapped in multiples of the page size. For a file
* that is not a multiple of the page size, the remaining
* memory is zeroed when mapped, and writes to that region are
* not written out to the file."
*/
zero_user_segment(page, offset_into_page, PAGE_SIZE);
/* Adjust the end_offset to the end of file */
end_offset = offset;
}
return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
redirty:
redirty_page_for_writepage(wbc, page);
unlock_page(page);
return 0;
}
static const struct iomap_writeback_ops xfs_writeback_ops = {
.map_blocks = xfs_map_blocks,
.prepare_ioend = xfs_prepare_ioend,
.discard_page = xfs_discard_page,
};
STATIC int
xfs_vm_writepage(
......@@ -1090,12 +589,8 @@ xfs_vm_writepage(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
int ret;
ret = xfs_do_writepage(page, wbc, &wpc);
if (wpc.ioend)
ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
return ret;
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
......@@ -1104,13 +599,9 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = { };
int ret;
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
if (wpc.ioend)
ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
return ret;
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
STATIC int
......@@ -1123,15 +614,6 @@ xfs_dax_writepages(
xfs_find_bdev_for_inode(mapping->host), wbc);
}
STATIC int
xfs_vm_releasepage(
struct page *page,
gfp_t gfp_mask)
{
trace_xfs_releasepage(page->mapping->host, page, 0, 0);
return iomap_releasepage(page, gfp_mask);
}
STATIC sector_t
xfs_vm_bmap(
struct address_space *mapping,
......@@ -1160,7 +642,6 @@ xfs_vm_readpage(
struct file *unused,
struct page *page)
{
trace_xfs_vm_readpage(page->mapping->host, 1);
return iomap_readpage(page, &xfs_iomap_ops);
}
......@@ -1171,7 +652,6 @@ xfs_vm_readpages(
struct list_head *pages,
unsigned nr_pages)
{
trace_xfs_vm_readpages(mapping->host, nr_pages);
return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
}
......@@ -1191,8 +671,8 @@ const struct address_space_operations xfs_address_space_operations = {
.writepage = xfs_vm_writepage,
.writepages = xfs_vm_writepages,
.set_page_dirty = iomap_set_page_dirty,
.releasepage = xfs_vm_releasepage,
.invalidatepage = xfs_vm_invalidatepage,
.releasepage = iomap_releasepage,
.invalidatepage = iomap_invalidatepage,
.bmap = xfs_vm_bmap,
.direct_IO = noop_direct_IO,
.migratepage = iomap_migrate_page,
......
......@@ -6,23 +6,6 @@
#ifndef __XFS_AOPS_H__
#define __XFS_AOPS_H__
extern struct bio_set xfs_ioend_bioset;
/*
* Structure for buffered I/O completions.
*/
struct xfs_ioend {
struct list_head io_list; /* next ioend in chain */
int io_fork; /* inode fork written back */
xfs_exntst_t io_state; /* extent state */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
xfs_off_t io_offset; /* offset in the file */
struct xfs_trans *io_append_trans;/* xact. for size update */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
};
extern const struct address_space_operations xfs_address_space_operations;
extern const struct address_space_operations xfs_dax_aops;
......
......@@ -188,7 +188,7 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
xfs_ilock(ip, XFS_IOLOCK_SHARED);
ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL, is_sync_kiocb(iocb));
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
......@@ -547,15 +547,12 @@ xfs_file_dio_aio_write(
}
trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
/*
* If unaligned, this is the only IO in-flight. If it has not yet
* completed, wait on it before we release the iolock to prevent
* subsequent overlapping IO.
* If unaligned, this is the only IO in-flight. Wait on it before we
* release the iolock to prevent subsequent overlapping IO.
*/
if (ret == -EIOCBQUEUED && unaligned_io)
inode_dio_wait(inode);
ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops,
is_sync_kiocb(iocb) || unaligned_io);
out:
xfs_iunlock(ip, iolock);
......
......@@ -54,7 +54,7 @@ xfs_bmbt_to_iomap(
struct xfs_inode *ip,
struct iomap *iomap,
struct xfs_bmbt_irec *imap,
bool shared)
u16 flags)
{
struct xfs_mount *mp = ip->i_mount;
......@@ -79,12 +79,11 @@ xfs_bmbt_to_iomap(
iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
iomap->flags = flags;
if (xfs_ipincount(ip) &&
(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
if (shared)
iomap->flags |= IOMAP_F_SHARED;
return 0;
}
......@@ -540,6 +539,7 @@ xfs_file_iomap_begin_delay(
struct xfs_iext_cursor icur, ccur;
xfs_fsblock_t prealloc_blocks = 0;
bool eof = false, cow_eof = false, shared = false;
u16 iomap_flags = 0;
int whichfork = XFS_DATA_FORK;
int error = 0;
......@@ -707,22 +707,28 @@ xfs_file_iomap_begin_delay(
* Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
* them out if the write happens to fail.
*/
iomap->flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, count, whichfork,
whichfork == XFS_DATA_FORK ? &imap : &cmap);
if (whichfork == XFS_DATA_FORK) {
iomap_flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, count, whichfork, &imap);
} else {
trace_xfs_iomap_alloc(ip, offset, count, whichfork, &cmap);
}
done:
if (whichfork == XFS_COW_FORK) {
if (imap.br_startoff > offset_fsb) {
xfs_trim_extent(&cmap, offset_fsb,
imap.br_startoff - offset_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap,
IOMAP_F_SHARED);
goto out_unlock;
}
/* ensure we only report blocks we have a reservation for */
xfs_trim_extent(&imap, cmap.br_startoff, cmap.br_blockcount);
shared = true;
}
error = xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
if (shared)
iomap_flags |= IOMAP_F_SHARED;
error = xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
......@@ -922,7 +928,8 @@ xfs_file_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap)
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
......@@ -930,6 +937,7 @@ xfs_file_iomap_begin(
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
bool shared = false;
u16 iomap_flags = 0;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
......@@ -1045,11 +1053,20 @@ xfs_file_iomap_begin(
if (error)
return error;
iomap->flags |= IOMAP_F_NEW;
iomap_flags |= IOMAP_F_NEW;
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
/*
* Writes that span EOF might trigger an IO size update on completion,
* so consider them to be dirty for the purposes of O_DSYNC even if
* there is no other metadata changes pending or have been made here.
*/
if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
iomap_flags |= IOMAP_F_DIRTY;
if (shared)
iomap_flags |= IOMAP_F_SHARED;
return xfs_bmbt_to_iomap(ip, iomap, &imap, iomap_flags);
out_found:
ASSERT(nimaps);
......@@ -1145,7 +1162,8 @@ xfs_seek_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap)
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
......@@ -1193,7 +1211,7 @@ xfs_seek_iomap_begin(
if (data_fsb < cow_fsb + cmap.br_blockcount)
end_fsb = min(end_fsb, data_fsb);
xfs_trim_extent(&cmap, offset_fsb, end_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true);
error = xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
/*
* This is a COW extent, so we must probe the page cache
* because there could be dirty page cache being backed
......@@ -1215,7 +1233,7 @@ xfs_seek_iomap_begin(
imap.br_state = XFS_EXT_NORM;
done:
xfs_trim_extent(&imap, offset_fsb, end_fsb);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
out_unlock:
xfs_iunlock(ip, lockmode);
return error;
......@@ -1231,7 +1249,8 @@ xfs_xattr_iomap_begin(
loff_t offset,
loff_t length,
unsigned flags,
struct iomap *iomap)
struct iomap *iomap,
struct iomap *srcmap)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
......@@ -1261,7 +1280,7 @@ xfs_xattr_iomap_begin(
if (error)
return error;
ASSERT(nimaps);
return xfs_bmbt_to_iomap(ip, iomap, &imap, false);
return xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
}
const struct iomap_ops xfs_xattr_iomap_ops = {
......
......@@ -16,7 +16,7 @@ int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
int xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
struct xfs_bmbt_irec *, bool shared);
struct xfs_bmbt_irec *, u16);
xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
static inline xfs_filblks_t
......
......@@ -178,7 +178,7 @@ xfs_fs_map_blocks(
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, false);
error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0);
*device_generation = mp->m_generation;
return error;
out_unlock:
......
......@@ -1442,7 +1442,7 @@ xfs_reflink_dirty_extents(
flen = XFS_FSB_TO_B(mp, rlen);
if (fpos + flen > isize)
flen = isize - fpos;
error = iomap_file_dirty(VFS_I(ip), fpos, flen,
error = iomap_file_unshare(VFS_I(ip), fpos, flen,
&xfs_iomap_ops);
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (error)
......
......@@ -40,7 +40,6 @@
#include <linux/parser.h>
static const struct super_operations xfs_super_operations;
struct bio_set xfs_ioend_bioset;
static struct kset *xfs_kset; /* top-level xfs sysfs dir */
#ifdef DEBUG
......@@ -1853,15 +1852,10 @@ MODULE_ALIAS_FS("xfs");
STATIC int __init
xfs_init_zones(void)
{
if (bioset_init(&xfs_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
offsetof(struct xfs_ioend, io_inline_bio),
BIOSET_NEED_BVECS))
goto out;
xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
"xfs_log_ticket");
if (!xfs_log_ticket_zone)
goto out_free_ioend_bioset;
goto out;
xfs_bmap_free_item_zone = kmem_zone_init(
sizeof(struct xfs_extent_free_item),
......@@ -1996,8 +1990,6 @@ xfs_init_zones(void)
kmem_zone_destroy(xfs_bmap_free_item_zone);
out_destroy_log_ticket_zone:
kmem_zone_destroy(xfs_log_ticket_zone);
out_free_ioend_bioset:
bioset_exit(&xfs_ioend_bioset);
out:
return -ENOMEM;
}
......@@ -2028,7 +2020,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_btree_cur_zone);
kmem_zone_destroy(xfs_bmap_free_item_zone);
kmem_zone_destroy(xfs_log_ticket_zone);
bioset_exit(&xfs_ioend_bioset);
}
STATIC int __init
......
......@@ -1158,71 +1158,6 @@ DEFINE_RW_EVENT(xfs_file_buffered_write);
DEFINE_RW_EVENT(xfs_file_direct_write);
DEFINE_RW_EVENT(xfs_file_dax_write);
DECLARE_EVENT_CLASS(xfs_page_class,
TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
unsigned int len),
TP_ARGS(inode, page, off, len),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(pgoff_t, pgoff)
__field(loff_t, size)
__field(unsigned long, offset)
__field(unsigned int, length)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = XFS_I(inode)->i_ino;
__entry->pgoff = page_offset(page);
__entry->size = i_size_read(inode);
__entry->offset = off;
__entry->length = len;
),
TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
"length %x",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pgoff,
__entry->size,
__entry->offset,
__entry->length)
)
#define DEFINE_PAGE_EVENT(name) \
DEFINE_EVENT(xfs_page_class, name, \
TP_PROTO(struct inode *inode, struct page *page, unsigned long off, \
unsigned int len), \
TP_ARGS(inode, page, off, len))
DEFINE_PAGE_EVENT(xfs_writepage);
DEFINE_PAGE_EVENT(xfs_releasepage);
DEFINE_PAGE_EVENT(xfs_invalidatepage);
DECLARE_EVENT_CLASS(xfs_readpage_class,
TP_PROTO(struct inode *inode, int nr_pages),
TP_ARGS(inode, nr_pages),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
__field(int, nr_pages)
),
TP_fast_assign(
__entry->dev = inode->i_sb->s_dev;
__entry->ino = inode->i_ino;
__entry->nr_pages = nr_pages;
),
TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->nr_pages)
)
#define DEFINE_READPAGE_EVENT(name) \
DEFINE_EVENT(xfs_readpage_class, name, \
TP_PROTO(struct inode *inode, int nr_pages), \
TP_ARGS(inode, nr_pages))
DEFINE_READPAGE_EVENT(xfs_vm_readpage);
DEFINE_READPAGE_EVENT(xfs_vm_readpages);
DECLARE_EVENT_CLASS(xfs_imap_class,
TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
int whichfork, struct xfs_bmbt_irec *irec),
......
......@@ -4,6 +4,7 @@
#include <linux/atomic.h>
#include <linux/bitmap.h>
#include <linux/blk_types.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/mm_types.h>
......@@ -12,6 +13,7 @@
struct address_space;
struct fiemap_extent_info;
struct inode;
struct iomap_writepage_ctx;
struct iov_iter;
struct kiocb;
struct page;
......@@ -21,28 +23,45 @@ struct vm_fault;
/*
* Types of block ranges for iomap mappings:
*/
#define IOMAP_HOLE 0x01 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */
#define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */
#define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */
#define IOMAP_INLINE 0x05 /* data inline in the inode */
#define IOMAP_HOLE 0 /* no blocks allocated, need allocation */
#define IOMAP_DELALLOC 1 /* delayed allocation blocks */
#define IOMAP_MAPPED 2 /* blocks allocated at @addr */
#define IOMAP_UNWRITTEN 3 /* blocks allocated at @addr in unwritten state */
#define IOMAP_INLINE 4 /* data inline in the inode */
/*
* Flags for all iomap mappings:
* Flags reported by the file system from iomap_begin:
*
* IOMAP_F_NEW indicates that the blocks have been newly allocated and need
* zeroing for areas that no data is copied to.
*
* IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
* written data and requires fdatasync to commit them to persistent storage.
* This needs to take into account metadata changes that *may* be made at IO
* completion, such as file size updates from direct IO.
*
* IOMAP_F_SHARED indicates that the blocks are shared, and will need to be
* unshared as part a write.
*
* IOMAP_F_MERGED indicates that the iomap contains the merge of multiple block
* mappings.
*
* IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
* buffer heads for this mapping.
*/
#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */
#define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */
#define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */
#define IOMAP_F_SIZE_CHANGED 0x08 /* file size has changed */
#define IOMAP_F_NEW 0x01
#define IOMAP_F_DIRTY 0x02
#define IOMAP_F_SHARED 0x04
#define IOMAP_F_MERGED 0x08
#define IOMAP_F_BUFFER_HEAD 0x10
/*
* Flags that only need to be reported for IOMAP_REPORT requests:
* Flags set by the core iomap code during operations:
*
* IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
* has changed as the result of this write operation.
*/
#define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */
#define IOMAP_F_SHARED 0x20 /* block shared with another file */
#define IOMAP_F_SIZE_CHANGED 0x100
/*
* Flags from 0x1000 up are for file system specific usage:
......@@ -110,7 +129,8 @@ struct iomap_ops {
* The actual length is returned in iomap->length.
*/
int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, struct iomap *iomap);
unsigned flags, struct iomap *iomap,
struct iomap *srcmap);
/*
* Commit and/or unreserve space previous allocated using iomap_begin.
......@@ -126,29 +146,12 @@ struct iomap_ops {
* Main iomap iterator function.
*/
typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
void *data, struct iomap *iomap);
void *data, struct iomap *iomap, struct iomap *srcmap);
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
unsigned flags, const struct iomap_ops *ops, void *data,
iomap_actor_t actor);
/*
* Structure allocate for each page when block size < PAGE_SIZE to track
* sub-page uptodate status and I/O completions.
*/
struct iomap_page {
atomic_t read_count;
atomic_t write_count;
DECLARE_BITMAP(uptodate, PAGE_SIZE / 512);
};
static inline struct iomap_page *to_iomap_page(struct page *page)
{
if (page_has_private(page))
return (struct iomap_page *)page_private(page);
return NULL;
}
ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
const struct iomap_ops *ops);
int iomap_readpage(struct page *page, const struct iomap_ops *ops);
......@@ -166,7 +169,7 @@ int iomap_migrate_page(struct address_space *mapping, struct page *newpage,
#else
#define iomap_migrate_page NULL
#endif
int iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len,
bool *did_zero, const struct iomap_ops *ops);
......@@ -183,6 +186,63 @@ loff_t iomap_seek_data(struct inode *inode, loff_t offset,
sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
const struct iomap_ops *ops);
/*
* Structure for writeback I/O completions.
*/
struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_type;
u16 io_flags; /* IOMAP_F_* */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
loff_t io_offset; /* offset in the file */
void *io_private; /* file system private data */
struct bio *io_bio; /* bio being built */
struct bio io_inline_bio; /* MUST BE LAST! */
};
struct iomap_writeback_ops {
/*
* Required, maps the blocks so that writeback can be performed on
* the range starting at offset.
*/
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
loff_t offset);
/*
* Optional, allows the file systems to perform actions just before
* submitting the bio and/or override the bio end_io handler for complex
* operations like copy on write extent manipulation or unwritten extent
* conversions.
*/
int (*prepare_ioend)(struct iomap_ioend *ioend, int status);
/*
* Optional, allows the file system to discard state on a page where
* we failed to submit any I/O.
*/
void (*discard_page)(struct page *page);
};
struct iomap_writepage_ctx {
struct iomap iomap;
struct iomap_ioend *ioend;
const struct iomap_writeback_ops *ops;
};
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
void iomap_ioend_try_merge(struct iomap_ioend *ioend,
struct list_head *more_ioends,
void (*merge_private)(struct iomap_ioend *ioend,
struct iomap_ioend *next));
void iomap_sort_ioends(struct list_head *ioend_list);
int iomap_writepage(struct page *page, struct writeback_control *wbc,
struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops);
int iomap_writepages(struct address_space *mapping,
struct writeback_control *wbc, struct iomap_writepage_ctx *wpc,
const struct iomap_writeback_ops *ops);
/*
* Flags for direct I/O ->end_io:
*/
......@@ -195,7 +255,8 @@ struct iomap_dio_ops {
};
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
bool wait_for_completion);
int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
#ifdef CONFIG_SWAP
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment