Commit 4759d386 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull DAX updates from Dan Williams:
 "The completion of Jan's DAX work for 4.10.

  As I mentioned in the libnvdimm-for-4.10 pull request, these are some
  final fixes for the DAX dirty-cacheline-tracking invalidation work
  that was merged through the -mm, ext4, and xfs trees in -rc1. These
  patches were prepared prior to the merge window, but we waited for
  4.10-rc1 to have a stable merge base after all the prerequisites were
  merged.

  Quoting Jan on the overall changes in these patches:

     "So I'd like all these 6 patches to go for rc2. The first three
      patches fix invalidation of exceptional DAX entries (a bug which
      is there for a long time) - without these patches data loss can
      occur on power failure even though user called fsync(2). The other
      three patches change locking of DAX faults so that ->iomap_begin()
      is called in a more relaxed locking context and we are safe to
      start a transaction there for ext4"

  These have received a build success notification from the kbuild
  robot, and pass the latest libnvdimm unit tests. There have not been
  any -next releases since -rc1, so they have not appeared there"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  ext4: Simplify DAX fault path
  dax: Call ->iomap_begin without entry lock during dax fault
  dax: Finish fault completely when loading holes
  dax: Avoid page invalidation races and unnecessary radix tree traversals
  mm: Invalidate DAX radix tree entries only if appropriate
  ext2: Return BH_New buffers for zeroed blocks
parents 238d1d0f 1db17542
...@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, ...@@ -451,16 +451,37 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
} }
static int __dax_invalidate_mapping_entry(struct address_space *mapping,
pgoff_t index, bool trunc)
{
int ret = 0;
void *entry;
struct radix_tree_root *page_tree = &mapping->page_tree;
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
if (!entry || !radix_tree_exceptional_entry(entry))
goto out;
if (!trunc &&
(radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
goto out;
radix_tree_delete(page_tree, index);
mapping->nrexceptional--;
ret = 1;
out:
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
/* /*
* Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
* entry to get unlocked before deleting it. * entry to get unlocked before deleting it.
*/ */
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
{ {
void *entry; int ret = __dax_invalidate_mapping_entry(mapping, index, true);
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
/* /*
* This gets called from truncate / punch_hole path. As such, the caller * This gets called from truncate / punch_hole path. As such, the caller
* must hold locks protecting against concurrent modifications of the * must hold locks protecting against concurrent modifications of the
...@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) ...@@ -468,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* caller has seen exceptional entry for this index, we better find it * caller has seen exceptional entry for this index, we better find it
* at that index as well... * at that index as well...
*/ */
if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { WARN_ON_ONCE(!ret);
spin_unlock_irq(&mapping->tree_lock); return ret;
return 0; }
}
radix_tree_delete(&mapping->page_tree, index); /*
* Invalidate exceptional DAX entry if easily possible. This handles DAX
* entries for invalidate_inode_pages() so we evict the entry only if we can
* do so without blocking.
*/
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
int ret = 0;
void *entry, **slot;
struct radix_tree_root *page_tree = &mapping->page_tree;
spin_lock_irq(&mapping->tree_lock);
entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
if (!entry || !radix_tree_exceptional_entry(entry) ||
slot_locked(mapping, slot))
goto out;
if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
goto out;
radix_tree_delete(page_tree, index);
mapping->nrexceptional--; mapping->nrexceptional--;
ret = 1;
out:
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
dax_wake_mapping_entry_waiter(mapping, index, entry, true); if (ret)
dax_wake_mapping_entry_waiter(mapping, index, entry, true);
return ret;
}
return 1; /*
* Invalidate exceptional DAX entry if it is clean.
*/
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index)
{
return __dax_invalidate_mapping_entry(mapping, index, false);
} }
/* /*
...@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) ...@@ -488,15 +539,16 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
* otherwise it will simply fall out of the page cache under memory * otherwise it will simply fall out of the page cache under memory
* pressure without ever having been dirtied. * pressure without ever having been dirtied.
*/ */
static int dax_load_hole(struct address_space *mapping, void *entry, static int dax_load_hole(struct address_space *mapping, void **entry,
struct vm_fault *vmf) struct vm_fault *vmf)
{ {
struct page *page; struct page *page;
int ret;
/* Hole page already exists? Return it... */ /* Hole page already exists? Return it... */
if (!radix_tree_exceptional_entry(entry)) { if (!radix_tree_exceptional_entry(*entry)) {
vmf->page = entry; page = *entry;
return VM_FAULT_LOCKED; goto out;
} }
/* This will replace locked radix tree entry with a hole page */ /* This will replace locked radix tree entry with a hole page */
...@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry, ...@@ -504,8 +556,17 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
vmf->gfp_mask | __GFP_ZERO); vmf->gfp_mask | __GFP_ZERO);
if (!page) if (!page)
return VM_FAULT_OOM; return VM_FAULT_OOM;
out:
vmf->page = page; vmf->page = page;
return VM_FAULT_LOCKED; ret = finish_fault(vmf);
vmf->page = NULL;
*entry = page;
if (!ret) {
/* Grab reference for PTE that is now referencing the page */
get_page(page);
return VM_FAULT_NOPAGE;
}
return ret;
} }
static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
...@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ...@@ -934,6 +995,17 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
return -EIO; return -EIO;
/*
* Write can allocate block for an area which has a hole page mapped
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
}
while (pos < end) { while (pos < end) {
unsigned offset = pos & (PAGE_SIZE - 1); unsigned offset = pos & (PAGE_SIZE - 1);
struct blk_dax_ctl dax = { 0 }; struct blk_dax_ctl dax = { 0 };
...@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, ...@@ -992,23 +1064,6 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE) if (iov_iter_rw(iter) == WRITE)
flags |= IOMAP_WRITE; flags |= IOMAP_WRITE;
/*
* Yes, even DAX files can have page cache attached to them: A zeroed
* page is inserted into the pagecache when we have to serve a write
* fault on a hole. It should never be dirtied and can simply be
* dropped from the pagecache once we get real data for the page.
*
* XXX: This is racy against mmap, and there's nothing we can do about
* it. We'll eventually need to shift this down even further so that
* we can check if we allocated blocks over a hole first.
*/
if (mapping->nrpages) {
ret = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT,
(pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
}
while (iov_iter_count(iter)) { while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
iter, dax_iomap_actor); iter, dax_iomap_actor);
...@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, ...@@ -1023,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
} }
EXPORT_SYMBOL_GPL(dax_iomap_rw); EXPORT_SYMBOL_GPL(dax_iomap_rw);
static int dax_fault_return(int error)
{
if (error == 0)
return VM_FAULT_NOPAGE;
if (error == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
}
/** /**
* dax_iomap_fault - handle a page fault on a DAX file * dax_iomap_fault - handle a page fault on a DAX file
* @vma: The virtual memory area where the fault occurred * @vma: The virtual memory area where the fault occurred
...@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -1055,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (pos >= i_size_read(inode)) if (pos >= i_size_read(inode))
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto out;
}
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE; flags |= IOMAP_WRITE;
...@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -1071,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
*/ */
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) if (error)
goto unlock_entry; return dax_fault_return(error);
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
error = -EIO; /* fs corruption? */ vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
}
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
vmf_ret = dax_fault_return(PTR_ERR(entry));
goto finish_iomap; goto finish_iomap;
} }
...@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -1096,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
} }
if (error) if (error)
goto finish_iomap; goto error_unlock_entry;
__SetPageUptodate(vmf->cow_page); __SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf); vmf_ret = finish_fault(vmf);
if (!vmf_ret) if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW; vmf_ret = VM_FAULT_DONE_COW;
goto finish_iomap; goto unlock_entry;
} }
switch (iomap.type) { switch (iomap.type) {
...@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -1114,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
} }
error = dax_insert_mapping(mapping, iomap.bdev, sector, error = dax_insert_mapping(mapping, iomap.bdev, sector,
PAGE_SIZE, &entry, vma, vmf); PAGE_SIZE, &entry, vma, vmf);
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY)
error = 0;
break; break;
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) { if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, entry, vmf); vmf_ret = dax_load_hole(mapping, &entry, vmf);
break; goto unlock_entry;
} }
/*FALLTHRU*/ /*FALLTHRU*/
default: default:
...@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -1128,31 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
break; break;
} }
error_unlock_entry:
vmf_ret = dax_fault_return(error) | major;
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap: finish_iomap:
if (ops->iomap_end) { if (ops->iomap_end) {
if (error || (vmf_ret & VM_FAULT_ERROR)) { int copied = PAGE_SIZE;
/* keep previous error */
ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, if (vmf_ret & VM_FAULT_ERROR)
&iomap); copied = 0;
} else { /*
error = ops->iomap_end(inode, pos, PAGE_SIZE, * The fault is done by now and there's no way back (other
PAGE_SIZE, flags, &iomap); * thread may be already happily using PTE we have installed).
} * Just ignore error from ->iomap_end since we cannot do much
} * with it.
unlock_entry: */
if (vmf_ret != VM_FAULT_LOCKED || error) ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
/* -EBUSY is fine, somebody else faulted on the same PTE */
if (error < 0 && error != -EBUSY)
return VM_FAULT_SIGBUS | major;
if (vmf_ret) {
WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
return vmf_ret;
} }
return VM_FAULT_NOPAGE | major; return vmf_ret;
} }
EXPORT_SYMBOL_GPL(dax_iomap_fault); EXPORT_SYMBOL_GPL(dax_iomap_fault);
...@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -1276,16 +1337,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((pgoff | PG_PMD_COLOUR) > max_pgoff) if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
goto fallback; goto fallback;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto fallback;
/* /*
* Note that we don't use iomap_apply here. We aren't doing I/O, only * Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way * setting up a mapping, so really we're using iomap_begin() as a way
...@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -1294,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
pos = (loff_t)pgoff << PAGE_SHIFT; pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error) if (error)
goto unlock_entry; goto fallback;
if (iomap.offset + iomap.length < pos + PMD_SIZE) if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap; goto finish_iomap;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto finish_iomap;
vmf.pgoff = pgoff; vmf.pgoff = pgoff;
vmf.flags = flags; vmf.flags = flags;
vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
...@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -1310,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (WARN_ON_ONCE(write)) if (WARN_ON_ONCE(write))
goto finish_iomap; goto unlock_entry;
result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
&entry); &entry);
break; break;
...@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -1319,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
break; break;
} }
unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap: finish_iomap:
if (ops->iomap_end) { if (ops->iomap_end) {
if (result == VM_FAULT_FALLBACK) { int copied = PMD_SIZE;
ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
&iomap); if (result == VM_FAULT_FALLBACK)
} else { copied = 0;
error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE, /*
iomap_flags, &iomap); * The fault is done by now and there's no way back (other
if (error) * thread may be already happily using PMD we have installed).
result = VM_FAULT_FALLBACK; * Just ignore error from ->iomap_end since we cannot do much
} * with it.
*/
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
} }
unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
fallback: fallback:
if (result == VM_FAULT_FALLBACK) { if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, pmd, address); split_huge_pmd(vma, pmd, address);
......
...@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode, ...@@ -751,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode,
mutex_unlock(&ei->truncate_mutex); mutex_unlock(&ei->truncate_mutex);
goto cleanup; goto cleanup;
} }
} else {
*new = true;
} }
*new = true;
ext2_splice_branch(inode, iblock, partial, indirect_blks, count); ext2_splice_branch(inode, iblock, partial, indirect_blks, count);
mutex_unlock(&ei->truncate_mutex); mutex_unlock(&ei->truncate_mutex);
......
...@@ -258,7 +258,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ...@@ -258,7 +258,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
int result; int result;
handle_t *handle = NULL;
struct inode *inode = file_inode(vma->vm_file); struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb; struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE; bool write = vmf->flags & FAULT_FLAG_WRITE;
...@@ -266,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) ...@@ -266,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (write) { if (write) {
sb_start_pagefault(sb); sb_start_pagefault(sb);
file_update_time(vma->vm_file); file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem); }
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, down_read(&EXT4_I(inode)->i_mmap_sem);
EXT4_DATA_TRANS_BLOCKS(sb)); result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
} else up_read(&EXT4_I(inode)->i_mmap_sem);
down_read(&EXT4_I(inode)->i_mmap_sem); if (write)
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else
result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops);
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb); sb_end_pagefault(sb);
} else
up_read(&EXT4_I(inode)->i_mmap_sem);
return result; return result;
} }
...@@ -292,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, ...@@ -292,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags) pmd_t *pmd, unsigned int flags)
{ {
int result; int result;
handle_t *handle = NULL;
struct inode *inode = file_inode(vma->vm_file); struct inode *inode = file_inode(vma->vm_file);
struct super_block *sb = inode->i_sb; struct super_block *sb = inode->i_sb;
bool write = flags & FAULT_FLAG_WRITE; bool write = flags & FAULT_FLAG_WRITE;
...@@ -300,27 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, ...@@ -300,27 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
if (write) { if (write) {
sb_start_pagefault(sb); sb_start_pagefault(sb);
file_update_time(vma->vm_file); file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
ext4_chunk_trans_blocks(inode,
PMD_SIZE / PAGE_SIZE));
} else
down_read(&EXT4_I(inode)->i_mmap_sem);
if (IS_ERR(handle))
result = VM_FAULT_SIGBUS;
else {
result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
&ext4_iomap_ops);
} }
down_read(&EXT4_I(inode)->i_mmap_sem);
if (write) { result = dax_iomap_pmd_fault(vma, addr, pmd, flags,
if (!IS_ERR(handle)) &ext4_iomap_ops);
ext4_journal_stop(handle); up_read(&EXT4_I(inode)->i_mmap_sem);
up_read(&EXT4_I(inode)->i_mmap_sem); if (write)
sb_end_pagefault(sb); sb_end_pagefault(sb);
} else
up_read(&EXT4_I(inode)->i_mmap_sem);
return result; return result;
} }
......
...@@ -41,6 +41,9 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, ...@@ -41,6 +41,9 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
struct iomap_ops *ops); struct iomap_ops *ops);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping, void dax_wake_mapping_entry_waiter(struct address_space *mapping,
pgoff_t index, void *entry, bool wake_all); pgoff_t index, void *entry, bool wake_all);
......
...@@ -24,20 +24,12 @@ ...@@ -24,20 +24,12 @@
#include <linux/rmap.h> #include <linux/rmap.h>
#include "internal.h" #include "internal.h"
static void clear_exceptional_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
pgoff_t index, void *entry) void *entry)
{ {
struct radix_tree_node *node; struct radix_tree_node *node;
void **slot; void **slot;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
if (dax_mapping(mapping)) {
dax_delete_mapping_entry(mapping, index);
return;
}
spin_lock_irq(&mapping->tree_lock); spin_lock_irq(&mapping->tree_lock);
/* /*
* Regular page slots are stabilized by the page lock even * Regular page slots are stabilized by the page lock even
...@@ -55,6 +47,56 @@ static void clear_exceptional_entry(struct address_space *mapping, ...@@ -55,6 +47,56 @@ static void clear_exceptional_entry(struct address_space *mapping,
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
} }
/*
* Unconditionally remove exceptional entry. Usually called from truncate path.
*/
static void truncate_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
if (dax_mapping(mapping)) {
dax_delete_mapping_entry(mapping, index);
return;
}
clear_shadow_entry(mapping, index, entry);
}
/*
* Invalidate exceptional entry if easily possible. This handles exceptional
* entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
* clean entries.
*/
static int invalidate_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return 1;
if (dax_mapping(mapping))
return dax_invalidate_mapping_entry(mapping, index);
clear_shadow_entry(mapping, index, entry);
return 1;
}
/*
* Invalidate exceptional entry if clean. This handles exceptional entries for
* invalidate_inode_pages2() so for DAX it evicts only clean entries.
*/
static int invalidate_exceptional_entry2(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return 1;
if (dax_mapping(mapping))
return dax_invalidate_mapping_entry_sync(mapping, index);
clear_shadow_entry(mapping, index, entry);
return 1;
}
/** /**
* do_invalidatepage - invalidate part or all of a page * do_invalidatepage - invalidate part or all of a page
* @page: the page which is affected * @page: the page which is affected
...@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping, ...@@ -262,7 +304,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
break; break;
if (radix_tree_exceptional_entry(page)) { if (radix_tree_exceptional_entry(page)) {
clear_exceptional_entry(mapping, index, page); truncate_exceptional_entry(mapping, index,
page);
continue; continue;
} }
...@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping, ...@@ -351,7 +394,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
} }
if (radix_tree_exceptional_entry(page)) { if (radix_tree_exceptional_entry(page)) {
clear_exceptional_entry(mapping, index, page); truncate_exceptional_entry(mapping, index,
page);
continue; continue;
} }
...@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, ...@@ -470,7 +514,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
break; break;
if (radix_tree_exceptional_entry(page)) { if (radix_tree_exceptional_entry(page)) {
clear_exceptional_entry(mapping, index, page); invalidate_exceptional_entry(mapping, index,
page);
continue; continue;
} }
...@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ...@@ -592,7 +637,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
break; break;
if (radix_tree_exceptional_entry(page)) { if (radix_tree_exceptional_entry(page)) {
clear_exceptional_entry(mapping, index, page); if (!invalidate_exceptional_entry2(mapping,
index, page))
ret = -EBUSY;
continue; continue;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment