Commit cccbce67 authored by Dan Williams

filesystem-dax: convert to dax_direct_access()

Now that a dax_device is plumbed through all dax-capable drivers, we can
switch from block_device_operations to dax_operations for invoking
->direct_access.

This also lets us kill off some usages of struct blk_dax_ctl on the way
to its eventual removal.
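
Every converted call site ends up following the same sequence: resolve the
block-device sector to a page offset into the dax_device with
bdev_dax_pgoff(), then call dax_direct_access() under dax_read_lock(). A
minimal sketch of that pattern follows (the wrapper function and its name
are illustrative only, not part of this patch):

/* Illustrative only: the access pattern used by the converted sites below. */
static int dax_example_access(struct block_device *bdev,
                struct dax_device *dax_dev, sector_t sector, size_t size)
{
        pgoff_t pgoff;
        void *kaddr;
        pfn_t pfn;
        long rc;
        int id;

        /* translate the block-device sector into a pgoff of the dax_device */
        rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
        if (rc)
                return rc;

        /* kaddr/pfn from ->direct_access are only valid under the lock */
        id = dax_read_lock();
        rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
        if (rc < 0) {
                dax_read_unlock(id);
                return rc;
        }

        /* ... use kaddr / pfn while still holding the dax read lock ... */

        dax_read_unlock(id);
        return 0;
}

bdev_dax_pgoff() folds in the block device's partition offset, and the dax
read lock keeps the dax_device valid for the duration of the access.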
Suggested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
parent a41fe02b
fs/dax.c

@@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void)
 }
 fs_initcall(init_dax_wait_table);

-static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
-{
-        struct request_queue *q = bdev->bd_queue;
-        long rc = -EIO;
-
-        dax->addr = ERR_PTR(-EIO);
-        if (blk_queue_enter(q, true) != 0)
-                return rc;
-
-        rc = bdev_direct_access(bdev, dax);
-        if (rc < 0) {
-                dax->addr = ERR_PTR(rc);
-                blk_queue_exit(q);
-                return rc;
-        }
-        return rc;
-}
-
-static void dax_unmap_atomic(struct block_device *bdev,
-                const struct blk_dax_ctl *dax)
-{
-        if (IS_ERR(dax->addr))
-                return;
-        blk_queue_exit(bdev->bd_queue);
-}
-
 static int dax_is_pmd_entry(void *entry)
 {
         return (unsigned long)entry & RADIX_DAX_PMD;
@@ -553,21 +527,30 @@ static int dax_load_hole(struct address_space *mapping, void **entry,
         return ret;
 }

-static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size,
-                struct page *to, unsigned long vaddr)
+static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
+                sector_t sector, size_t size, struct page *to,
+                unsigned long vaddr)
 {
-        struct blk_dax_ctl dax = {
-                .sector = sector,
-                .size = size,
-        };
-        void *vto;
-
-        if (dax_map_atomic(bdev, &dax) < 0)
-                return PTR_ERR(dax.addr);
+        void *vto, *kaddr;
+        pgoff_t pgoff;
+        pfn_t pfn;
+        long rc;
+        int id;
+
+        rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+        if (rc)
+                return rc;
+
+        id = dax_read_lock();
+        rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+        if (rc < 0) {
+                dax_read_unlock(id);
+                return rc;
+        }
         vto = kmap_atomic(to);
-        copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
+        copy_user_page(vto, (void __force *)kaddr, vaddr, to);
         kunmap_atomic(vto);
-        dax_unmap_atomic(bdev, &dax);
+        dax_read_unlock(id);
         return 0;
 }
@@ -735,12 +718,16 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
 }

 static int dax_writeback_one(struct block_device *bdev,
-                struct address_space *mapping, pgoff_t index, void *entry)
+                struct dax_device *dax_dev, struct address_space *mapping,
+                pgoff_t index, void *entry)
 {
         struct radix_tree_root *page_tree = &mapping->page_tree;
-        struct blk_dax_ctl dax;
-        void *entry2, **slot;
-        int ret = 0;
+        void *entry2, **slot, *kaddr;
+        long ret = 0, id;
+        sector_t sector;
+        pgoff_t pgoff;
+        size_t size;
+        pfn_t pfn;

         /*
          * A page got tagged dirty in DAX mapping? Something is seriously
@@ -789,26 +776,29 @@ static int dax_writeback_one(struct block_device *bdev,
          * 'entry'. This allows us to flush for PMD_SIZE and not have to
          * worry about partial PMD writebacks.
          */
-        dax.sector = dax_radix_sector(entry);
-        dax.size = PAGE_SIZE << dax_radix_order(entry);
+        sector = dax_radix_sector(entry);
+        size = PAGE_SIZE << dax_radix_order(entry);

+        id = dax_read_lock();
+        ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+        if (ret)
+                goto dax_unlock;
+
         /*
-         * We cannot hold tree_lock while calling dax_map_atomic() because it
-         * eventually calls cond_resched().
+         * dax_direct_access() may sleep, so cannot hold tree_lock over
+         * its invocation.
          */
-        ret = dax_map_atomic(bdev, &dax);
-        if (ret < 0) {
-                put_locked_mapping_entry(mapping, index, entry);
-                return ret;
-        }
+        ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
+        if (ret < 0)
+                goto dax_unlock;

-        if (WARN_ON_ONCE(ret < dax.size)) {
+        if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
                 ret = -EIO;
-                goto unmap;
+                goto dax_unlock;
         }

-        dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn));
-        wb_cache_pmem(dax.addr, dax.size);
+        dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
+        wb_cache_pmem(kaddr, size);
         /*
          * After we have flushed the cache, we can clear the dirty tag. There
          * cannot be new dirty data in the pfn after the flush has completed as
@@ -818,8 +808,8 @@ static int dax_writeback_one(struct block_device *bdev,
         spin_lock_irq(&mapping->tree_lock);
         radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
         spin_unlock_irq(&mapping->tree_lock);
- unmap:
-        dax_unmap_atomic(bdev, &dax);
+ dax_unlock:
+        dax_read_unlock(id);
         put_locked_mapping_entry(mapping, index, entry);
         return ret;
@@ -840,6 +830,7 @@ int dax_writeback_mapping_range(struct address_space *mapping,
         struct inode *inode = mapping->host;
         pgoff_t start_index, end_index;
         pgoff_t indices[PAGEVEC_SIZE];
+        struct dax_device *dax_dev;
         struct pagevec pvec;
         bool done = false;
         int i, ret = 0;
@@ -850,6 +841,10 @@ int dax_writeback_mapping_range(struct address_space *mapping,
         if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
                 return 0;

+        dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
+        if (!dax_dev)
+                return -EIO;
+
         start_index = wbc->range_start >> PAGE_SHIFT;
         end_index = wbc->range_end >> PAGE_SHIFT;
@@ -870,38 +865,49 @@ int dax_writeback_mapping_range(struct address_space *mapping,
                                 break;
                         }

-                        ret = dax_writeback_one(bdev, mapping, indices[i],
-                                        pvec.pages[i]);
-                        if (ret < 0)
+                        ret = dax_writeback_one(bdev, dax_dev, mapping,
+                                        indices[i], pvec.pages[i]);
+                        if (ret < 0) {
+                                put_dax(dax_dev);
                                 return ret;
+                        }
                 }
         }
+        put_dax(dax_dev);
         return 0;
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);

 static int dax_insert_mapping(struct address_space *mapping,
-                struct block_device *bdev, sector_t sector, size_t size,
-                void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf)
+                struct block_device *bdev, struct dax_device *dax_dev,
+                sector_t sector, size_t size, void **entryp,
+                struct vm_area_struct *vma, struct vm_fault *vmf)
 {
         unsigned long vaddr = vmf->address;
-        struct blk_dax_ctl dax = {
-                .sector = sector,
-                .size = size,
-        };
-        void *ret;
         void *entry = *entryp;
+        void *ret, *kaddr;
+        pgoff_t pgoff;
+        int id, rc;
+        pfn_t pfn;

-        if (dax_map_atomic(bdev, &dax) < 0)
-                return PTR_ERR(dax.addr);
-        dax_unmap_atomic(bdev, &dax);
+        rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+        if (rc)
+                return rc;

-        ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0);
+        id = dax_read_lock();
+        rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+        if (rc < 0) {
+                dax_read_unlock(id);
+                return rc;
+        }
+        dax_read_unlock(id);
+
+        ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
         if (IS_ERR(ret))
                 return PTR_ERR(ret);
         *entryp = ret;

-        return vm_insert_mixed(vma, vaddr, dax.pfn);
+        return vm_insert_mixed(vma, vaddr, pfn);
 }
/** /**
...@@ -950,24 +956,34 @@ static bool dax_range_is_aligned(struct block_device *bdev, ...@@ -950,24 +956,34 @@ static bool dax_range_is_aligned(struct block_device *bdev,
return true; return true;
} }
int __dax_zero_page_range(struct block_device *bdev, sector_t sector, int __dax_zero_page_range(struct block_device *bdev,
unsigned int offset, unsigned int length) struct dax_device *dax_dev, sector_t sector,
unsigned int offset, unsigned int size)
{ {
struct blk_dax_ctl dax = { if (dax_range_is_aligned(bdev, offset, size)) {
.sector = sector, sector_t start_sector = sector + (offset >> 9);
.size = PAGE_SIZE,
};
if (dax_range_is_aligned(bdev, offset, length)) {
sector_t start_sector = dax.sector + (offset >> 9);
return blkdev_issue_zeroout(bdev, start_sector, return blkdev_issue_zeroout(bdev, start_sector,
length >> 9, GFP_NOFS, true); size >> 9, GFP_NOFS, true);
} else { } else {
if (dax_map_atomic(bdev, &dax) < 0) pgoff_t pgoff;
return PTR_ERR(dax.addr); long rc, id;
clear_pmem(dax.addr + offset, length); void *kaddr;
dax_unmap_atomic(bdev, &dax); pfn_t pfn;
rc = bdev_dax_pgoff(bdev, sector, size, &pgoff);
if (rc)
return rc;
id = dax_read_lock();
rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr,
&pfn);
if (rc < 0) {
dax_read_unlock(id);
return rc;
}
clear_pmem(kaddr + offset, size);
dax_read_unlock(id);
} }
return 0; return 0;
} }
@@ -982,9 +998,12 @@ static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                 struct iomap *iomap)
 {
+        struct block_device *bdev = iomap->bdev;
+        struct dax_device *dax_dev = iomap->dax_dev;
         struct iov_iter *iter = data;
         loff_t end = pos + length, done = 0;
         ssize_t ret = 0;
+        int id;

         if (iov_iter_rw(iter) == READ) {
                 end = min(end, i_size_read(inode));
@@ -1009,34 +1028,42 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                                         (end - 1) >> PAGE_SHIFT);
         }

+        id = dax_read_lock();
         while (pos < end) {
                 unsigned offset = pos & (PAGE_SIZE - 1);
-                struct blk_dax_ctl dax = { 0 };
+                const size_t size = ALIGN(length + offset, PAGE_SIZE);
+                const sector_t sector = dax_iomap_sector(iomap, pos);
                 ssize_t map_len;
+                pgoff_t pgoff;
+                void *kaddr;
+                pfn_t pfn;

                 if (fatal_signal_pending(current)) {
                         ret = -EINTR;
                         break;
                 }

-                dax.sector = dax_iomap_sector(iomap, pos);
-                dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK;
-                map_len = dax_map_atomic(iomap->bdev, &dax);
+                ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
+                if (ret)
+                        break;
+
+                map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
+                                &kaddr, &pfn);
                 if (map_len < 0) {
                         ret = map_len;
                         break;
                 }

-                dax.addr += offset;
+                map_len = PFN_PHYS(map_len);
+                kaddr += offset;
                 map_len -= offset;
                 if (map_len > end - pos)
                         map_len = end - pos;

                 if (iov_iter_rw(iter) == WRITE)
-                        map_len = copy_from_iter_pmem(dax.addr, map_len, iter);
+                        map_len = copy_from_iter_pmem(kaddr, map_len, iter);
                 else
-                        map_len = copy_to_iter(dax.addr, map_len, iter);
-                dax_unmap_atomic(iomap->bdev, &dax);
+                        map_len = copy_to_iter(kaddr, map_len, iter);
                 if (map_len <= 0) {
                         ret = map_len ? map_len : -EFAULT;
                         break;
@@ -1046,6 +1073,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
                 length -= map_len;
                 done += map_len;
         }
+        dax_read_unlock(id);

         return done ? done : ret;
 }
@@ -1152,8 +1180,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                         clear_user_highpage(vmf->cow_page, vaddr);
                         break;
                 case IOMAP_MAPPED:
-                        error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE,
-                                        vmf->cow_page, vaddr);
+                        error = copy_user_dax(iomap.bdev, iomap.dax_dev,
+                                        sector, PAGE_SIZE, vmf->cow_page, vaddr);
                         break;
                 default:
                         WARN_ON_ONCE(1);
@@ -1178,8 +1206,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
                         mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT);
                         major = VM_FAULT_MAJOR;
                 }
-                error = dax_insert_mapping(mapping, iomap.bdev, sector,
-                                PAGE_SIZE, &entry, vmf->vma, vmf);
+                error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev,
+                                sector, PAGE_SIZE, &entry, vmf->vma, vmf);
                 /* -EBUSY is fine, somebody else faulted on the same PTE */
                 if (error == -EBUSY)
                         error = 0;
@@ -1229,41 +1257,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap,
                 loff_t pos, void **entryp)
 {
         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+        const sector_t sector = dax_iomap_sector(iomap, pos);
+        struct dax_device *dax_dev = iomap->dax_dev;
         struct block_device *bdev = iomap->bdev;
         struct inode *inode = mapping->host;
-        struct blk_dax_ctl dax = {
-                .sector = dax_iomap_sector(iomap, pos),
-                .size = PMD_SIZE,
-        };
-        long length = dax_map_atomic(bdev, &dax);
-        void *ret = NULL;
-
-        if (length < 0) /* dax_map_atomic() failed */
+        const size_t size = PMD_SIZE;
+        void *ret = NULL, *kaddr;
+        long length = 0;
+        pgoff_t pgoff;
+        pfn_t pfn;
+        int id;
+
+        if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
                 goto fallback;
-        if (length < PMD_SIZE)
-                goto unmap_fallback;
-        if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR)
-                goto unmap_fallback;
-        if (!pfn_t_devmap(dax.pfn))
-                goto unmap_fallback;
-        dax_unmap_atomic(bdev, &dax);

-        ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector,
+        id = dax_read_lock();
+        length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
+        if (length < 0)
+                goto unlock_fallback;
+        length = PFN_PHYS(length);
+
+        if (length < size)
+                goto unlock_fallback;
+        if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
+                goto unlock_fallback;
+        if (!pfn_t_devmap(pfn))
+                goto unlock_fallback;
+        dax_read_unlock(id);
+
+        ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector,
                         RADIX_DAX_PMD);
         if (IS_ERR(ret))
                 goto fallback;
         *entryp = ret;

-        trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret);
+        trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
         return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
-                        dax.pfn, vmf->flags & FAULT_FLAG_WRITE);
+                        pfn, vmf->flags & FAULT_FLAG_WRITE);

- unmap_fallback:
-        dax_unmap_atomic(bdev, &dax);
+ unlock_fallback:
+        dax_read_unlock(id);
 fallback:
-        trace_dax_pmd_insert_mapping_fallback(inode, vmf, length,
-                        dax.pfn, ret);
+        trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
         return VM_FAULT_FALLBACK;
 }
fs/iomap.c

@@ -360,7 +360,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
         sector_t sector = iomap->blkno +
                 (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9);

-        return __dax_zero_page_range(iomap->bdev, sector, offset, bytes);
+        return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector,
+                        offset, bytes);
 }

 static loff_t
include/linux/dax.h

@@ -70,11 +70,13 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
                 pgoff_t index, void *entry, bool wake_all);

 #ifdef CONFIG_FS_DAX
-int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
+int __dax_zero_page_range(struct block_device *bdev,
+                struct dax_device *dax_dev, sector_t sector,
                 unsigned int offset, unsigned int length);
 #else
 static inline int __dax_zero_page_range(struct block_device *bdev,
-                sector_t sector, unsigned int offset, unsigned int length)
+                struct dax_device *dax_dev, sector_t sector,
+                unsigned int offset, unsigned int length)
 {
         return -ENXIO;
 }