Commit 735a2573 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] truncate/invalidate_inode_pages rewrite

Rewrite these functions to use gang lookup.

- This probably has similar performance to the old code in the common case.

- It will be vastly quicker than current code for the worst case
  (single-page truncate).

- invalidate_inode_pages() has been changed.  It used to use
  page_count(page) as the "is it mapped into pagetables" heuristic.  It
  now uses the (page->pte.direct != 0) heuristic.

- Removes the worst cause of scheduling latency in the kernel.

- It's a big code cleanup.

- invalidate_inode_pages() has been changed to take an address_space
  *, not an inode *.

- the maximum hold times for mapping->page_lock are enormously reduced,
  making it quite feasible to turn this into an irq-safe lock.  Which, it
  seems, is a requirement for sane AIO<->direct-io integration, as well
  as possibly other AIO things.

(Thanks Hugh for fixing a bug in this one as well).

(Christoph added some stuff too)
parent 55b40732
......@@ -443,7 +443,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
* We really want to use invalidate_inode_pages2() for
* that, but not until that's cleaned up.
*/
invalidate_inode_pages(bdev->bd_inode);
invalidate_inode_pages(bdev->bd_inode->i_mapping);
}
void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
......
......@@ -301,7 +301,7 @@ jffs_setattr(struct dentry *dentry, struct iattr *iattr)
inode->i_blocks = (inode->i_size + 511) >> 9;
if (len) {
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
}
inode->i_ctime = CURRENT_TIME;
inode->i_mtime = inode->i_ctime;
......@@ -1520,7 +1520,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
}
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty(inode);
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
out_isem:
return err;
......
......@@ -125,14 +125,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
* throught inode->i_sem or some other mechanism.
*/
if (page->index == 0)
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
unlock_page(page);
return 0;
error:
SetPageError(page);
kunmap(page);
unlock_page(page);
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
desc->error = error;
return -EIO;
}
......
......@@ -564,7 +564,7 @@ nfs_zap_caches(struct inode *inode)
NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
NFS_CACHEINV(inode);
......@@ -1130,7 +1130,7 @@ __nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
if (invalid) {
NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
} else if (time_after(jiffies, NFS_ATTRTIMEO_UPDATE(inode)+NFS_ATTRTIMEO(inode))) {
if ((NFS_ATTRTIMEO(inode) <<= 1) > NFS_MAXATTRTIMEO(inode))
......
......@@ -210,7 +210,7 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
(long) last_sz, (long) inode->i_size);
if (!S_ISDIR(inode->i_mode))
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
}
}
......@@ -274,7 +274,7 @@ smb_refresh_inode(struct dentry *dentry)
* But we do want to invalidate the caches ...
*/
if (!S_ISDIR(inode->i_mode))
invalidate_inode_pages(inode);
invalidate_inode_pages(inode->i_mapping);
else
smb_invalid_dir_cache(inode);
error = -EIO;
......
......@@ -1140,7 +1140,7 @@ extern int full_check_disk_change(struct block_device *);
extern int __check_disk_change(dev_t);
extern int invalidate_inodes(struct super_block *);
extern int invalidate_device(kdev_t, int);
extern void invalidate_inode_pages(struct inode *);
extern void invalidate_inode_pages(struct address_space *mapping);
extern void invalidate_inode_pages2(struct address_space *mapping);
extern void write_inode_now(struct inode *, int);
extern int filemap_fdatawrite(struct address_space *);
......
......@@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping,
unsigned long index);
extern struct page * find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask);
extern unsigned int find_get_pages(struct address_space *mapping,
pgoff_t start, unsigned int nr_pages,
struct page **pages);
/*
* Returns locked page at given index in given cache, creating it if needed.
......
......@@ -8,6 +8,7 @@
#define PAGEVEC_SIZE 16
struct page;
struct address_space;
struct pagevec {
unsigned nr;
......@@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec);
void lru_add_drain(void);
void pagevec_deactivate_inactive(struct pagevec *pvec);
void pagevec_strip(struct pagevec *pvec);
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages);
static inline void pagevec_init(struct pagevec *pvec)
{
......
......@@ -9,6 +9,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
pdflush.o page-writeback.o rmap.o madvise.o vcache.o
pdflush.o page-writeback.o rmap.o madvise.o vcache.o \
truncate.o
include $(TOPDIR)/Rules.make
This diff is collapsed.
......@@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec)
}
}
/**
* pagevec_lookup - gang pagecache lookup
* @pvec: Where the resulting pages are placed
* @mapping: The address_space to search
* @start: The starting page index
* @nr_pages: The maximum number of pages
*
* pagevec_lookup() will search for and return a group of up to @nr_pages pages
* in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a
* reference against the pages in @pvec.
*
* The search returns a group of mapping-contiguous pages with ascending
* indexes. There may be holes in the indices due to not-present pages.
*
* pagevec_lookup() returns the number of pages which were found.
*/
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages)
{
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
/*
* Perform any setup for the swap system
*/
......
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
* Copyright (C) 2002, Linus Torvalds
*
* 10Sep2002 akpm@zip.com.au
* Initial version.
*/
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/buffer_head.h> /* grr. try_to_release_page,
block_invalidatepage */
static int do_invalidatepage(struct page *page, unsigned long offset)
{
int (*invalidatepage)(struct page *, unsigned long);
invalidatepage = page->mapping->a_ops->invalidatepage;
if (invalidatepage == NULL)
invalidatepage = block_invalidatepage;
return (*invalidatepage)(page, offset);
}
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
if (PagePrivate(page))
do_invalidatepage(page, partial);
}
/*
* If truncate cannot remove the fs-private metadata from the page, the page
* becomes anonymous. It will be left on the LRU and may even be mapped into
* user pagetables if we're racing with filemap_nopage().
*/
static void truncate_complete_page(struct page *page)
{
if (PagePrivate(page))
do_invalidatepage(page, 0);
clear_page_dirty(page);
ClearPageUptodate(page);
remove_from_page_cache(page);
page_cache_release(page);
}
/**
* truncate_inode_pages - truncate *all* the pages from an offset
* @mapping: mapping to truncate
* @lstart: offset from which to truncate
*
* Truncate the page cache at a set offset, removing the pages that are beyond
* that offset (and zeroing out partial pages).
*
* Truncate takes two passes - the first pass is nonblocking. It will not
* block on page locks and it will not block on writeback. The second pass
* will wait. This is to prevent as much IO as possible in the affected region.
* The first pass will remove most pages, so the search cost of the second pass
* is low.
*
* Called under (and serialised by) inode->i_sem.
*/
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
struct pagevec pvec;
pgoff_t next;
int i;
pagevec_init(&pvec);
next = start;
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
next = page->index + 1;
if (TestSetPageLocked(page))
continue;
if (PageWriteback(page)) {
unlock_page(page);
continue;
}
truncate_complete_page(page);
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
if (partial) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
wait_on_page_writeback(page);
truncate_partial_page(page, partial);
unlock_page(page);
page_cache_release(page);
}
}
next = start;
for ( ; ; ) {
if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
if (next == start)
break;
next = start;
continue;
}
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
lock_page(page);
wait_on_page_writeback(page);
next = page->index + 1;
truncate_complete_page(page);
unlock_page(page);
}
pagevec_release(&pvec);
}
if (lstart == 0 && mapping->nrpages)
printk("%s: I goofed!\n", __FUNCTION__);
}
/**
* invalidate_inode_pages - Invalidate all the unlocked pages of one inode
* @inode: the inode which pages we want to invalidate
*
* This function only removes the unlocked pages, if you want to
* remove all the pages of one inode, you must call truncate_inode_pages.
*
* invalidate_inode_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
*/
void invalidate_inode_pages(struct address_space *mapping)
{
struct pagevec pvec;
pgoff_t next = 0;
int i;
pagevec_init(&pvec);
while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
if (TestSetPageLocked(page)) {
next++;
continue;
}
next = page->index + 1;
if (PageDirty(page) || PageWriteback(page))
goto unlock;
if (PagePrivate(page) && !try_to_release_page(page, 0))
goto unlock;
if (page_mapped(page))
goto unlock;
truncate_complete_page(page);
unlock:
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
}
/**
* invalidate_inode_pages2 - remove all unmapped pages from an address_space
* @mapping - the address_space
*
* invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
* where the page is seen to be mapped into process pagetables. In that case,
* the page is marked clean but is left attached to its address_space.
*
* FIXME: invalidate_inode_pages2() is probably trivially livelockable.
*/
void invalidate_inode_pages2(struct address_space *mapping)
{
struct pagevec pvec;
pgoff_t next = 0;
int i;
pagevec_init(&pvec);
while (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
lock_page(page);
if (page->mapping) { /* truncate race? */
wait_on_page_writeback(page);
next = page->index + 1;
if (page_mapped(page))
clear_page_dirty(page);
else
truncate_complete_page(page);
}
unlock_page(page);
}
pagevec_release(&pvec);
cond_resched();
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment