Commit 735a2573 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] truncate/invalidate_inode_pages rewrite

Rewrite these functions to use gang lookup.

- This probably has similar performance to the old code in the common case.

- It will be vastly quicker than current code for the worst case
  (single-page truncate).

- invalidate_inode_pages() has been changed.  It used to use
  page_count(page) as the "is it mapped into pagetables" heuristic.  It
  now uses the (page->pte.direct != 0) heuristic.

- Removes the worst cause of scheduling latency in the kernel.

- It's a big code cleanup.

- invalidate_inode_pages() has been changed to take an address_space
  *, not an inode *.

- the maximum hold times for mapping->page_lock are enormously reduced,
  making it quite feasible to turn this into an irq-safe lock.  Which, it
  seems, is a requirement for sane AIO<->direct-io integration, as well
  as possibly other AIO things.

(Thanks Hugh for fixing a bug in this one as well).

(Christoph added some stuff too)
parent 55b40732
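
Every function touched below ends up with the same loop shape. As orientation only (this sketch is not part of the patch; the helper name is illustrative), the gang-lookup walk that replaces the old page-list scans looks like this, using only interfaces the patch itself introduces:

#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/sched.h>	/* cond_resched */

/*
 * Sketch only: visit every cached page of `mapping' in ascending index
 * order, up to PAGEVEC_SIZE pages per radix-tree gang lookup.
 * mapping->page_lock is held only inside pagevec_lookup(), never across
 * the per-page work - which is why the worst-case hold time collapses
 * and cond_resched() becomes possible.
 */
static void walk_mapping(struct address_space *mapping)
{
        struct pagevec pvec;
        pgoff_t next = 0;
        int i;

        pagevec_init(&pvec);
        while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        next = page->index + 1;	/* resume after this page */
                        /* ... per-page work: lock, test, truncate ... */
                }
                pagevec_release(&pvec);	/* drop the lookup's page references */
                cond_resched();
        }
}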
@@ -443,7 +443,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
 	 * We really want to use invalidate_inode_pages2() for
 	 * that, but not until that's cleaned up.
 	 */
-	invalidate_inode_pages(bdev->bd_inode);
+	invalidate_inode_pages(bdev->bd_inode->i_mapping);
 }

 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
@@ -301,7 +301,7 @@ jffs_setattr(struct dentry *dentry, struct iattr *iattr)
 		inode->i_blocks = (inode->i_size + 511) >> 9;
 		if (len) {
-			invalidate_inode_pages(inode);
+			invalidate_inode_pages(inode->i_mapping);
 		}

 		inode->i_ctime = CURRENT_TIME;
 		inode->i_mtime = inode->i_ctime;
@@ -1520,7 +1520,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count,
 	}
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty(inode);
-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);

 out_isem:
 	return err;
@@ -125,14 +125,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
 	 * through inode->i_sem or some other mechanism.
 	 */
 	if (page->index == 0)
-		invalidate_inode_pages(inode);
+		invalidate_inode_pages(inode->i_mapping);
 	unlock_page(page);
 	return 0;
 error:
 	SetPageError(page);
 	kunmap(page);
 	unlock_page(page);
-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);
 	desc->error = error;
 	return -EIO;
 }
@@ -564,7 +564,7 @@ nfs_zap_caches(struct inode *inode)
 	NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 	NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-	invalidate_inode_pages(inode);
+	invalidate_inode_pages(inode->i_mapping);
 	memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	NFS_CACHEINV(inode);
@@ -1130,7 +1130,7 @@ __nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr)
 	if (invalid) {
 		NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode);
 		NFS_ATTRTIMEO_UPDATE(inode) = jiffies;
-		invalidate_inode_pages(inode);
+		invalidate_inode_pages(inode->i_mapping);
 		memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode)));
 	} else if (time_after(jiffies, NFS_ATTRTIMEO_UPDATE(inode)+NFS_ATTRTIMEO(inode))) {
 		if ((NFS_ATTRTIMEO(inode) <<= 1) > NFS_MAXATTRTIMEO(inode))
@@ -210,7 +210,7 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr)
 			(long) last_sz, (long) inode->i_size);
 		if (!S_ISDIR(inode->i_mode))
-			invalidate_inode_pages(inode);
+			invalidate_inode_pages(inode->i_mapping);
 	}
 }
@@ -274,7 +274,7 @@ smb_refresh_inode(struct dentry *dentry)
 		 * But we do want to invalidate the caches ...
 		 */
 		if (!S_ISDIR(inode->i_mode))
-			invalidate_inode_pages(inode);
+			invalidate_inode_pages(inode->i_mapping);
 		else
 			smb_invalid_dir_cache(inode);
 		error = -EIO;
@@ -1140,7 +1140,7 @@ extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
-extern void invalidate_inode_pages(struct inode *);
+extern void invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
@@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping,
 				unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, unsigned int gfp_mask);
+extern unsigned int find_get_pages(struct address_space *mapping,
+				pgoff_t start, unsigned int nr_pages,
+				struct page **pages);

 /*
  * Returns locked page at given index in given cache, creating it if needed.
@@ -8,6 +8,7 @@
 #define PAGEVEC_SIZE	16

 struct page;
+struct address_space;

 struct pagevec {
 	unsigned nr;
@@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec);
 void lru_add_drain(void);
 void pagevec_deactivate_inactive(struct pagevec *pvec);
 void pagevec_strip(struct pagevec *pvec);
+unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned int nr_pages);

 static inline void pagevec_init(struct pagevec *pvec)
 {
@@ -9,6 +9,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
 	    vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
 	    page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \
 	    shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
-	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o
+	    pdflush.o page-writeback.o rmap.o madvise.o vcache.o \
+	    truncate.o

 include $(TOPDIR)/Rules.make
@@ -104,341 +104,6 @@ static inline int sync_page(struct page *page)
 	return 0;
 }

-/**
- * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
- * @inode: the inode which pages we want to invalidate
- *
- * This function only removes the unlocked pages, if you want to
- * remove all the pages of one inode, you must call truncate_inode_pages.
- */
-void invalidate_inode_pages(struct inode * inode)
-{
-	struct list_head *head, *curr;
-	struct page * page;
-	struct address_space *mapping = inode->i_mapping;
-	struct pagevec pvec;
-
-	head = &mapping->clean_pages;
-	pagevec_init(&pvec);
-	write_lock(&mapping->page_lock);
-	curr = head->next;
-
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-		curr = curr->next;
-
-		/* We cannot invalidate something in dirty.. */
-		if (PageDirty(page))
-			continue;
-
-		/* ..or locked */
-		if (TestSetPageLocked(page))
-			continue;
-
-		if (PagePrivate(page) && !try_to_release_page(page, 0))
-			goto unlock;
-
-		if (page_count(page) != 1)
-			goto unlock;
-
-		__remove_from_page_cache(page);
-		unlock_page(page);
-		if (!pagevec_add(&pvec, page))
-			__pagevec_release(&pvec);
-		continue;
-unlock:
-		unlock_page(page);
-		continue;
-	}
-	write_unlock(&mapping->page_lock);
-	pagevec_release(&pvec);
-}
-
-static int do_invalidatepage(struct page *page, unsigned long offset)
-{
-	int (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage;
-	if (invalidatepage)
-		return (*invalidatepage)(page, offset);
-	return block_invalidatepage(page, offset);
-}
-
-static inline void truncate_partial_page(struct page *page, unsigned partial)
-{
-	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
-	if (PagePrivate(page))
-		do_invalidatepage(page, partial);
-}
-
-/*
- * If truncate cannot remove the fs-private metadata from the page, the page
- * becomes anonymous.  It will be left on the LRU and may even be mapped into
- * user pagetables if we're racing with filemap_nopage().
- */
-static void truncate_complete_page(struct page *page)
-{
-	if (PagePrivate(page))
-		do_invalidatepage(page, 0);
-	clear_page_dirty(page);
-	ClearPageUptodate(page);
-	remove_from_page_cache(page);
-	page_cache_release(page);
-}
-
-/*
- * Writeback walks the page list in ->prev order, which is low-to-high file
- * offsets in the common case where the file was written linearly. So truncate
- * walks the page list in the opposite (->next) direction, to avoid getting
- * into lockstep with writeback's cursor.  To prune as many pages as possible
- * before the truncate cursor collides with the writeback cursor.
- */
-static int truncate_list_pages(struct address_space *mapping,
-		struct list_head *head, unsigned long start, unsigned *partial)
-{
-	struct list_head *curr;
-	struct page * page;
-	int unlocked = 0;
-	struct pagevec release_pvec;
-
-	pagevec_init(&release_pvec);
-restart:
-	curr = head->next;
-	while (curr != head) {
-		unsigned long offset;
-
-		page = list_entry(curr, struct page, list);
-		offset = page->index;
-
-		/* Is one of the pages to truncate? */
-		if ((offset >= start) || (*partial && (offset + 1) == start)) {
-			int failed;
-
-			page_cache_get(page);
-			failed = TestSetPageLocked(page);
-			if (!failed && PageWriteback(page)) {
-				unlock_page(page);
-				list_del(head);
-				list_add_tail(head, curr);
-				write_unlock(&mapping->page_lock);
-				wait_on_page_writeback(page);
-				if (!pagevec_add(&release_pvec, page))
-					__pagevec_release(&release_pvec);
-				unlocked = 1;
-				write_lock(&mapping->page_lock);
-				goto restart;
-			}
-
-			list_del(head);
-			if (!failed)		/* Restart after this page */
-				list_add(head, curr);
-			else			/* Restart on this page */
-				list_add_tail(head, curr);
-			write_unlock(&mapping->page_lock);
-			unlocked = 1;
-
-			if (!failed) {
-				if (*partial && (offset + 1) == start) {
-					truncate_partial_page(page, *partial);
-					*partial = 0;
-				} else {
-					truncate_complete_page(page);
-				}
-				unlock_page(page);
-			} else {
-				wait_on_page_locked(page);
-			}
-			if (!pagevec_add(&release_pvec, page))
-				__pagevec_release(&release_pvec);
-			cond_resched();
-			write_lock(&mapping->page_lock);
-			goto restart;
-		}
-		curr = curr->next;
-	}
-	if (pagevec_count(&release_pvec)) {
-		write_unlock(&mapping->page_lock);
-		pagevec_release(&release_pvec);
-		write_lock(&mapping->page_lock);
-		unlocked = 1;
-	}
-	return unlocked;
-}
-
-/*
- * Unconditionally clean all pages outside `start'.  The mapping lock
- * must be held.
- */
-static void clean_list_pages(struct address_space *mapping,
-		struct list_head *head, unsigned long start)
-{
-	struct page *page;
-	struct list_head *curr;
-
-	for (curr = head->next; curr != head; curr = curr->next) {
-		page = list_entry(curr, struct page, list);
-		if (page->index > start)
-			clear_page_dirty(page);
-	}
-}
-
-/**
- * truncate_inode_pages - truncate *all* the pages from an offset
- * @mapping: mapping to truncate
- * @lstart: offset from with to truncate
- *
- * Truncate the page cache at a set offset, removing the pages
- * that are beyond that offset (and zeroing out partial pages).
- * If any page is locked we wait for it to become unlocked.
- */
-void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
-{
-	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
-	int unlocked;
-
-	write_lock(&mapping->page_lock);
-	clean_list_pages(mapping, &mapping->io_pages, start);
-	clean_list_pages(mapping, &mapping->dirty_pages, start);
-	do {
-		unlocked = truncate_list_pages(mapping,
-				&mapping->io_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->dirty_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->clean_pages, start, &partial);
-		unlocked |= truncate_list_pages(mapping,
-				&mapping->locked_pages, start, &partial);
-	} while (unlocked);
-	/* Traversed all three lists without dropping the lock */
-	write_unlock(&mapping->page_lock);
-}
-
-static inline int invalidate_this_page2(struct address_space * mapping,
-					struct page * page,
-					struct list_head * curr,
-					struct list_head * head)
-{
-	int unlocked = 1;
-
-	/*
-	 * The page is locked and we hold the mapping lock as well
-	 * so both page_count(page) and page_buffers stays constant here.
-	 * AKPM: fixme: No global lock any more.  Is this still OK?
-	 */
-	if (page_count(page) == 1 + !!page_has_buffers(page)) {
-		/* Restart after this page */
-		list_del(head);
-		list_add_tail(head, curr);
-
-		page_cache_get(page);
-		write_unlock(&mapping->page_lock);
-		truncate_complete_page(page);
-	} else {
-		if (page_has_buffers(page)) {
-			/* Restart after this page */
-			list_del(head);
-			list_add_tail(head, curr);
-
-			page_cache_get(page);
-			write_unlock(&mapping->page_lock);
-			do_invalidatepage(page, 0);
-		} else
-			unlocked = 0;
-
-		clear_page_dirty(page);
-		ClearPageUptodate(page);
-	}
-	return unlocked;
-}
-
-static int invalidate_list_pages2(struct address_space * mapping,
-				  struct list_head * head)
-{
-	struct list_head *curr;
-	struct page * page;
-	int unlocked = 0;
-	struct pagevec release_pvec;
-
-	pagevec_init(&release_pvec);
-restart:
-	curr = head->prev;
-	while (curr != head) {
-		page = list_entry(curr, struct page, list);
-
-		if (!TestSetPageLocked(page)) {
-			int __unlocked;
-
-			if (PageWriteback(page)) {
-				write_unlock(&mapping->page_lock);
-				wait_on_page_writeback(page);
-				unlocked = 1;
-				write_lock(&mapping->page_lock);
-				unlock_page(page);
-				goto restart;
-			}
-
-			__unlocked = invalidate_this_page2(mapping,
-					page, curr, head);
-			unlock_page(page);
-			unlocked |= __unlocked;
-			if (!__unlocked) {
-				curr = curr->prev;
-				continue;
-			}
-		} else {
-			/* Restart on this page */
-			list_del(head);
-			list_add(head, curr);
-
-			page_cache_get(page);
-			write_unlock(&mapping->page_lock);
-			unlocked = 1;
-			wait_on_page_locked(page);
-		}
-		if (!pagevec_add(&release_pvec, page))
-			__pagevec_release(&release_pvec);
-		cond_resched();
-		write_lock(&mapping->page_lock);
-		goto restart;
-	}
-	if (pagevec_count(&release_pvec)) {
-		write_unlock(&mapping->page_lock);
-		pagevec_release(&release_pvec);
-		write_lock(&mapping->page_lock);
-		unlocked = 1;
-	}
-	return unlocked;
-}
-
-/**
- * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
- * free the pages because they're mapped.
- * @mapping: the address_space which pages we want to invalidate
- */
-void invalidate_inode_pages2(struct address_space *mapping)
-{
-	int unlocked;
-
-	write_lock(&mapping->page_lock);
-	do {
-		unlocked = invalidate_list_pages2(mapping,
-				&mapping->clean_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->dirty_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->io_pages);
-		unlocked |= invalidate_list_pages2(mapping,
-				&mapping->locked_pages);
-	} while (unlocked);
-	write_unlock(&mapping->page_lock);
-}
-
 /*
  * In-memory filesystems have to fail their
  * writepage function - and this has to be
@@ -823,6 +488,37 @@ struct page *find_or_create_page(struct address_space *mapping,
 	return page;
 }

+/**
+ * find_get_pages - gang pagecache lookup
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ * @pages:	Where the resulting pages are placed
+ *
+ * find_get_pages() will search for and return a group of up to
+ * @nr_pages pages in the mapping.  The pages are placed at @pages.
+ * find_get_pages() takes a reference against the returned pages.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes.  There may be holes in the indices due to not-present pages.
+ *
+ * find_get_pages() returns the number of pages which were found.
+ */
+unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
+			    unsigned int nr_pages, struct page **pages)
+{
+	unsigned int i;
+	unsigned int ret;
+
+	read_lock(&mapping->page_lock);
+	ret = radix_tree_gang_lookup(&mapping->page_tree,
+				(void **)pages, start, nr_pages);
+	for (i = 0; i < ret; i++)
+		page_cache_get(pages[i]);
+	read_unlock(&mapping->page_lock);
+	return ret;
+}
+
 /*
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
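
A hypothetical caller, to make the reference-counting contract concrete: each page returned by find_get_pages() arrives with an elevated count and must be released. The helper below is illustrative only and not part of the patch:

#include <linux/pagemap.h>

/* Illustrative only: how many of the first 16 slots of `mapping'
 * hold cached pages?  The lookup pins each page it returns, so we
 * must drop those references before returning. */
static unsigned int count_cached_pages(struct address_space *mapping)
{
        struct page *pages[16];
        unsigned int found, i;

        found = find_get_pages(mapping, 0, 16, pages);
        for (i = 0; i < found; i++)
                page_cache_release(pages[i]);	/* drop the lookup's reference */
        return found;
}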
@@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec)
 	}
 }

+/**
+ * pagevec_lookup - gang pagecache lookup
+ * @pvec:	Where the resulting pages are placed
+ * @mapping:	The address_space to search
+ * @start:	The starting page index
+ * @nr_pages:	The maximum number of pages
+ *
+ * pagevec_lookup() will search for and return a group of up to @nr_pages pages
+ * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
+ * reference against the pages in @pvec.
+ *
+ * The search returns a group of mapping-contiguous pages with ascending
+ * indexes.  There may be holes in the indices due to not-present pages.
+ *
+ * pagevec_lookup() returns the number of pages which were found.
+ */
+unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned int nr_pages)
+{
+	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 /*
  * Perform any setup for the swap system
  */
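
pagevec_lookup() is only a pagevec-shaped wrapper around find_get_pages(), but it is the form every loop in this patch uses. One detail worth showing is that the gang lookup takes only a start index; a caller that wants an end bound must enforce it itself. A hypothetical bounded walk (illustrative, not part of the patch):

#include <linux/pagemap.h>
#include <linux/pagevec.h>

/* Illustrative only: visit cached pages with start <= index < end. */
static void visit_range(struct address_space *mapping, pgoff_t start,
                        pgoff_t end)
{
        struct pagevec pvec;
        pgoff_t next = start;
        int i;

        pagevec_init(&pvec);
        while (next < end &&
               pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];

                        if (page->index >= end)
                                break;	/* the gang lookup overshot the range */
                        next = page->index + 1;
                        /* ... per-page work ... */
                }
                pagevec_release(&pvec);	/* also releases any overshoot pages */
        }
}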
new file: mm/truncate.c

/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002	akpm@zip.com.au
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/buffer_head.h>	/* grr. try_to_release_page,
				   block_invalidatepage */

static int do_invalidatepage(struct page *page, unsigned long offset)
{
	int (*invalidatepage)(struct page *, unsigned long);

	invalidatepage = page->mapping->a_ops->invalidatepage;
	if (invalidatepage == NULL)
		invalidatepage = block_invalidatepage;
	return (*invalidatepage)(page, offset);
}

static inline void truncate_partial_page(struct page *page, unsigned partial)
{
	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
	if (PagePrivate(page))
		do_invalidatepage(page, partial);
}

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes anonymous.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_nopage().
 */
static void truncate_complete_page(struct page *page)
{
	if (PagePrivate(page))
		do_invalidatepage(page, 0);
	clear_page_dirty(page);
	ClearPageUptodate(page);
	remove_from_page_cache(page);
	page_cache_release(page);
}

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages that are beyond
 * that offset (and zeroing out partial pages).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * Called under (and serialised by) inode->i_sem.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
	const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
	const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
	struct pagevec pvec;
	pgoff_t next;
	int i;

	pagevec_init(&pvec);
	next = start;
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			next = page->index + 1;
			if (TestSetPageLocked(page))
				continue;
			if (PageWriteback(page)) {
				unlock_page(page);
				continue;
			}
			truncate_complete_page(page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	if (partial) {
		struct page *page = find_lock_page(mapping, start - 1);
		if (page) {
			wait_on_page_writeback(page);
			truncate_partial_page(page, partial);
			unlock_page(page);
			page_cache_release(page);
		}
	}

	next = start;
	for ( ; ; ) {
		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
			if (next == start)
				break;
			next = start;
			continue;
		}
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			wait_on_page_writeback(page);
			next = page->index + 1;
			truncate_complete_page(page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
	}

	if (lstart == 0 && mapping->nrpages)
		printk("%s: I goofed!\n", __FUNCTION__);
}
/**
 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 * @mapping: the address_space which pages we want to invalidate
 *
 * This function only removes the unlocked pages; if you want to
 * remove all the pages of one inode, you must call truncate_inode_pages.
 *
 * invalidate_inode_pages() will not block on IO activity.  It will not
 * invalidate pages which are dirty, locked, under writeback or mapped into
 * pagetables.
 */
void invalidate_inode_pages(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;
	int i;

	pagevec_init(&pvec);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			if (TestSetPageLocked(page)) {
				next++;
				continue;
			}
			next = page->index + 1;
			if (PageDirty(page) || PageWriteback(page))
				goto unlock;
			if (PagePrivate(page) && !try_to_release_page(page, 0))
				goto unlock;
			if (page_mapped(page))
				goto unlock;
			truncate_complete_page(page);
unlock:
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
/**
 * invalidate_inode_pages2 - remove all unmapped pages from an address_space
 * @mapping - the address_space
 *
 * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case
 * where the page is seen to be mapped into process pagetables.  In that case,
 * the page is marked clean but is left attached to its address_space.
 *
 * FIXME: invalidate_inode_pages2() is probably trivially livelockable.
 */
void invalidate_inode_pages2(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;
	int i;

	pagevec_init(&pvec);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (page->mapping) {	/* truncate race? */
				wait_on_page_writeback(page);
				next = page->index + 1;
				if (page_mapped(page))
					clear_page_dirty(page);
				else
					truncate_complete_page(page);
			}
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
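
To make the start/partial arithmetic at the top of truncate_inode_pages() concrete, here is a small userspace model (assuming a 4096-byte PAGE_CACHE_SIZE; the truncation point is illustrative):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12			/* assumed: 4096-byte pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

int main(void)
{
        unsigned long lstart = 5000;		/* truncate file to 5000 bytes */
        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        unsigned int partial = lstart & (PAGE_CACHE_SIZE - 1);

        /* Prints "start=2 partial=904": pages with index >= 2 are removed
         * entirely; bytes 904..4095 of page 1 are zeroed by
         * truncate_partial_page(); bytes 0..903 of page 1 survive. */
        printf("start=%lu partial=%u\n", start, partial);
        return 0;
}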