Commit 53bf7bef authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Remove mapping->vm_writeback

The vm_writeback address_space operation was designed to provide the VM
with a "clustered writeout" capability.  It allowed the filesystem to
perform more intelligent writearound decisions when the VM was trying
to clean a particular page.

I can't say I ever saw any real benefit from this - not much writeout
actually happens on that path, and quite a lot of work has gone into
minimising it.

The default ->vm_writeback a_op which I provided wrote back the pages
in ->dirty_pages order.  But there is one scenario in which this causes
problems - writing a single 4G file with mem=4G.  We end up with all of
ZONE_NORMAL full of dirty pages, but all writeback effort is against
highmem pages.  (Because there is about 1.5G of dirty memory total).

Net effect: the machine stalls ZONE_NORMAL allocation attempts until
the ->dirty_pages writeback advances onto ZONE_NORMAL pages.

This can be fixed most sweetly with additional radix-tree
infrastructure which will be quite complex.  Later.


So this patch dumps it all, and goes back to using writepage
against individual pages as they come off the LRU.
parent 5fa9d488
......@@ -135,7 +135,6 @@ prototypes:
int (*readpage)(struct file *, struct page *);
int (*sync_page)(struct page *);
int (*writepages)(struct address_space *, int *nr_to_write);
int (*vm_writeback)(struct page *, int *nr_to_write);
int (*set_page_dirty)(struct page *page);
int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
......@@ -153,7 +152,6 @@ readpage: no yes, unlocks
readpages: no
sync_page: no maybe
writepages: no
vm_writeback: no yes
set_page_dirty no no
prepare_write: no yes
commit_write: no yes
......@@ -190,14 +188,6 @@ written. The address_space implementation may write more (or less) pages
than *nr_to_write asks for, but it should try to be reasonably close. If
nr_to_write is NULL, all dirty pages must be written.
->vm_writeback() is called from the VM. The address_space should
start I/O against at least *nr_to_write pages, including the passed page. As
each page is written its PG_launder flag must be set (inside the page lock).
The vm_writeback() function is provided so that filesystems can perform
clustered writeback around the page which the VM is trying to clean.
If a_ops.vm_writeback is NULL the VM will fall back to single-page writepage().
->set_page_dirty() is called from various places in the kernel
when the target page is marked as needing writeback. It may be called
under spinlock (it cannot block) and is sometimes called with the page
......
......@@ -758,7 +758,6 @@ struct address_space_operations def_blk_aops = {
.prepare_write = blkdev_prepare_write,
.commit_write = blkdev_commit_write,
.writepages = generic_writepages,
.vm_writeback = generic_vm_writeback,
.direct_IO = blkdev_direct_IO,
};
......
......@@ -655,7 +655,6 @@ struct address_space_operations ext2_aops = {
.bmap = ext2_bmap,
.direct_IO = ext2_direct_IO,
.writepages = ext2_writepages,
.vm_writeback = generic_vm_writeback,
};
/*
......
......@@ -614,12 +614,6 @@ mpage_writepages(struct address_space *mapping,
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret);
}
if ((current->flags & PF_MEMALLOC) &&
!PageActive(page) && PageLRU(page)) {
if (!pagevec_add(&pvec, page))
pagevec_deactivate_inactive(&pvec);
page = NULL;
}
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
......@@ -630,16 +624,13 @@ mpage_writepages(struct address_space *mapping,
} else {
unlock_page(page);
}
if (page)
page_cache_release(page);
page_cache_release(page);
write_lock(&mapping->page_lock);
}
/*
* Leave any remaining dirty pages on ->io_pages
*/
write_unlock(&mapping->page_lock);
pagevec_deactivate_inactive(&pvec);
if (bio)
mpage_bio_submit(WRITE, bio);
return ret;
......
......@@ -285,9 +285,6 @@ struct address_space_operations {
/* Write back some dirty pages from this mapping. */
int (*writepages)(struct address_space *, struct writeback_control *);
/* Perform a writeback as a memory-freeing operation. */
int (*vm_writeback)(struct page *, struct writeback_control *);
/* Set a page dirty */
int (*set_page_dirty)(struct page *page);
......@@ -1256,9 +1253,6 @@ extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
extern loff_t remote_llseek(struct file *file, loff_t offset, int origin);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int generic_vm_writeback(struct page *page,
struct writeback_control *wbc);
static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
read_descriptor_t * desc,
read_actor_t actor)
......
......@@ -21,7 +21,6 @@ void __pagevec_release_nonlru(struct pagevec *pvec);
void __pagevec_free(struct pagevec *pvec);
void __pagevec_lru_add(struct pagevec *pvec);
void __pagevec_lru_add_active(struct pagevec *pvec);
void pagevec_deactivate_inactive(struct pagevec *pvec);
void pagevec_strip(struct pagevec *pvec);
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages);
......
......@@ -390,71 +390,6 @@ static int __init page_writeback_init(void)
}
module_init(page_writeback_init);
/*
* A library function, which implements the vm_writeback a_op. It's fairly
* lame at this time. The idea is: the VM wants to liberate this page,
* so we pass the page to the address_space and give the fs the opportunity
* to write out lots of pages around this one. It allows extent-based
* filesystems to do intelligent things. It lets delayed-allocate filesystems
* perform better file layout. It lets the address_space opportunistically
* write back disk-contiguous pages which are in other zones.
*
* FIXME: the VM wants to start I/O against *this* page. Because its zone
* is under pressure. But this function may start writeout against a
* totally different set of pages. Unlikely to be a huge problem, but if it
* is, we could just writepage the page if it is still (PageDirty &&
* !PageWriteback) (See below).
*
* Another option is to just reposition page->mapping->dirty_pages so we
* *know* that the page will be written. That will work fine, but seems
* unpleasant. (If the page is not for-sure on ->dirty_pages we're dead).
* Plus it assumes that the address_space is performing writeback in
* ->dirty_pages order.
*
* So. The proper fix is to leave the page locked-and-dirty and to pass
* it all the way down.
*
* The passed page is unlocked before any writeback is started. The
* function always returns 0: the result of do_writepages() is discarded,
* and if igrab() returns NULL no writeback is attempted at all.
*/
int generic_vm_writeback(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
/*
* We don't own this inode, and we don't want the address_space
* vanishing while writeback is walking its pages.
*/
inode = igrab(inode);
/* Unlocked unconditionally, whether or not igrab() gave us a reference. */
unlock_page(page);
if (inode) {
/* Write back through the inode's mapping; the return value is ignored. */
do_writepages(inode->i_mapping, wbc);
/*
* This iput() will internally call ext2_discard_prealloc(),
* which is rather bogus. But there is no other way of
* dropping our ref to the inode. However, there's no harm
* in dropping the prealloc, because there probably isn't any.
* Just a waste of cycles.
*/
iput(inode);
/*
* Disabled fallback (the "See below" of the FIXME above): if the passed
* page is still dirty and not under writeback, it would retake the page
* lock and call ->writepage() on it, re-dirtying the page when
* ->writepage() returns -EAGAIN.
*/
#if 0
if (!PageWriteback(page) && PageDirty(page)) {
lock_page(page);
if (!PageWriteback(page)&&test_clear_page_dirty(page)) {
int ret;
ret = page->mapping->a_ops->writepage(page);
if (ret == -EAGAIN)
__set_page_dirty_nobuffers(page);
} else {
unlock_page(page);
}
}
#endif
}
return 0;
}
EXPORT_SYMBOL(generic_vm_writeback);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
if (mapping->a_ops->writepages)
......
......@@ -128,23 +128,8 @@ int swap_readpage(struct file *file, struct page *page)
out:
return ret;
}
/*
* swapper_space doesn't have a real inode, so it gets a special vm_writeback()
* so we don't need swap special cases in generic_vm_writeback().
*
* Swap pages are !PageLocked and PageWriteback while under writeout so that
* memory allocators will throttle against them.
*
* Unlocks the passed page, then hands the page's mapping to
* generic_writepages() and returns whatever that returns.
*/
static int swap_vm_writeback(struct page *page, struct writeback_control *wbc)
{
/* page->mapping is read before the page is unlocked below. */
struct address_space *mapping = page->mapping;
unlock_page(page);
return generic_writepages(mapping, wbc);
}
struct address_space_operations swap_aops = {
.vm_writeback = swap_vm_writeback,
.writepage = swap_writepage,
.readpage = swap_readpage,
.sync_page = block_sync_page,
......
......@@ -717,14 +717,6 @@ static int shmem_writepages(struct address_space *mapping, struct writeback_cont
return 0;
}
/*
* vm_writeback a_op for shmem: writes out just the passed page.
*
* Clears the page's dirty state, then calls shmem_writepage() on it. If
* that returns a negative value the page is marked dirty again. The
* writeback_control argument is not used, and the function always
* returns 0 regardless of the shmem_writepage() result.
*/
static int shmem_vm_writeback(struct page *page, struct writeback_control *wbc)
{
clear_page_dirty(page);
/* A negative return is treated as failure: restore the dirty state. */
if (shmem_writepage(page) < 0)
set_page_dirty(page);
return 0;
}
/*
* shmem_getpage - either get the page from swap or allocate a new one
*
......@@ -1811,7 +1803,6 @@ static void destroy_inodecache(void)
static struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.writepages = shmem_writepages,
.vm_writeback = shmem_vm_writeback,
.set_page_dirty = __set_page_dirty_nobuffers,
#ifdef CONFIG_TMPFS
.readpage = shmem_readpage,
......
......@@ -199,35 +199,6 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
pagevec_reinit(pvec);
}
/*
* Move all the inactive pages to the head of the inactive list and release
* them. Reinitialises the caller's pagevec.
*
* Returns immediately, without touching the pagevec, when it is empty.
*/
void pagevec_deactivate_inactive(struct pagevec *pvec)
{
int i;
/* Zone whose lru_lock is currently held; NULL while no lock is held. */
struct zone *zone = NULL;
if (pagevec_count(pvec) == 0)
return;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct zone *pagezone = page_zone(page);
/*
* Only switch locks when this page is in a different zone from the
* previous one, so a run of same-zone pages shares one lock hold.
*/
if (pagezone != zone) {
if (zone)
spin_unlock_irq(&zone->lru_lock);
zone = pagezone;
spin_lock_irq(&zone->lru_lock);
}
/* Checked under the zone's lru_lock; other pages are left where they are. */
if (!PageActive(page) && PageLRU(page))
list_move(&page->lru, &pagezone->inactive_list);
}
/* Drop the lock still held for the last zone visited. */
if (zone)
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(pvec);
}
/*
* Add the passed pages to the LRU, then drop the caller's refcount
* on them. Reinitialises the caller's pagevec.
......
......@@ -284,12 +284,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
pte_chain_unlock(page);
#endif /* CONFIG_SWAP */
/*
* FIXME: this is CPU-inefficient for shared mappings.
* try_to_unmap() will set the page dirty and ->vm_writeback
* will write it. So we're back to page-at-a-time writepage
* in LRU order.
*/
/*
* If the page is dirty, only perform writeback if that write
* will be non-blocking. To prevent this allocation from being
......@@ -308,13 +302,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
* See swapfile.c:page_queue_congested().
*/
if (PageDirty(page)) {
int (*writeback)(struct page *,
struct writeback_control *);
struct backing_dev_info *bdi;
const int cluster_size = SWAP_CLUSTER_MAX;
struct writeback_control wbc = {
.nr_to_write = cluster_size,
};
if (!is_page_cache_freeable(page))
goto keep_locked;
......@@ -326,13 +314,15 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
if (bdi != current->backing_dev_info &&
bdi_write_congested(bdi))
goto keep_locked;
writeback = mapping->a_ops->vm_writeback;
if (writeback == NULL)
writeback = generic_vm_writeback;
(*writeback)(page, &wbc);
*max_scan -= (cluster_size - wbc.nr_to_write);
goto keep;
if (test_clear_page_dirty(page)) {
write_lock(&mapping->page_lock);
list_move(&page->list, &mapping->locked_pages);
write_unlock(&mapping->page_lock);
if (mapping->a_ops->writepage(page) == -EAGAIN)
__set_page_dirty_nobuffers(page);
goto keep;
}
}
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment