Commit 1d7d3304 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] stop using the address_space dirty_pages list

Move everything over to walking the radix tree via the PAGECACHE_TAG_DIRTY
tag.  Remove address_space.dirty_pages.
parent 40c8348e
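
The pattern the rest of this diff repeats: dirtying a page now tags it in the mapping's radix tree under mapping->tree_lock, and writeback finds its work by walking that tag a pagevec at a time instead of splicing pages between per-mapping lists. A minimal sketch of both halves, using only calls added or touched by this patch (kernel context assumed, not a standalone program):

	/* Producer side: mark a page dirty (cf. __set_page_dirty_nobuffers below) */
	spin_lock_irq(&mapping->tree_lock);
	radix_tree_tag_set(&mapping->page_tree, page->index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irq(&mapping->tree_lock);

	/* Consumer side: writeback gathers dirty pages in pagevec-sized batches */
	struct pagevec pvec;
	pgoff_t index = 0;
	unsigned nr;

	pagevec_init(&pvec, 0);
	while ((nr = pagevec_lookup_tag(&pvec, mapping, &index,
					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
		/* lock each pvec.pages[i], clear its dirty bit, start I/O */
		pagevec_release(&pvec);
	}
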
......@@ -825,12 +825,6 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
* page on the dirty page list.
*
* There is also a small window where the page is dirty, and not on dirty_pages.
* Also a possibility that by the time the page is added to dirty_pages, it has
* been set clean. The page lists are somewhat approximate in this regard.
* It's better to have clean pages accidentally attached to dirty_pages than to
* leave dirty pages attached to clean_pages.
*
* We use private_lock to lock against try_to_free_buffers while using the
* page's buffer list. Also use this to protect against clean buffers being
* added to the page after it was set dirty.
......@@ -871,8 +865,6 @@ int __set_page_dirty_buffers(struct page *page)
if (page->mapping) { /* Race with truncate? */
if (!mapping->backing_dev_info->memory_backed)
inc_page_state(nr_dirty);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
radix_tree_tag_set(&mapping->page_tree, page->index,
PAGECACHE_TAG_DIRTY);
}
......@@ -1228,7 +1220,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
* The relationship between dirty buffers and dirty pages:
*
* Whenever a page has any dirty buffers, the page's dirty bit is set, and
* the page appears on its address_space.dirty_pages list.
* the page is tagged dirty in its radix tree.
*
* At all times, the dirtiness of the buffers represents the dirtiness of
* subsections of the page. If the page has buffers, the page dirty bit is
......@@ -1250,10 +1242,10 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
/**
* mark_buffer_dirty - mark a buffer_head as needing writeout
*
* mark_buffer_dirty() will set the dirty bit against the buffer,
* then set its backing page dirty, then attach the page to its
* address_space's dirty_pages list and then attach the address_space's
* inode to its superblock's dirty inode list.
* mark_buffer_dirty() will set the dirty bit against the buffer, then set its
* backing page dirty, then tag the page as dirty in its address_space's radix
* tree and then attach the address_space's inode to its superblock's dirty
* inode list.
*
* mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
* mapping->tree_lock and the global inode_lock.
......
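
The updated mark_buffer_dirty() comment above describes a three-step cascade. A hedged sketch of that chain (this is not the verbatim function body; test_set_buffer_dirty() and __mark_inode_dirty() are the usual helpers assumed here):

	if (!test_set_buffer_dirty(bh)) {	/* 1: set the buffer's dirty bit */
		/*
		 * 2: set PG_dirty and tag the page dirty in the mapping's
		 *    radix tree, then
		 * 3: __mark_inode_dirty() attaches the inode to its
		 *    superblock's dirty inode list.
		 */
		__set_page_dirty_nobuffers(bh->b_page);
	}
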
......@@ -129,12 +129,6 @@ static void write_inode(struct inode *inode, int sync)
* starvation of particular inodes when others are being redirtied, prevent
* livelocks, etc.
*
* So what we do is to move all pages which are to be written from dirty_pages
* onto io_pages. And keep on writing io_pages until it's empty. Refusing to
* move more pages onto io_pages until io_pages is empty. Once that point has
* been reached, we are ready to take another pass across the inode's dirty
* pages.
*
* Called under inode_lock.
*/
static int
......@@ -159,10 +153,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
* read speculatively by this cpu before &= ~I_DIRTY -- mikulas
*/
spin_lock_irq(&mapping->tree_lock);
if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
spin_unlock_irq(&mapping->tree_lock);
spin_unlock(&inode_lock);
ret = do_writepages(mapping, wbc);
......@@ -180,10 +170,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode_lock);
inode->i_state &= ~I_LOCK;
if (!(inode->i_state & I_FREEING)) {
if (!list_empty(&mapping->io_pages)) {
/* Needs more writeback */
inode->i_state |= I_DIRTY_PAGES;
} else if (!list_empty(&mapping->dirty_pages)) {
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
/* Redirtied */
inode->i_state |= I_DIRTY_PAGES;
inode->dirtied_when = jiffies;
......
......@@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode)
memset(inode, 0, sizeof(*inode));
INIT_HLIST_NODE(&inode->i_hash);
INIT_LIST_HEAD(&inode->i_data.clean_pages);
INIT_LIST_HEAD(&inode->i_data.dirty_pages);
INIT_LIST_HEAD(&inode->i_data.locked_pages);
INIT_LIST_HEAD(&inode->i_data.io_pages);
INIT_LIST_HEAD(&inode->i_dentry);
......
......@@ -592,28 +592,12 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block,
* (The next two paragraphs refer to code which isn't here yet, but they
* explain the presence of address_space.io_pages)
*
* Pages can be moved from clean_pages or locked_pages onto dirty_pages
* at any time - it's not possible to lock against that. So pages which
* have already been added to a BIO may magically reappear on the dirty_pages
* list. And mpage_writepages() will again try to lock those pages.
* But I/O has not yet been started against the page. Thus deadlock.
*
* To avoid this, mpage_writepages() will only write pages from io_pages. The
* caller must place them there. We walk io_pages, locking the pages and
* submitting them for I/O, moving them to locked_pages.
*
* This has the added benefit of preventing a livelock which would otherwise
* occur if pages are being dirtied faster than we can write them out.
*
* If a page is already under I/O, generic_writepages() skips it, even
* if it's dirty. This is desirable behaviour for memory-cleaning writeback,
* but it is INCORRECT for data-integrity system calls such as fsync(). fsync()
* and msync() need to guarantee that all the data which was dirty at the time
* the call was made get new I/O started against them. So if called_for_sync()
* is true, we must wait for existing IO to complete.
*
* It's fairly rare for PageWriteback pages to be on ->dirty_pages. It
* means that someone redirtied the page while it was under I/O.
*/
int
mpage_writepages(struct address_space *mapping,
......@@ -625,6 +609,9 @@ mpage_writepages(struct address_space *mapping,
int ret = 0;
int done = 0;
int (*writepage)(struct page *page, struct writeback_control *wbc);
struct pagevec pvec;
int nr_pages;
pgoff_t index;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
......@@ -635,72 +622,58 @@ mpage_writepages(struct address_space *mapping,
if (get_block == NULL)
writepage = mapping->a_ops->writepage;
spin_lock_irq(&mapping->tree_lock);
while (!list_empty(&mapping->io_pages) && !done) {
struct page *page = list_entry(mapping->io_pages.prev,
struct page, list);
list_del(&page->list);
if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) {
if (PageDirty(page)) {
list_add(&page->list, &mapping->dirty_pages);
continue;
}
list_add(&page->list, &mapping->locked_pages);
continue;
}
if (!PageDirty(page)) {
list_add(&page->list, &mapping->clean_pages);
continue;
}
list_add(&page->list, &mapping->locked_pages);
page_cache_get(page);
spin_unlock_irq(&mapping->tree_lock);
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file mapping.
*/
lock_page(page);
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
if (page->mapping == mapping && !PageWriteback(page) &&
test_clear_page_dirty(page)) {
if (writepage) {
ret = (*writepage)(page, wbc);
if (ret) {
if (ret == -ENOSPC)
set_bit(AS_ENOSPC,
&mapping->flags);
else
set_bit(AS_EIO,
&mapping->flags);
pagevec_init(&pvec, 0);
index = 0;
while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
unsigned i;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
/*
* At this point we hold neither mapping->tree_lock nor
* lock on the page itself: the page may be truncated or
* invalidated (changing page->mapping to NULL), or even
* swizzled back from swapper_space to tmpfs file
* mapping
*/
lock_page(page);
if (wbc->sync_mode != WB_SYNC_NONE)
wait_on_page_writeback(page);
if (page->mapping == mapping && !PageWriteback(page) &&
test_clear_page_dirty(page)) {
if (writepage) {
ret = (*writepage)(page, wbc);
if (ret) {
if (ret == -ENOSPC)
set_bit(AS_ENOSPC,
&mapping->flags);
else
set_bit(AS_EIO,
&mapping->flags);
}
} else {
bio = mpage_writepage(bio, page,
get_block, &last_block_in_bio,
&ret, wbc);
}
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking &&
bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
}
} else {
bio = mpage_writepage(bio, page, get_block,
&last_block_in_bio, &ret, wbc);
unlock_page(page);
}
if (ret || (--(wbc->nr_to_write) <= 0))
done = 1;
if (wbc->nonblocking && bdi_write_congested(bdi)) {
wbc->encountered_congestion = 1;
done = 1;
}
} else {
unlock_page(page);
}
page_cache_release(page);
spin_lock_irq(&mapping->tree_lock);
pagevec_release(&pvec);
}
/*
* Leave any remaining dirty pages on ->io_pages
*/
spin_unlock_irq(&mapping->tree_lock);
if (bio)
mpage_bio_submit(WRITE, bio);
return ret;
......
......@@ -600,7 +600,8 @@ static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
#define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages)
#define VN_DIRTY(vp) (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
#define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
PAGECACHE_TAG_DIRTY)
#define VMODIFY(vp) VN_FLAGSET(vp, VMODIFIED)
#define VUNMODIFY(vp) VN_FLAGCLR(vp, VMODIFIED)
......
......@@ -324,7 +324,6 @@ struct address_space {
struct radix_tree_root page_tree; /* radix tree of all pages */
spinlock_t tree_lock; /* and spinlock protecting it */
struct list_head clean_pages; /* list of clean pages */
struct list_head dirty_pages; /* list of dirty pages */
struct list_head locked_pages; /* list of locked pages */
struct list_head io_pages; /* being prepared for I/O */
unsigned long nrpages; /* number of total pages */
......@@ -371,6 +370,8 @@ struct block_device {
#define PAGECACHE_TAG_DIRTY 0
#define PAGECACHE_TAG_WRITEBACK 1
int mapping_tagged(struct address_space *mapping, int tag);
/*
* Use sequence counter to get consistent i_size on 32-bit processors.
*/
......
......@@ -69,9 +69,10 @@ extern struct page * find_trylock_page(struct address_space *mapping,
unsigned long index);
extern struct page * find_or_create_page(struct address_space *mapping,
unsigned long index, unsigned int gfp_mask);
extern unsigned int find_get_pages(struct address_space *mapping,
pgoff_t start, unsigned int nr_pages,
struct page **pages);
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages);
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages);
/*
* Returns locked page at given index in given cache, creating it if needed.
......
......@@ -22,8 +22,11 @@ void __pagevec_free(struct pagevec *pvec);
void __pagevec_lru_add(struct pagevec *pvec);
void __pagevec_lru_add_active(struct pagevec *pvec);
void pagevec_strip(struct pagevec *pvec);
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages);
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages);
unsigned pagevec_lookup_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, int tag,
unsigned nr_pages);
static inline void pagevec_init(struct pagevec *pvec, int cold)
{
......
......@@ -100,9 +100,7 @@ void __remove_from_page_cache(struct page *page)
struct address_space *mapping = page->mapping;
radix_tree_delete(&mapping->page_tree, page->index);
list_del(&page->list);
page->mapping = NULL;
mapping->nrpages--;
pagecache_acct(-1);
}
......@@ -148,9 +146,6 @@ static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
if (mapping->backing_dev_info->memory_backed)
return 0;
spin_lock_irq(&mapping->tree_lock);
list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
spin_unlock_irq(&mapping->tree_lock);
ret = do_writepages(mapping, &wbc);
return ret;
}
......@@ -190,11 +185,7 @@ int filemap_fdatawait(struct address_space * mapping)
struct page *page;
page = list_entry(mapping->locked_pages.next,struct page,list);
list_del(&page->list);
if (PageDirty(page))
list_add(&page->list, &mapping->dirty_pages);
else
list_add(&page->list, &mapping->clean_pages);
list_del_init(&page->list);
if (!PageWriteback(page)) {
if (++progress > 32) {
......@@ -228,7 +219,6 @@ int filemap_fdatawait(struct address_space * mapping)
return ret;
}
EXPORT_SYMBOL(filemap_fdatawait);
int filemap_write_and_wait(struct address_space *mapping)
......@@ -539,7 +529,7 @@ EXPORT_SYMBOL(find_or_create_page);
*
* find_get_pages() returns the number of pages which were found.
*/
unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
unsigned int nr_pages, struct page **pages)
{
unsigned int i;
......@@ -554,6 +544,27 @@ unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
return ret;
}
/*
* Like find_get_pages, except we only return pages which are tagged with
* `tag'. We update *index to index the next page for the traversal.
*/
unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
int tag, unsigned int nr_pages, struct page **pages)
{
unsigned int i;
unsigned int ret;
spin_lock_irq(&mapping->tree_lock);
ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
(void **)pages, *index, nr_pages, tag);
for (i = 0; i < ret; i++)
page_cache_get(pages[i]);
if (ret)
*index = pages[ret - 1]->index + 1;
spin_unlock_irq(&mapping->tree_lock);
return ret;
}
/*
* Same as grab_cache_page, but do not wait if the page is unavailable.
* This is intended for speculative data generators, where the data can
......
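
A usage sketch for find_get_pages_tag() as added above: each call takes a reference on every page it returns and advances *index past the last one, so a caller can drive a simple walk (illustrative fragment, kernel context assumed; the batch size of 16 is arbitrary):

	struct page *pages[16];
	pgoff_t next = 0;
	unsigned nr, i;

	while ((nr = find_get_pages_tag(mapping, &next, PAGECACHE_TAG_DIRTY,
					16, pages))) {
		for (i = 0; i < nr; i++) {
			/* ... examine or write out pages[i] ... */
			page_cache_release(pages[i]);	/* drop the lookup's reference */
		}
	}
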
......@@ -472,12 +472,8 @@ int write_one_page(struct page *page, int wait)
if (wait)
wait_on_page_writeback(page);
spin_lock_irq(&mapping->tree_lock);
list_del(&page->list);
if (test_clear_page_dirty(page)) {
list_add(&page->list, &mapping->locked_pages);
page_cache_get(page);
spin_unlock_irq(&mapping->tree_lock);
ret = mapping->a_ops->writepage(page, &wbc);
if (ret == 0 && wait) {
wait_on_page_writeback(page);
......@@ -486,8 +482,6 @@ int write_one_page(struct page *page, int wait)
}
page_cache_release(page);
} else {
list_add(&page->list, &mapping->clean_pages);
spin_unlock_irq(&mapping->tree_lock);
unlock_page(page);
}
return ret;
......@@ -495,9 +489,8 @@ int write_one_page(struct page *page, int wait)
EXPORT_SYMBOL(write_one_page);
/*
* For address_spaces which do not use buffers. Just set the page's dirty bit
* and move it to the dirty_pages list. Also perform space reservation if
* required.
* For address_spaces which do not use buffers. Just tag the page as dirty in
* its radix tree.
*
* __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page
* is still safe, as long as it actually manages to find some blocks at
......@@ -520,8 +513,6 @@ int __set_page_dirty_nobuffers(struct page *page)
BUG_ON(page->mapping != mapping);
if (!mapping->backing_dev_info->memory_backed)
inc_page_state(nr_dirty);
list_del(&page->list);
list_add(&page->list, &mapping->dirty_pages);
radix_tree_tag_set(&mapping->page_tree,
page->index, PAGECACHE_TAG_DIRTY);
}
......@@ -646,3 +637,19 @@ int test_set_page_writeback(struct page *page)
}
EXPORT_SYMBOL(test_set_page_writeback);
/*
* Return true if any of the pages in the mapping are marked with the
* passed tag.
*/
int mapping_tagged(struct address_space *mapping, int tag)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&mapping->tree_lock, flags);
ret = radix_tree_tagged(&mapping->page_tree, tag);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
return ret;
}
EXPORT_SYMBOL(mapping_tagged);
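
mapping_tagged() gives callers a cheap "does this mapping have any pages carrying this tag?" test in place of the old list_empty() checks on dirty_pages/io_pages; its two users in this patch are the redirty check in __sync_single_inode() and XFS's VN_DIRTY(). As used in the writeback path above:

	if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
		/* pages were dirtied while we were writing them out, so keep
		 * the inode on the dirty list for another pass */
		inode->i_state |= I_DIRTY_PAGES;
		inode->dirtied_when = jiffies;
	}
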
......@@ -682,6 +682,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
return NULL;
got_pg:
kernel_map_pages(page, 1 << order, 1);
INIT_LIST_HEAD(&page->list);
INIT_LIST_HEAD(&page->lru);
return page;
}
......
......@@ -353,13 +353,21 @@ void pagevec_strip(struct pagevec *pvec)
*
* pagevec_lookup() returns the number of pages which were found.
*/
unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned int nr_pages)
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
pgoff_t start, unsigned nr_pages)
{
pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
return pagevec_count(pvec);
}
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
pgoff_t *index, int tag, unsigned nr_pages)
{
pvec->nr = find_get_pages_tag(mapping, index, tag,
nr_pages, pvec->pages);
return pagevec_count(pvec);
}
#ifdef CONFIG_SMP
/*
......
......@@ -27,7 +27,6 @@ struct address_space swapper_space = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC),
.tree_lock = SPIN_LOCK_UNLOCKED,
.clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages),
.dirty_pages = LIST_HEAD_INIT(swapper_space.dirty_pages),
.io_pages = LIST_HEAD_INIT(swapper_space.io_pages),
.locked_pages = LIST_HEAD_INIT(swapper_space.locked_pages),
.a_ops = &swap_aops,
......@@ -210,7 +209,6 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
if (!err) {
if (!swap_duplicate(entry))
BUG();
/* shift page from clean_pages to dirty_pages list */
BUG_ON(PageDirty(page));
set_page_dirty(page);
INC_CACHE_INFO(add_total);
......@@ -245,7 +243,6 @@ int move_from_swap_cache(struct page *page, unsigned long index,
if (!err) {
swap_free(entry);
/* shift page from clean_pages to dirty_pages list */
__clear_page_dirty(page);
set_page_dirty(page);
}
......