Commit b6938a7b authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] readahead optimisations

Been looking at a workload which involves several processes which seek
around and read from a large file.  There are a few problems:
generic_file_lseek is bouncing i_sem around like mad, and readahead is
doing lots of pointless pagecache probing.

This patch addresses readahead.

Presumably the change will be larger on machines which have higher
bandwidth memory than my test box, of which there are many.

This patch teaches readahead to detect the situation where no IO is
actually being performed as a result of its actions.  Now, we don't
want to sacrifice IO efficiency to save a bit of CPU, so the code is
very cautious.  But eventually, after some tens of consecutive
readahead attempts were found to perform no I/O at all, readahead will
turn itself off.

readahead will be turned on again when either generic_file_read() or
filemap_nopage() get a pagecache miss.  The function
handle_ra_thrashing() has been renamed to handle_ra_miss() to reflect
its widened role.

A performance bug in page_cache_readround() was fixed - if
ra->next_size is zero, that function needs to leave it well alone,
because next_size==0 is a magic value meaning that the file has just
been opened and that readahead needs to get aggressive.  This change
makes a `make dep' run at the same speed as in the 2.4 kernel.  It used
to take 4x as long...

`make dep' is an interesting test because it uses mmap to read the files.
parent e64fa3db
...@@ -466,11 +466,11 @@ int write_one_page(struct page *page, int wait); ...@@ -466,11 +466,11 @@ int write_one_page(struct page *page, int wait);
/* readahead.c */ /* readahead.c */
#define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MAX_READAHEAD 128 /* kbytes */
#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */
void do_page_cache_readahead(struct file *file, int do_page_cache_readahead(struct file *file,
unsigned long offset, unsigned long nr_to_read); unsigned long offset, unsigned long nr_to_read);
void page_cache_readahead(struct file *file, unsigned long offset); void page_cache_readahead(struct file *file, unsigned long offset);
void page_cache_readaround(struct file *file, unsigned long offset); void page_cache_readaround(struct file *file, unsigned long offset);
void handle_ra_thrashing(struct file *file); void handle_ra_miss(struct file *file);
/* vma is the first one with address < vma->vm_end, /* vma is the first one with address < vma->vm_end,
* and even address < vma->vm_start. Have to extend vma. */ * and even address < vma->vm_start. Have to extend vma. */
......
...@@ -910,7 +910,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * ...@@ -910,7 +910,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t *
page = radix_tree_lookup(&mapping->page_tree, index); page = radix_tree_lookup(&mapping->page_tree, index);
if (!page) { if (!page) {
read_unlock(&mapping->page_lock); read_unlock(&mapping->page_lock);
handle_ra_thrashing(filp); handle_ra_miss(filp);
goto no_cached_page; goto no_cached_page;
} }
page_cache_get(page); page_cache_get(page);
...@@ -1289,6 +1289,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address ...@@ -1289,6 +1289,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
struct page *page; struct page *page;
unsigned long size, pgoff, endoff; unsigned long size, pgoff, endoff;
int did_readahead;
pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
...@@ -1302,31 +1303,45 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address ...@@ -1302,31 +1303,45 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
if ((pgoff >= size) && (area->vm_mm == current->mm)) if ((pgoff >= size) && (area->vm_mm == current->mm))
return NULL; return NULL;
/* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */ /*
* The "size" of the file, as far as mmap is concerned, isn't bigger
* than the mapping
*/
if (size > endoff) if (size > endoff)
size = endoff; size = endoff;
did_readahead = 0;
/* /*
* The readahead code wants to be told about each and every page * The readahead code wants to be told about each and every page
* so it can build and shrink its windows appropriately * so it can build and shrink its windows appropriately
*/ */
if (VM_SequentialReadHint(area)) if (VM_SequentialReadHint(area)) {
did_readahead = 1;
page_cache_readahead(area->vm_file, pgoff); page_cache_readahead(area->vm_file, pgoff);
}
/* /*
* If the offset is outside the mapping size we're off the end * If the offset is outside the mapping size we're off the end
* of a privately mapped file, so we need to map a zero page. * of a privately mapped file, so we need to map a zero page.
*/ */
if ((pgoff < size) && !VM_RandomReadHint(area)) if ((pgoff < size) && !VM_RandomReadHint(area)) {
did_readahead = 1;
page_cache_readaround(file, pgoff); page_cache_readaround(file, pgoff);
}
/* /*
* Do we have something in the page cache already? * Do we have something in the page cache already?
*/ */
retry_find: retry_find:
page = find_get_page(mapping, pgoff); page = find_get_page(mapping, pgoff);
if (!page) if (!page) {
if (did_readahead) {
handle_ra_miss(file);
did_readahead = 0;
}
goto no_cached_page; goto no_cached_page;
}
/* /*
* Ok, found a page in the page cache, now we need to check * Ok, found a page in the page cache, now we need to check
......
...@@ -61,6 +61,7 @@ read_pages(struct file *file, struct address_space *mapping, ...@@ -61,6 +61,7 @@ read_pages(struct file *file, struct address_space *mapping,
* Together, these form the "current window". * Together, these form the "current window".
* Together, start and size represent the `readahead window'. * Together, start and size represent the `readahead window'.
* next_size: The number of pages to read on the next readahead miss. * next_size: The number of pages to read on the next readahead miss.
* Has the magical value -1UL if readahead has been disabled.
* prev_page: The page which the readahead algorithm most-recently inspected. * prev_page: The page which the readahead algorithm most-recently inspected.
* prev_page is mainly an optimisation: if page_cache_readahead * prev_page is mainly an optimisation: if page_cache_readahead
* sees that it is again being called for a page which it just * sees that it is again being called for a page which it just
...@@ -68,6 +69,7 @@ read_pages(struct file *file, struct address_space *mapping, ...@@ -68,6 +69,7 @@ read_pages(struct file *file, struct address_space *mapping,
* changes. * changes.
* ahead_start, * ahead_start,
* ahead_size: Together, these form the "ahead window". * ahead_size: Together, these form the "ahead window".
* ra_pages: The externally controlled max readahead for this fd.
* *
* The readahead code manages two windows - the "current" and the "ahead" * The readahead code manages two windows - the "current" and the "ahead"
* windows. The intent is that while the application is walking the pages * windows. The intent is that while the application is walking the pages
...@@ -120,8 +122,10 @@ read_pages(struct file *file, struct address_space *mapping, ...@@ -120,8 +122,10 @@ read_pages(struct file *file, struct address_space *mapping,
* the pages first, then submits them all for I/O. This avoids the very bad * the pages first, then submits them all for I/O. This avoids the very bad
* behaviour which would occur if page allocations are causing VM writeback. * behaviour which would occur if page allocations are causing VM writeback.
* We really don't want to intermingle reads and writes like that. * We really don't want to intermingle reads and writes like that.
*
* Returns the number of pages which actually had IO started against them.
*/ */
void do_page_cache_readahead(struct file *file, int do_page_cache_readahead(struct file *file,
unsigned long offset, unsigned long nr_to_read) unsigned long offset, unsigned long nr_to_read)
{ {
struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
...@@ -130,10 +134,10 @@ void do_page_cache_readahead(struct file *file, ...@@ -130,10 +134,10 @@ void do_page_cache_readahead(struct file *file,
unsigned long end_index; /* The last page we want to read */ unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool); LIST_HEAD(page_pool);
int page_idx; int page_idx;
int nr_to_really_read = 0; int ret = 0;
if (inode->i_size == 0) if (inode->i_size == 0)
return; goto out;
end_index = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT); end_index = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT);
...@@ -158,7 +162,7 @@ void do_page_cache_readahead(struct file *file, ...@@ -158,7 +162,7 @@ void do_page_cache_readahead(struct file *file,
break; break;
page->index = page_offset; page->index = page_offset;
list_add(&page->list, &page_pool); list_add(&page->list, &page_pool);
nr_to_really_read++; ret++;
} }
read_unlock(&mapping->page_lock); read_unlock(&mapping->page_lock);
...@@ -167,10 +171,36 @@ void do_page_cache_readahead(struct file *file, ...@@ -167,10 +171,36 @@ void do_page_cache_readahead(struct file *file,
* uptodate then the caller will launch readpage again, and * uptodate then the caller will launch readpage again, and
* will then handle the error. * will then handle the error.
*/ */
read_pages(file, mapping, &page_pool, nr_to_really_read); if (ret) {
read_pages(file, mapping, &page_pool, ret);
blk_run_queues(); blk_run_queues();
}
BUG_ON(!list_empty(&page_pool)); BUG_ON(!list_empty(&page_pool));
return; out:
return ret;
}
/*
* Check how effective readahead is being. If the amount of started IO is
* less than expected then the file is partly or fully in pagecache and
* readahead isn't helping. Shrink the window.
*
* But don't shrink it too much - the application may read the same page
* occasionally.
*/
static inline void
check_ra_success(struct file_ra_state *ra, pgoff_t attempt,
pgoff_t actual, pgoff_t orig_next_size)
{
if (actual == 0) {
if (orig_next_size > 1) {
ra->next_size = orig_next_size - 1;
if (ra->ahead_size)
ra->ahead_size = ra->next_size;
} else {
ra->next_size = -1UL;
}
}
} }
/* /*
...@@ -180,25 +210,32 @@ void do_page_cache_readahead(struct file *file, ...@@ -180,25 +210,32 @@ void do_page_cache_readahead(struct file *file,
void page_cache_readahead(struct file *file, unsigned long offset) void page_cache_readahead(struct file *file, unsigned long offset)
{ {
struct file_ra_state *ra = &file->f_ra; struct file_ra_state *ra = &file->f_ra;
unsigned long max; unsigned max;
unsigned long min; unsigned min;
unsigned orig_next_size;
unsigned actual;
/* /*
* Here we detect the case where the application is performing * Here we detect the case where the application is performing
* sub-page sized reads. We avoid doing extra work and bogusly * sub-page sized reads. We avoid doing extra work and bogusly
* perturbing the readahead window expansion logic. * perturbing the readahead window expansion logic.
* If next_size is zero, this is the very first read for this * If next_size is zero, this is the very first read for this
* file handle. * file handle, or the window is maximally shrunk.
*/ */
if (offset == ra->prev_page) { if (offset == ra->prev_page) {
if (ra->next_size != 0) if (ra->next_size != 0)
goto out; goto out;
} }
if (ra->next_size == -1UL)
goto out; /* Maximally shrunk */
max = get_max_readahead(file); max = get_max_readahead(file);
if (max == 0) if (max == 0)
goto out; /* No readahead */ goto out; /* No readahead */
min = get_min_readahead(file); min = get_min_readahead(file);
orig_next_size = ra->next_size;
if (ra->next_size == 0 && offset == 0) { if (ra->next_size == 0 && offset == 0) {
/* /*
...@@ -224,8 +261,6 @@ void page_cache_readahead(struct file *file, unsigned long offset) ...@@ -224,8 +261,6 @@ void page_cache_readahead(struct file *file, unsigned long offset)
* window by 25%. * window by 25%.
*/ */
ra->next_size -= ra->next_size / 4; ra->next_size -= ra->next_size / 4;
if (ra->next_size < min)
ra->next_size = min;
} }
if (ra->next_size > max) if (ra->next_size > max)
...@@ -272,19 +307,21 @@ void page_cache_readahead(struct file *file, unsigned long offset) ...@@ -272,19 +307,21 @@ void page_cache_readahead(struct file *file, unsigned long offset)
ra->ahead_start = 0; /* Invalidate these */ ra->ahead_start = 0; /* Invalidate these */
ra->ahead_size = 0; ra->ahead_size = 0;
do_page_cache_readahead(file, offset, ra->size); actual = do_page_cache_readahead(file, offset, ra->size);
check_ra_success(ra, ra->size, actual, orig_next_size);
} else { } else {
/* /*
* This read request is within the current window. It * This read request is within the current window. It is time
* is time to submit I/O for the ahead window while * to submit I/O for the ahead window while the application is
* the application is crunching through the current * crunching through the current window.
* window.
*/ */
if (ra->ahead_start == 0) { if (ra->ahead_start == 0) {
ra->ahead_start = ra->start + ra->size; ra->ahead_start = ra->start + ra->size;
ra->ahead_size = ra->next_size; ra->ahead_size = ra->next_size;
do_page_cache_readahead(file, actual = do_page_cache_readahead(file,
ra->ahead_start, ra->ahead_size); ra->ahead_start, ra->ahead_size);
check_ra_success(ra, ra->ahead_size,
actual, orig_next_size);
} }
} }
out: out:
...@@ -298,38 +335,55 @@ void page_cache_readahead(struct file *file, unsigned long offset) ...@@ -298,38 +335,55 @@ void page_cache_readahead(struct file *file, unsigned long offset)
*/ */
void page_cache_readaround(struct file *file, unsigned long offset) void page_cache_readaround(struct file *file, unsigned long offset)
{ {
struct file_ra_state *ra = &file->f_ra;
if (ra->next_size != -1UL) {
const unsigned long min = get_min_readahead(file) * 2; const unsigned long min = get_min_readahead(file) * 2;
unsigned long target; unsigned long target;
unsigned long backward; unsigned long backward;
if (file->f_ra.next_size < min) /*
file->f_ra.next_size = min; * If next_size is zero then leave it alone, because that's a
* readahead startup state.
*/
if (ra->next_size && ra->next_size < min)
ra->next_size = min;
target = offset; target = offset;
backward = file->f_ra.next_size / 4; backward = ra->next_size / 4;
if (backward > target) if (backward > target)
target = 0; target = 0;
else else
target -= backward; target -= backward;
page_cache_readahead(file, target); page_cache_readahead(file, target);
}
} }
/* /*
* handle_ra_thrashing() is called when it is known that a page which should * handle_ra_miss() is called when it is known that a page which should have
* have been present (it's inside the readahead window) was in fact evicted by * been present in the pagecache (we just did some readahead there) was in fact
* the VM. * not found. This will happen if it was evicted by the VM (readahead
* thrashing) or if the readahead window is maximally shrunk.
*
* If the window has been maximally shrunk (next_size == 0) then bump it up
* again to resume readahead.
* *
* We shrink the readahead window by three pages. This is because we grow it * Otherwise we're thrashing, so shrink the readahead window by three pages.
* by two pages on a readahead hit. Theory being that the readahead window * This is because it is grown by two pages on a readahead hit. Theory being
* size will stabilise around the maximum level at which there isn't any * that the readahead window size will stabilise around the maximum level at
* thrashing. * which there is no thrashing.
*/ */
void handle_ra_thrashing(struct file *file) void handle_ra_miss(struct file *file)
{ {
struct file_ra_state *ra = &file->f_ra;
const unsigned long min = get_min_readahead(file); const unsigned long min = get_min_readahead(file);
file->f_ra.next_size -= 3; if (ra->next_size == -1UL) {
if (file->f_ra.next_size < min) ra->next_size = min;
file->f_ra.next_size = min; } else {
ra->next_size -= 3;
if (ra->next_size < min)
ra->next_size = min;
}
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment