Commit 8a9e7331 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Resource management for NFS...

From: Trond Myklebust <trond.myklebust@fys.uio.no>

The patch fixes some problems with NFS under heavy writeout.

NFS pages can be in a clean but unreclaimable state.  They are unreclaimable
because the server has not yet acked the write - we may need to "redirty"
them if the server crashes.

These are referred to as "unstable" pages.  We need to count them alongside
dirty and writeback pages when making flushing and throttling decisions.
Otherwise the machine can be flooded with these pages and the VM has
problems.
parent 95751430
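
To make the accounting change concrete, here is a minimal userspace sketch of how the throttling test changes once unstable pages are counted. This is not code from the patch: over_dirty_limit() and the trimmed-down struct page_state are illustrative stand-ins.

#include <stdio.h>

/* Trimmed-down stand-in for the kernel's per-CPU struct page_state. */
struct page_state {
        unsigned long nr_dirty;
        unsigned long nr_writeback;
        unsigned long nr_unstable;      /* added by this patch */
};

/* Hypothetical helper: the throttle test before and after the patch. */
static int over_dirty_limit(const struct page_state *ps,
                            unsigned long dirty_thresh)
{
        /* Before: unstable pages were invisible to the throttle.
         *   return ps->nr_dirty + ps->nr_writeback > dirty_thresh;
         * After: clean-but-unreclaimable NFS pages count too. */
        return ps->nr_dirty + ps->nr_unstable + ps->nr_writeback
                        > dirty_thresh;
}

int main(void)
{
        struct page_state ps = { .nr_dirty = 10, .nr_unstable = 100 };

        /* 110 pages outstanding against a threshold of 50: the old
         * test saw only 10 and let writers run unthrottled. */
        printf("over limit: %d\n", over_dirty_limit(&ps, 50));
        return 0;
}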
fs/nfs/write.c:

@@ -274,8 +274,14 @@ nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	err = nfs_flush_file(inode, NULL, 0, 0, 0);
 	if (err < 0)
 		goto out;
-	if (is_sync)
+	if (wbc->sync_mode == WB_SYNC_HOLD)
+		goto out;
+	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
+	} else
+		nfs_commit_file(inode, NULL, 0, 0, 0);
+	/* Avoid races. Tell upstream we've done all we were told to do */
+	wbc->nr_to_write = 0;
 out:
 	return err;
 }
@@ -363,6 +369,7 @@ nfs_mark_request_dirty(struct nfs_page *req)
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_dirty);
 	mark_inode_dirty(inode);
 }
@@ -390,6 +397,7 @@ nfs_mark_request_commit(struct nfs_page *req)
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_unstable);
 	mark_inode_dirty(inode);
 }
 #endif
@@ -457,6 +465,7 @@ nfs_scan_dirty(struct inode *inode, struct list_head *dst, struct file *file, un
 	int res;
 	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
 	nfsi->ndirty -= res;
+	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 	return res;
@@ -481,6 +490,7 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, struct file *file, u
 	int res;
 	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
 	nfsi->ncommit -= res;
+	sub_page_state(nr_unstable,res);
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
 	return res;
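
Note the symmetry in the hunks above: every page queued on the per-inode dirty or commit list is now counted twice, once in nfsi->ndirty/ncommit and once in the global page state, and the scan paths decrement both by the same amount. A toy userspace model of that invariant (hypothetical names; single-threaded, so the spinlock is elided):

#include <stdio.h>

static unsigned long nfsi_ndirty;       /* per-inode count */
static unsigned long global_nr_dirty;   /* stand-in for page_state */

static void mark_request_dirty(void)
{
        nfsi_ndirty++;
        global_nr_dirty++;              /* inc_page_state(nr_dirty) */
}

static void scan_dirty(unsigned long res)
{
        nfsi_ndirty -= res;
        global_nr_dirty -= res;         /* sub_page_state(nr_dirty, res) */
}

int main(void)
{
        mark_request_dirty();
        mark_request_dirty();
        scan_dirty(2);

        /* If these ever drift apart, the kernel's "desynchronized
         * value" printk would fire. */
        printf("ndirty=%lu nr_dirty=%lu\n", nfsi_ndirty, global_nr_dirty);
        return 0;
}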
include/linux/page-flags.h:

@@ -75,6 +75,7 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -82,6 +83,7 @@
 struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
+	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
 	unsigned long nr_mapped;	/* mapped into pagetables */
 	unsigned long nr_slab;		/* In slab */
@@ -130,6 +132,7 @@ extern void get_full_page_state(struct page_state *ret);
 #define inc_page_state(member)	mod_page_state(member, 1UL)
 #define dec_page_state(member)	mod_page_state(member, 0UL - 1)
+#define sub_page_state(member,delta)	mod_page_state(member, 0UL - (delta))

 /*
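
The new sub_page_state() macro deserves a second look: mod_page_state() can only add an unsigned long delta to the per-CPU counter (inc_page_state and dec_page_state are built the same way), so subtraction is expressed as adding 0UL - (delta), the two's-complement negation. A userspace demonstration (not kernel code) of the wraparound it relies on:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        unsigned long counter = 10;
        unsigned long delta = 3;

        counter += 0UL - delta;         /* wraps down by exactly delta */
        assert(counter == 7);
        printf("counter = %lu\n", counter);
        return 0;
}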
mm/page-writeback.c:

@@ -138,6 +138,7 @@ get_dirty_limits(struct page_state *ps, long *background, long *dirty)
 void balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
+	long nr_reclaimable;
 	long background_thresh;
 	long dirty_thresh;
 	unsigned long pages_written = 0;
@@ -145,8 +146,7 @@ void balance_dirty_pages(struct address_space *mapping)
 	struct backing_dev_info *bdi = mapping->backing_dev_info;

-	get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-	while (ps.nr_dirty + ps.nr_writeback > dirty_thresh) {
+	for (;;) {
 		struct writeback_control wbc = {
 			.bdi = bdi,
 			.sync_mode = WB_SYNC_NONE,
@@ -154,24 +154,37 @@ void balance_dirty_pages(struct address_space *mapping)
 			.nr_to_write = write_chunk,
 		};

+		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
+		nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+		if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+			break;
+
 		dirty_exceeded = 1;

-		if (ps.nr_dirty)
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 */
+		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-
-		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
-			break;
-		pages_written += write_chunk - wbc.nr_to_write;
-		if (pages_written >= write_chunk)
-			break;		/* We've done our duty */
+			get_dirty_limits(&ps, &background_thresh,
+					&dirty_thresh);
+			nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+			if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;		/* We've done our duty */
+		}
 		blk_congestion_wait(WRITE, HZ/10);
 	}

-	if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;

-	if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
+	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 		pdflush_operation(background_writeout, 0);
 }
@@ -231,7 +244,8 @@ static void background_writeout(unsigned long _min_pages)
 		long dirty_thresh;

 		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty < background_thresh && min_pages <= 0)
+		if (ps.nr_dirty + ps.nr_unstable < background_thresh
+				&& min_pages <= 0)
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
@@ -302,7 +316,7 @@ static void wb_kupdate(unsigned long arg)
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	nr_to_write = ps.nr_dirty;
+	nr_to_write = ps.nr_dirty + ps.nr_unstable;
 	while (nr_to_write > 0) {
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
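
Beyond the new counter, the control-flow change in balance_dirty_pages() matters: get_dirty_limits() moves inside the for (;;) loop, so the exit test always sees freshly sampled counters rather than values read once before the loop. A toy userspace model of that shape (sample() and the numbers are made up; the kernel does real writeback where the comment indicates):

#include <stdio.h>

struct counters { unsigned long nr_dirty, nr_unstable, nr_writeback; };

/* Stand-in for get_dirty_limits(): pretend background writeback
 * drains ten dirty pages between passes. */
static void sample(struct counters *c)
{
        if (c->nr_dirty >= 10)
                c->nr_dirty -= 10;
}

int main(void)
{
        struct counters c = { .nr_dirty = 40, .nr_unstable = 20 };
        unsigned long dirty_thresh = 30;
        int passes = 0;

        for (;;) {                      /* same shape as the reworked loop */
                sample(&c);
                if (c.nr_dirty + c.nr_unstable + c.nr_writeback
                                <= dirty_thresh)
                        break;          /* exit test sees fresh counters */
                passes++;               /* writeback_inodes() + wait here */
        }
        printf("throttled for %d passes\n", passes);
        return 0;
}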
mm/page_alloc.c:

@@ -936,11 +936,13 @@ void show_free_areas(void)
 		K(nr_free_pages()),
 		K(nr_free_highpages()));

-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u\n",
 		active,
 		inactive,
 		ps.nr_dirty,
 		ps.nr_writeback,
+		ps.nr_unstable,
 		nr_free_pages());

 	for_each_zone(zone) {
@@ -1439,6 +1441,7 @@ struct seq_operations fragmentation_op = {
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
+	"nr_unstable",
 	"nr_page_table_pages",
 	"nr_mapped",
 	"nr_slab",
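
Because "nr_unstable" is appended to vmstat_text, the counter becomes visible to userspace in /proc/vmstat on kernels carrying this patch. A small check program (assumes such a kernel):

#include <stdio.h>
#include <string.h>

int main(void)
{
        char line[128];
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f))
                if (strncmp(line, "nr_unstable", 11) == 0)
                        fputs(line, stdout);    /* e.g. "nr_unstable 42" */
        fclose(f);
        return 0;
}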