Commit 3a025760 authored by Johannes Weiner, committed by Linus Torvalds

mm: page_alloc: spill to remote nodes before waking kswapd

On NUMA systems, a node may start thrashing cache or even swapping anonymous
pages while there are still free pages on remote nodes.

This is a result of commits 81c0a2bb ("mm: page_alloc: fair zone
allocator policy") and fff4068c ("mm: page_alloc: revert NUMA aspect
of fair allocation policy").

Before those changes, the allocator would first try all allowed zones,
including those on remote nodes, before waking any kswapds.  But now the
allocator fastpath doubles as the fairness pass, and that pass must
restrict itself to the local node so that exhausted fairness batches
alone do not cause spilling to remote nodes.  Remote nodes are only
considered in
the slowpath, after the kswapds are woken up.  But if remote nodes still
have free memory, kswapd should not be woken to rebalance the local node
or it may thrash cache or swap prematurely.

Fix this by adding one more unfair pass over the zonelist that is
allowed to spill to remote nodes after the local fairness pass fails but
before entering the slowpath and waking the kswapds.
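
In outline, the allocation order after this change is: a fair pass
restricted to local zones that still have allocation batch left, then an
unfair pass over the whole zonelist, and only then the slowpath that
wakes the kswapds.  The stand-alone sketch below models only that
ordering; the names (try_zones, struct fake_zone) and the numbers are
illustrative stand-ins, not kernel code.

	/*
	 * Stand-alone model of the pass ordering described above; only
	 * the control flow mirrors the patch, none of this is kernel code.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct fake_zone { int free_pages; int alloc_batch; bool local; };

	static struct fake_zone node[] = {
		{ .free_pages = 50,  .alloc_batch = 0,   .local = true  }, /* local batch exhausted */
		{ .free_pages = 400, .alloc_batch = 300, .local = false }, /* remote node still has memory */
	};

	/* Pass 1 honours fairness: local zones only, and only while batch > 0.
	 * Pass 2 is unfair: any zone with free pages, remote included. */
	static bool try_zones(bool fair)
	{
		for (unsigned int i = 0; i < sizeof(node) / sizeof(node[0]); i++) {
			if (fair && (!node[i].local || node[i].alloc_batch <= 0))
				continue;
			if (node[i].free_pages > 0) {
				printf("allocated from %s zone (fair=%d)\n",
				       node[i].local ? "local" : "remote", fair);
				return true;
			}
		}
		return false;
	}

	int main(void)
	{
		if (try_zones(true))	/* fair, local-only fast path */
			return 0;
		/* reset_alloc_batches() would run here to restart the round-robin */
		if (try_zones(false))	/* unfair spill, remote zones allowed */
			return 0;
		puts("both passes failed: enter the slowpath, wake the kswapds");
		return 1;
	}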

This also gets rid of the GFP_THISNODE exemption from the fairness
protocol because the unfair pass is no longer tied to kswapd, which
GFP_THISNODE is not allowed to wake up.

However, because remote spills can be more frequent now - we prefer them
over local kswapd reclaim - the allocation batches on remote nodes could
underflow more heavily.  When resetting the batches, use
atomic_long_read() directly instead of zone_page_state() to calculate the
delta as the latter filters negative counter values.
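
As a concrete illustration of that last point (the numbers are made up):
say a zone's target batch, high_wmark - low_wmark, is 100 pages and the
NR_ALLOC_BATCH counter has underflowed to -50.  A delta computed from a
zone_page_state()-style read, which clamps negative values to zero,
comes out as 100 and leaves the counter at only 50; the delta computed
from the raw atomic_long_read() value is 150 and restores the full
batch.  A minimal stand-alone sketch of that arithmetic (clamped_read()
only mimics the clamping behaviour, it is not the kernel helper):

	#include <stdio.h>

	/* mimics zone_page_state() filtering negative counter values */
	static long clamped_read(long counter)
	{
		return counter < 0 ? 0 : counter;
	}

	int main(void)
	{
		const long target = 100;  /* high_wmark - low_wmark, made up */
		long batch = -50;         /* counter underflowed by remote spills */

		long clamped_delta = target - clamped_read(batch); /* 100 */
		long raw_delta     = target - batch;               /* 150 */

		printf("clamped reset: %ld -> %ld (short of target)\n",
		       batch, batch + clamped_delta);
		printf("raw reset:     %ld -> %ld (full batch restored)\n",
		       batch, batch + raw_delta);
		return 0;
	}
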
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: <stable@kernel.org>		[3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent d715ae08
mm/internal.h
@@ -370,5 +370,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
 #define ALLOC_CMA		0x80 /* allow allocations from CMA areas */
+#define ALLOC_FAIR		0x100 /* fair zone allocation */
 
 #endif	/* __MM_INTERNAL_H */
mm/page_alloc.c
@@ -1239,15 +1239,6 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 	}
 	local_irq_restore(flags);
 }
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return (gfp_mask & GFP_THISNODE) == GFP_THISNODE;
-}
-#else
-static bool gfp_thisnode_allocation(gfp_t gfp_mask)
-{
-	return false;
-}
 #endif
 
 /*
@@ -1584,12 +1575,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		get_pageblock_migratetype(page));
 	}
 
-	/*
-	 * NOTE: GFP_THISNODE allocations do not partake in the kswapd
-	 * aging protocol, so they can't be fair.
-	 */
-	if (!gfp_thisnode_allocation(gfp_flags))
-		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
+	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1955,23 +1941,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		 * zone size to ensure fair page aging.  The zone a
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
-		 *
-		 * Try to stay in local zones in the fastpath.  If
-		 * that fails, the slowpath is entered, which will do
-		 * another pass starting with the local zones, but
-		 * ultimately fall back to remote zones that do not
-		 * partake in the fairness round-robin cycle of this
-		 * zonelist.
-		 *
-		 * NOTE: GFP_THISNODE allocations do not partake in
-		 * the kswapd aging protocol, so they can't be fair.
 		 */
-		if ((alloc_flags & ALLOC_WMARK_LOW) &&
-		    !gfp_thisnode_allocation(gfp_mask)) {
-			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
-				continue;
+		if (alloc_flags & ALLOC_FAIR) {
 			if (!zone_local(preferred_zone, zone))
 				continue;
+			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
+				continue;
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2409,32 +2384,40 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
 	return page;
 }
 
-static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
-			     struct zonelist *zonelist,
-			     enum zone_type high_zoneidx,
-			     struct zone *preferred_zone)
+static void reset_alloc_batches(struct zonelist *zonelist,
+				enum zone_type high_zoneidx,
+				struct zone *preferred_zone)
 {
 	struct zoneref *z;
 	struct zone *zone;
 
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
-		if (!(gfp_mask & __GFP_NO_KSWAPD))
-			wakeup_kswapd(zone, order, zone_idx(preferred_zone));
 		/*
 		 * Only reset the batches of zones that were actually
-		 * considered in the fast path, we don't want to
-		 * thrash fairness information for zones that are not
+		 * considered in the fairness pass, we don't want to
+		 * trash fairness information for zones that are not
 		 * actually part of this zonelist's round-robin cycle.
 		 */
 		if (!zone_local(preferred_zone, zone))
 			continue;
 		mod_zone_page_state(zone, NR_ALLOC_BATCH,
-				    high_wmark_pages(zone) -
-				    low_wmark_pages(zone) -
-				    zone_page_state(zone, NR_ALLOC_BATCH));
+			high_wmark_pages(zone) - low_wmark_pages(zone) -
+			atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 	}
 }
 
+static void wake_all_kswapds(unsigned int order,
+			     struct zonelist *zonelist,
+			     enum zone_type high_zoneidx,
+			     struct zone *preferred_zone)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+}
+
 static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
@@ -2523,12 +2506,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (gfp_thisnode_allocation(gfp_mask))
+	if (IS_ENABLED(CONFIG_NUMA) &&
+	    (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
-	prepare_slowpath(gfp_mask, order, zonelist,
-			 high_zoneidx, preferred_zone);
+	if (!(gfp_mask & __GFP_NO_KSWAPD))
+		wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -2712,7 +2696,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
 	struct mem_cgroup *memcg = NULL;
 
 	gfp_mask &= gfp_allowed_mask;
@@ -2753,11 +2737,28 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
 		alloc_flags |= ALLOC_CMA;
 #endif
+retry:
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
 			preferred_zone, migratetype);
 	if (unlikely(!page)) {
+		/*
+		 * The first pass makes sure allocations are spread
+		 * fairly within the local node.  However, the local
+		 * node might have free pages left after the fairness
+		 * batches are exhausted, and remote zones haven't
+		 * even been considered yet.  Try once more without
+		 * fairness, and include remote zones now, before
+		 * entering the slowpath and waking kswapd: prefer
+		 * spilling to a remote zone over swapping locally.
+		 */
+		if (alloc_flags & ALLOC_FAIR) {
+			reset_alloc_batches(zonelist, high_zoneidx,
+					    preferred_zone);
+			alloc_flags &= ~ALLOC_FAIR;
+			goto retry;
+		}
 		/*
 		 * Runtime PM, block IO and its error handling path
 		 * can deadlock because I/O on the device might not