Commit fee2b68d authored by Andrew Morton, committed by Linus Torvalds

[PATCH] strengthen the `incremental min' logic in the page allocator

Strengthen the `incremental min' logic in the page allocator.

Currently it is allowing the allocation to succeed if the zone has
free_pages >= pages_high.

This was to avoid a lockup corner case in which all the zones were at
pages_high so reclaim wasn't doing anything, but the incremental min
refused to take pages from those zones anyway.

But we want the incremental min zone protection to work.  So:

- Only allow an allocation to dip below the incremental min if the caller
  cannot run direct reclaim.

- Change the page reclaim code so that on the direct reclaim path,
  the caller can free pages beyond ->pages_high.  So if the incremental
  min test fails, the caller will go and free some more memory.

  Eventually, the caller will have freed enough memory for the
  incremental min test to pass against one of the zones.
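
As a rough sketch, the first pass over the zonelist now reads as below
(simplified from the page_alloc.c hunks that follow; the initialization of
`min' and the later, more aggressive passes are omitted, and `wait' is
gfp_mask & __GFP_WAIT):

	for (i = 0; zones[i] != NULL; i++) {
		struct zone *z = zones[i];

		/* each zone this request may fall back to raises the bar */
		min += z->pages_low;

		/*
		 * Only callers which cannot run direct reclaim (!wait) get
		 * the pages_high escape hatch; everyone else must beat the
		 * incremental min or go off and reclaim more memory.
		 */
		if (z->free_pages > min ||
				(!wait && z->free_pages >= z->pages_high)) {
			page = buffered_rmqueue(z, order, cold);
			if (page)
				return page;
		}
	}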
parent 53bf7bef
mm/page_alloc.c
@@ -411,12 +411,25 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 }
 
 /*
- * This is the 'heart' of the zoned buddy allocator:
+ * This is the 'heart' of the zoned buddy allocator.
+ *
+ * Herein lies the mysterious "incremental min". That's the
+ *
+ *	min += z->pages_low;
+ *
+ * thing. The intent here is to provide additional protection to low zones for
+ * allocation requests which _could_ use higher zones. So a GFP_HIGHMEM
+ * request is not allowed to dip as deeply into the normal zone as a GFP_KERNEL
+ * request. This preserves additional space in those lower zones for requests
+ * which really do need memory from those zones. It means that on a decent
+ * sized machine, GFP_HIGHMEM and GFP_KERNEL requests basically leave the DMA
+ * zone untouched.
  */
 struct page *
 __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
+	const int wait = gfp_mask & __GFP_WAIT;
 	unsigned long min;
 	struct zone **zones, *classzone;
 	struct page *page;
@@ -424,7 +437,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	int i;
 	int cold;
 
-	if (gfp_mask & __GFP_WAIT)
+	if (wait)
		might_sleep();
 
 	cold = 0;
@@ -441,9 +454,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		/* the incremental min is allegedly to discourage fallback */
 		min += z->pages_low;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
+		if (z->free_pages > min ||
+				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
@@ -468,7 +481,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		if (gfp_mask & __GFP_HIGH)
 			local_min >>= 2;
 		min += local_min;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
+		if (z->free_pages > min ||
+				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
@@ -490,7 +504,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	}
 
 	/* Atomic allocations - we can't balance anything */
-	if (!(gfp_mask & __GFP_WAIT))
+	if (!wait)
 		goto nopage;
 
 	inc_page_state(allocstall);
@@ -505,7 +519,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		struct zone *z = zones[i];
 
 		min += z->pages_min;
-		if (z->free_pages > min || z->free_pages >= z->pages_high) {
+		if (z->free_pages > min ||
+				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				return page;
......
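
To make the new comment concrete with some purely hypothetical watermarks:
suppose pages_low is 50 pages for the DMA zone, 250 for the normal zone and
1000 for highmem. A GFP_KERNEL request walks (normal, DMA), so by the time it
considers the DMA zone its accumulated min is roughly 250 + 50 = 300; it only
dips into DMA if that zone has more than about 300 free pages. A GFP_HIGHMEM
request walks (highmem, normal, DMA) and would need more than about
1000 + 250 + 50 = 1300 free DMA pages, while a GFP_DMA request needs only
about 50. The more zones a request could have been satisfied from, the harder
it is for that request to drain the lowest ones.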
mm/vmscan.c
@@ -712,28 +712,28 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  * This is the direct reclaim path, for page-allocating processes. We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
+ * We reclaim from a zone even if that zone is over pages_high. Because:
+ * a) The caller may be trying to free *extra* pages to satisfy a higher-order
+ *    allocation or
+ * b) The zones may be over pages_high but they must go *over* pages_high to
+ *    satisfy the `incremental min' zone defense algorithm.
+ *
  * Returns the number of reclaimed pages.
  */
 static int
 shrink_caches(struct zone *classzone, int priority, int *total_scanned,
-		int gfp_mask, const int nr_pages, int order,
-		struct page_state *ps)
+		int gfp_mask, const int nr_pages, struct page_state *ps)
 {
 	struct zone *first_classzone;
 	struct zone *zone;
-	int nr_mapped = 0;
 	int ret = 0;
 
 	first_classzone = classzone->zone_pgdat->node_zones;
 	for (zone = classzone; zone >= first_classzone; zone--) {
+		int to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX);
+		int nr_mapped = 0;
 		int max_scan;
-		int to_reclaim;
-
-		to_reclaim = zone->pages_high - zone->free_pages;
-		if (order == 0 && to_reclaim < 0)
-			continue;	/* zone has enough memory */
-		to_reclaim = min(to_reclaim, SWAP_CLUSTER_MAX);
-		to_reclaim = max(to_reclaim, nr_pages);
 
 		/*
 		 * If we cannot reclaim `nr_pages' pages by scanning twice
@@ -744,8 +744,7 @@ shrink_caches(struct zone *classzone, int priority, int *total_scanned,
 		max_scan = to_reclaim * 2;
 		ret += shrink_zone(zone, max_scan, gfp_mask,
 				to_reclaim, &nr_mapped, ps, priority);
-		*total_scanned += max_scan;
-		*total_scanned += nr_mapped;
+		*total_scanned += max_scan + nr_mapped;
 		if (ret >= nr_pages)
 			break;
 	}
@@ -786,11 +785,11 @@ try_to_free_pages(struct zone *classzone,
 		get_page_state(&ps);
 		nr_reclaimed += shrink_caches(classzone, priority,
 					&total_scanned, gfp_mask,
-					nr_pages, order, &ps);
+					nr_pages, &ps);
 		if (nr_reclaimed >= nr_pages)
 			return 1;
 		if (total_scanned == 0)
-			return 1;	/* All zones had enough free memory */
+			printk("%s: I am buggy\n", __FUNCTION__);
 		if (!(gfp_mask & __GFP_FS))
 			break;		/* Let the caller handle it */
 		/*
......
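
Putting the two halves together, the blocking slow path now behaves roughly
as sketched below (assembled from the hunks above; kswapd wakeup, the exact
try_to_free_pages() argument list and the retry logic are glossed over):

	/* Atomic allocations - we can't balance anything */
	if (!wait)
		goto nopage;

	/* direct reclaim: may push zones beyond pages_high, per vmscan.c above */
	try_to_free_pages(classzone, ...);

	/* walk the zonelist once more, this time against pages_min */
	for (i = 0; zones[i] != NULL; i++) {
		struct zone *z = zones[i];

		min += z->pages_min;
		/* wait is true on this path, so only the incremental min matters */
		if (z->free_pages > min ||
				(!wait && z->free_pages >= z->pages_high)) {
			page = buffered_rmqueue(z, order, cold);
			if (page)
				return page;
		}
	}

If this walk still fails and the allocation is allowed to retry, the caller
goes back into reclaim; since shrink_caches() now takes
max(nr_pages, SWAP_CLUSTER_MAX) pages from every eligible zone even above
pages_high, free_pages eventually climbs past the accumulated min for one of
the zones and the test passes.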