Commit 38e419f5 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] hot-n-cold pages: bulk page allocator

This is the hot-n-cold-pages series.  It introduces a per-cpu lockless
LIFO pool in front of the page allocator.  For three reasons:

1: To reduce lock contention on the buddy lock: we allocate and free
   pages in, typically, 16-page chunks.

2: To return cache-warm pages to page allocation requests.

3: As infrastructure for a page reservation API which can be used to
   ensure that the GFP_ATOMIC radix-tree node and pte_chain allocations
   cannot fail.  That code is not complete, and does not absolutely
   require hot-n-cold pages.  It'll work OK though.

We add two queues per CPU.  The "hot" queue contains pages which the
freeing code thought were likely to be cache-hot.  By default, new
allocations are satisfied from this queue.

The "cold" queue contains pages which the freeing code expected to be
cache-cold.  The cold queue is mainly for lock amortisation, although
it is possible to explicitly allocate cold pages.  The readahead code
does that.
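
Roughly, the per-CPU structure which the series builds towards looks
something like this (a sketch only: the exact fields and names land in
later patches of the series, so treat it as illustrative):

	/*
	 * Sketch: one hot and one cold LIFO queue per CPU, per zone.
	 * Pages move between these lists and the buddy allocator in
	 * `batch'-sized chunks, so zone->lock is taken once per batch
	 * rather than once per page.
	 */
	struct per_cpu_pages {
		int count;		/* pages currently on the list */
		int low;		/* refill from buddy below this */
		int high;		/* flush back to buddy above this */
		int batch;		/* chunk size for refill/flush */
		struct list_head list;	/* the pages themselves (LIFO) */
	};

	struct per_cpu_pageset {
		struct per_cpu_pages pcp[2];	/* 0: hot, 1: cold */
	} ____cacheline_aligned_in_smp;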

I have been hot and cold on these patches for quite some time - the
benefit is not great.

- 4% speedup in Randy Hron's benching of the autoconf regression
  tests on a 4-way.  Most of this came from savings in pte_alloc and
  pmd_alloc: the pagetable clearing code liked the warmer pages (some
  architectures still have the pgt_cache, and can perhaps do away with
  it).

- 1% to 2% speedup in kernel compiles on my 4-way and Martin's 32-way.

- 60% speedup in a little test program which writes 80 kbytes to a
  file and ftruncates it to zero again.  Ran four instances of that on
  a 4-way and it loved the cache warmth.

- 2.5% speedup in Specweb testing on an 8-way.

- The thing which won me over: an 11% increase in throughput of the
  SDET benchmark on an 8-way PIII:

	with hot & cold:

	RESULT for 8 users is 17971    +12.1%
	RESULT for 16 users is 17026   +12.0%
	RESULT for 32 users is 17009   +10.4%
	RESULT for 64 users is 16911   +10.3%

	without:

	RESULT for 8 users is 16038
	RESULT for 16 users is 15200
	RESULT for 32 users is 15406
	RESULT for 64 users is 15331

  SDET is a very old SPEC test which simulates a development
  environment with a large number of users.  Lots of users running a
  mix of shell commands, basically.


These patches were written by Martin Bligh and myself.

This one implements rmqueue_bulk() - a function for removing multiple
pages of a given order from the buddy lists.

This is for lock amortisation: take the highly contended zone->lock
less frequently, and do more work once it has been acquired.
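
As a sketch of the intended caller (not part of this patch: the real
allocation path lands later in the series, and the function and field
names below are illustrative; a real caller must also keep interrupts
or preemption disabled while touching the per-CPU list):

	/*
	 * Hypothetical order-0 fast path on top of rmqueue_bulk(): when
	 * the per-CPU hot list runs low, refill it with a whole batch of
	 * pages under one acquisition of zone->lock, then hand pages out
	 * from the list with no locking at all.
	 */
	static struct page *alloc_from_hot_list(struct zone *zone,
						struct per_cpu_pages *pcp)
	{
		struct page *page = NULL;

		if (pcp->count <= pcp->low)
			pcp->count += rmqueue_bulk(zone, 0,
						pcp->batch, &pcp->list);
		if (pcp->count) {
			page = list_entry(pcp->list.next, struct page, list);
			list_del(&page->list);
			pcp->count--;
		}
		return page;
	}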
@@ -210,44 +210,93 @@ static inline void prep_new_page(struct page *page)
 	set_page_count(page, 1);
 }
 
-static struct page *rmqueue(struct zone *zone, unsigned int order)
+/*
+ * Do the hard work of removing an element from the buddy allocator.
+ * Call me with the zone->lock already held.
+ */
+static struct page *__rmqueue(struct zone *zone, unsigned int order)
 {
-	struct free_area *area = zone->free_area + order;
-	unsigned int curr_order = order;
+	struct free_area * area;
+	unsigned int current_order = order;
 	struct list_head *head, *curr;
-	unsigned long flags;
 	struct page *page;
+	unsigned int index;
 
-	spin_lock_irqsave(&zone->lock, flags);
-	do {
+	for (current_order=order; current_order < MAX_ORDER; ++current_order) {
+		area = zone->free_area + current_order;
 		head = &area->free_list;
 		curr = head->next;
 
-		if (curr != head) {
-			unsigned int index;
+		if (list_empty(&area->free_list))
+			continue;
 
-			page = list_entry(curr, struct page, list);
-			BUG_ON(bad_range(zone, page));
-			list_del(curr);
-			index = page - zone->zone_mem_map;
-			if (curr_order != MAX_ORDER-1)
-				MARK_USED(index, curr_order, area);
-			zone->free_pages -= 1UL << order;
+		page = list_entry(curr, struct page, list);
+		BUG_ON(bad_range(zone, page));
+		list_del(curr);
+		index = page - zone->zone_mem_map;
+		if (current_order != MAX_ORDER-1)
+			MARK_USED(index, current_order, area);
+		zone->free_pages -= 1UL << order;
+		page = expand(zone, page, index, order, current_order, area);
+		return page;
+	}
 
-			page = expand(zone, page, index, order, curr_order, area);
-			spin_unlock_irqrestore(&zone->lock, flags);
+	return NULL;
+}
 
-			if (bad_range(zone, page))
-				BUG();
-			prep_new_page(page);
-			return page;
-		}
-		curr_order++;
-		area++;
-	} while (curr_order < MAX_ORDER);
-	spin_unlock_irqrestore(&zone->lock, flags);
+/* Obtain a single element from the buddy allocator */
+static struct page *rmqueue(struct zone *zone, unsigned int order)
+{
+	unsigned long flags;
+	struct page *page;
 
-	return NULL;
+	spin_lock_irqsave(&zone->lock, flags);
+	page = __rmqueue(zone, order);
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	if (page != NULL) {
+		BUG_ON(bad_range(zone, page));
+		prep_new_page(page);
+	}
+	return page;
+}
+
+/*
+ * Obtain a specified number of elements from the buddy allocator, all under
+ * a single hold of the lock, for efficiency.  Add them to the supplied list.
+ * Returns the number of new pages which were placed at *list.
+ */
+static int rmqueue_bulk(struct zone *zone, unsigned int order,
+			unsigned long count, struct list_head *list)
+{
+	unsigned long flags;
+	int i, allocated = 0;
+	struct page *page;
+	struct list_head *curr;
+	LIST_HEAD(temp);
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (i = 0; i < count; ++i) {
+		page = __rmqueue(zone, order);
+		if (page == NULL)
+			break;
+		++allocated;
+		list_add(&page->list, &temp);
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	/*
+	 * This may look inefficient because we're walking the list again,
+	 * but the cachelines are hot, so it's very cheap, and this way we
+	 * can drop the zone lock much earlier
+	 */
+	list_for_each(curr, &temp) {
+		page = list_entry(curr, struct page, list);
+		BUG_ON(bad_range(zone, page));
+		prep_new_page(page);
+	}
+	list_splice(&temp, list->prev);
+	return allocated;
+}
 
 #ifdef CONFIG_SOFTWARE_SUSPEND