[PATCH] low-latency zap_page_range

zap_page_range and truncate are the two main latency problems in the VM/VFS. The radix-tree-based truncate grinds that into the dust, but no algorithmic fixes for pagetable takedown have presented themselves... Patch from Robert Love. Attached patch implements a low latency version of "zap_page_range()". Calls with even moderately large page ranges result in very long lock-held times and consequently very long periods of non-preemptibility. This function is in my list of the top 3 worst offenders. It is gross. This new version reimplements zap_page_range() as a loop over ZAP_BLOCK_SIZE chunks. After each iteration, if a reschedule is pending, we drop page_table_lock and automagically preempt. Note we cannot blindly drop the locks and reschedule (e.g. for the non-preempt case) since there is a possibility to enter this codepath holding other locks. ... I am sure you are familiar with all this, it's the same deal as your low-latency work. This patch implements the "cond_resched_lock()" as we discussed some time back. I think this solution should be acceptable to you and Linus. There are other misc. cleanups, too. This new zap_page_range() yields latency too-low-to-benchmark: <<1ms.

[PATCH] low-latency zap_page_range
zap_page_range and truncate are the two main latency problems in the VM/VFS. The radix-tree-based truncate grinds that into the dust, but no algorithmic fixes for pagetable takedown have presented themselves... Patch from Robert Love. Attached patch implements a low latency version of "zap_page_range()". Calls with even moderately large page ranges result in very long lock-held times and consequently very long periods of non-preemptibility. This function is in my list of the top 3 worst offenders. It is gross. This new version reimplements zap_page_range() as a loop over ZAP_BLOCK_SIZE chunks. After each iteration, if a reschedule is pending, we drop page_table_lock and automagically preempt. Note we cannot blindly drop the locks and reschedule (e.g. for the non-preempt case) since there is a possibility to enter this codepath holding other locks. ... I am sure you are familiar with all this, it's the same deal as your low-latency work. This patch implements the "cond_resched_lock()" as we discussed some time back. I think this solution should be acceptable to you and Linus. There are other misc. cleanups, too. This new zap_page_range() yields latency too-low-to-benchmark: <<1ms.
e572ef2e · Andrew Morton · Christoph Hellwig · 697f3abe · e572ef2e · e572ef2e
Commit e572ef2e authored Sep 15, 2002 by Andrew Morton Committed by Christoph Hellwig Sep 15, 2002
Show whitespace changes
Inline Side-by-side

Showing with 72 additions and 18 deletions

include/linux/sched.h include/linux/sched.h +28 -0

mm/memory.c mm/memory.c +44 -18

No files found.
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -956,6 +956,34 @@ static inline void cond_resched(void)
 		__cond_resched();
 }
+#ifdef CONFIG_PREEMPT
+/*
+ * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * Note: this does not assume the given lock is the _only_ lock held.
+ * The kernel preemption counter gives us "free" checking that we are
+ * atomic -- let's use it.
+ */
+static inline void cond_resched_lock(spinlock_t * lock)
+{
+	if (need_resched() && preempt_count() == 1) {
+		_raw_spin_unlock(lock);
+		preempt_enable_no_resched();
+		__cond_resched();
+		spin_lock(lock);
+	}
+}
+#else
+static inline void cond_resched_lock(spinlock_t * lock)
+{
+}
+#endif
 /* Reevaluate whether the task has signals pending delivery.
   This is required every time the blocked sigset_t changes.
   All callers should have t->sigmask_lock.  */

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -389,8 +389,8 @@ void unmap_page_range(mmu_gather_t *tlb, struct vm_area_struct *vma, unsigned lo
 {
 	pgd_t * dir;
-	if (address >= end)
+	BUG_ON(address >= end);
-		BUG();
 	dir = pgd_offset(vma->vm_mm, address);
 	tlb_start_vma(tlb, vma);
 	do {
@@ -401,30 +401,56 @@ void unmap_page_range(mmu_gather_t *tlb, struct vm_area_struct *vma, unsigned lo
 	tlb_end_vma(tlb, vma);
 }
-/*
+/* Dispose of an entire mmu_gather_t per rescheduling point */
- * remove user pages in a given range.
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(FREE_PTE_NR * PAGE_SIZE)
+#endif
+/* For UP, 256 pages at a time gives nice low latency */
+#if !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(256 * PAGE_SIZE)
+#endif
+/* No preempt: go for the best straight-line efficiency */
+#if !defined(CONFIG_PREEMPT)
+#define ZAP_BLOCK_SIZE	(~(0UL))
+#endif
+/**
+ * zap_page_range - remove user pages in a given range
+ * @vma: vm_area_struct holding the applicable pages
+ * @address: starting address of pages to zap
+ * @size: number of bytes to zap
 */
 void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	mmu_gather_t *tlb;
-	unsigned long start = address, end = address + size;
+	unsigned long end, block;
+	spin_lock(&mm->page_table_lock);
  	/*
-	 * This is a long-lived spinlock. That's fine.
+ 	 * This was once a long-held spinlock.  Now we break the
-	 * There's no contention, because the page table
+ 	 * work up into ZAP_BLOCK_SIZE units and relinquish the
-	 * lock only protects against kswapd anyway, and
+ 	 * lock after each iteration.  This drastically lowers
-	 * even if kswapd happened to be looking at this
+ 	 * lock contention and allows for a preemption point.
-	 * process we _want_ it to get stuck.
  	 */
-	if (address >= end)
+	while (size) {
-		BUG();
+		block = (size > ZAP_BLOCK_SIZE) ? ZAP_BLOCK_SIZE : size;
-	spin_lock(&mm->page_table_lock);
+ 		end = address + block;
-	flush_cache_range(vma, address, end);
+ 		flush_cache_range(vma, address, end);
 		tlb = tlb_gather_mmu(mm, 0);
 		unmap_page_range(tlb, vma, address, end);
-	tlb_finish_mmu(tlb, start, end);
+ 		tlb_finish_mmu(tlb, address, end);
+ 		cond_resched_lock(&mm->page_table_lock);
+ 		address += block;
+ 		size -= block;
+ 	}
 	spin_unlock(&mm->page_table_lock);
 }