Commit b4adddd6 authored by Andrew Morton, committed by Richard Henderson

[PATCH] low-latency pagetable teardown

Pagetable teardown can hold page_table_lock for extremely long periods -
hundreds of milliseconds.  This is pretty much the final source of high
scheduling latency in the core kernel.

We fixed it for zap_page_range() by chunking the work up and dropping the
lock occasionally if needed.  But that did not fix exit_mmap() and
unmap_region().

So what this patch does is create an uber-zapper, unmap_vmas(), which
provides all the vma walking, page unmapping and low-latency lock dropping
that zap_page_range(), exit_mmap() and unmap_region() require.  Those three
functions are updated to call unmap_vmas().

It's actually a bit of a cleanup...
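
For orientation, here is a minimal sketch of the calling pattern the three
sites converge on, as the hunks below show in full.  The wrapper name
teardown_range() is invented purely for illustration; the real call sites
also do cache flushing, page-table freeing and map_count bookkeeping around
this core:

static void teardown_range(struct mm_struct *mm, struct vm_area_struct *vma,
			   unsigned long start, unsigned long end)
{
	struct mmu_gather *tlb;
	unsigned long nr_accounted = 0;

	spin_lock(&mm->page_table_lock);
	tlb = tlb_gather_mmu(mm, 0);
	/*
	 * unmap_vmas() may drop and retake page_table_lock and start a
	 * fresh mmu_gather while it works, so it takes &tlb and leaves
	 * the live gather behind for the final flush.
	 */
	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
	vm_unacct_memory(nr_accounted);	/* settle VM_ACCOUNT charges */
	tlb_finish_mmu(tlb, start, end);
	spin_unlock(&mm->page_table_lock);
}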
parent 670fe925
@@ -141,6 +141,7 @@ struct vm_operations_struct {
/* forward declaration; pte_chain is meant to be internal to rmap.c */
struct pte_chain;
struct mmu_gather;
/*
* Each physical page in the system has a struct page associated with
@@ -357,15 +358,26 @@ extern struct page *mem_map;
extern void show_free_areas(void);
struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int unused);
struct page *shmem_nopage(struct vm_area_struct * vma,
unsigned long address, int unused);
struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags);
extern void shmem_lock(struct file * file, int lock);
extern int shmem_zero_setup(struct vm_area_struct *);
extern void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size);
extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma);
extern int remap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long to, unsigned long size, pgprot_t prot);
extern int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot);
void shmem_lock(struct file * file, int lock);
int shmem_zero_setup(struct vm_area_struct *);
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted);
void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long size);
void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
int remap_page_range(struct vm_area_struct *vma, unsigned long from,
unsigned long to, unsigned long size, pgprot_t prot);
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
unsigned long size, pgprot_t prot);
extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
@@ -454,8 +454,6 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
BUG_ON(address >= end);
lru_add_drain();
dir = pgd_offset(vma->vm_mm, address);
tlb_start_vma(tlb, vma);
do {
@@ -481,17 +479,106 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
#define ZAP_BLOCK_SIZE (~(0UL))
#endif
/**
* unmap_vmas - unmap a range of memory covered by a list of vma's
* @tlbp: address of the caller's struct mmu_gather
* @mm: the controlling mm_struct
* @vma: the starting vma
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
*
* Returns the number of vma's which were covered by the unmapping.
*
* Unmap all pages in the vma list. Called under page_table_lock.
*
* We aim to not hold page_table_lock for too long (for scheduling latency
* reasons). So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to
* return the ending mmu_gather to the caller.
*
* Only addresses between `start' and `end' will be unmapped.
*
* The VMA list must be sorted in ascending virtual address order.
*
* unmap_vmas() assumes that the caller will flush the whole unmapped address
* range after unmap_vmas() returns. So the only responsibility here is to
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted)
{
unsigned long zap_bytes = ZAP_BLOCK_SIZE;
unsigned long tlb_start; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
int ret = 0;
if (vma) { /* debug. killme. */
if (end_addr <= vma->vm_start)
printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n",
__FUNCTION__, end_addr, vma->vm_start);
if (start_addr >= vma->vm_end)
printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n",
__FUNCTION__, start_addr, vma->vm_end);
}
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long start;
unsigned long end;
start = max(vma->vm_start, start_addr);
if (start >= vma->vm_end)
continue;
end = min(vma->vm_end, end_addr);
if (end <= vma->vm_start)
continue;
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;
ret++;
while (start != end) {
unsigned long block = min(zap_bytes, end - start);
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
unmap_page_range(*tlbp, vma, start, start + block);
start += block;
zap_bytes -= block;
if (zap_bytes != 0)
continue;
if (need_resched()) {
tlb_finish_mmu(*tlbp, tlb_start, start);
cond_resched_lock(&mm->page_table_lock);
*tlbp = tlb_gather_mmu(mm, 0);
tlb_start_valid = 0;
}
zap_bytes = ZAP_BLOCK_SIZE;
}
if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end)
printk("%s: VMA list is not sorted correctly!\n",
__FUNCTION__);
}
return ret;
}
/**
* zap_page_range - remove user pages in a given range
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
*/
void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size)
void zap_page_range(struct vm_area_struct *vma,
unsigned long address, unsigned long size)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather *tlb;
unsigned long end, block;
unsigned long end = address + size;
unsigned long nr_accounted = 0;
might_sleep();
@@ -501,30 +588,11 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned
}
lru_add_drain();
spin_lock(&mm->page_table_lock);
/*
* This was once a long-held spinlock. Now we break the
* work up into ZAP_BLOCK_SIZE units and relinquish the
lock after each iteration. This drastically lowers
* lock contention and allows for a preemption point.
*/
while (size) {
block = (size > ZAP_BLOCK_SIZE) ? ZAP_BLOCK_SIZE : size;
end = address + block;
flush_cache_range(vma, address, end);
tlb = tlb_gather_mmu(mm, 0);
unmap_page_range(tlb, vma, address, end);
tlb_finish_mmu(tlb, address, end);
cond_resched_lock(&mm->page_table_lock);
address += block;
size -= block;
}
flush_cache_range(vma, address, end);
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
}
@@ -23,9 +23,6 @@
#include <asm/pgalloc.h>
#include <asm/tlb.h>
extern void unmap_page_range(struct mmu_gather *,struct vm_area_struct *vma, unsigned long address, unsigned long size);
extern void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
/*
* WARNING: the debugging will use recursive algorithms so never enable this
* unless you know what you are doing.
@@ -1003,29 +1000,18 @@ static void unmap_vma_list(struct mm_struct *mm,
* Called with the page table lock held.
*/
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *mpnt,
struct vm_area_struct *vma,
struct vm_area_struct *prev,
unsigned long start,
unsigned long end)
{
struct mmu_gather *tlb;
unsigned long nr_accounted = 0;
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
do {
unsigned long from, to, len;
from = start < mpnt->vm_start ? mpnt->vm_start : start;
to = end > mpnt->vm_end ? mpnt->vm_end : end;
unmap_page_range(tlb, mpnt, from, to);
if (mpnt->vm_flags & VM_ACCOUNT) {
len = to - from;
vm_unacct_memory(len >> PAGE_SHIFT);
}
} while ((mpnt = mpnt->vm_next) != NULL);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
vm_unacct_memory(nr_accounted);
free_pgtables(tlb, prev, start, end);
tlb_finish_mmu(tlb, start, end);
}
@@ -1271,43 +1257,28 @@ void build_mmap_rb(struct mm_struct * mm)
}
/* Release all mmaps. */
void exit_mmap(struct mm_struct * mm)
void exit_mmap(struct mm_struct *mm)
{
struct mmu_gather *tlb;
struct vm_area_struct * mpnt;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
profile_exit_mmap(mm);
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 1);
flush_cache_mm(mm);
mpnt = mm->mmap;
while (mpnt) {
unsigned long start = mpnt->vm_start;
unsigned long end = mpnt->vm_end;
/*
* If the VMA has been charged for, account for its
* removal
*/
if (mpnt->vm_flags & VM_ACCOUNT)
vm_unacct_memory((end - start) >> PAGE_SHIFT);
mm->map_count--;
unmap_page_range(tlb, mpnt, start, end);
mpnt = mpnt->vm_next;
}
/* This is just debugging */
if (mm->map_count)
BUG();
mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
TASK_SIZE, &nr_accounted);
vm_unacct_memory(nr_accounted);
BUG_ON(mm->map_count); /* This is just debugging */
clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
tlb_finish_mmu(tlb, 0, TASK_SIZE);
mpnt = mm->mmap;
vma = mm->mmap;
mm->mmap = mm->mmap_cache = NULL;
mm->mm_rb = RB_ROOT;
mm->rss = 0;
@@ -1320,17 +1291,17 @@ void exit_mmap(struct mm_struct * mm)
* Walk the list again, actually closing and freeing it
* without holding any MM locks.
*/
while (mpnt) {
struct vm_area_struct * next = mpnt->vm_next;
remove_shared_vm_struct(mpnt);
if (mpnt->vm_ops) {
if (mpnt->vm_ops->close)
mpnt->vm_ops->close(mpnt);
while (vma) {
struct vm_area_struct *next = vma->vm_next;
remove_shared_vm_struct(vma);
if (vma->vm_ops) {
if (vma->vm_ops->close)
vma->vm_ops->close(vma);
}
if (mpnt->vm_file)
fput(mpnt->vm_file);
kmem_cache_free(vm_area_cachep, mpnt);
mpnt = next;
if (vma->vm_file)
fput(vma->vm_file);
kmem_cache_free(vm_area_cachep, vma);
vma = next;
}
}