Commit 3bc2b6a7 authored by Muchun Song's avatar Muchun Song Committed by Linus Torvalds

mm: sparsemem: split the huge PMD mapping of vmemmap pages

Patch series "Split huge PMD mapping of vmemmap pages", v4.

In order to reduce the difficulty of code review in series[1].  We disable
huge PMD mapping of vmemmap pages when that feature is enabled.  In this
series, we do not disable huge PMD mapping of vmemmap pages anymore.  We
will split huge PMD mapping when needed.  When HugeTLB pages are freed
from the pool we do not attempt coalasce and move back to a PMD mapping
because it is much more complex.

[1] https://lore.kernel.org/linux-doc/20210510030027.56044-1-songmuchun@bytedance.com/

This patch (of 3):

In [1], PMD mappings of vmemmap pages were disabled if the the feature
hugetlb_free_vmemmap was enabled.  This was done to simplify the initial
implementation of vmmemap freeing for hugetlb pages.  Now, remove this
simplification by allowing PMD mapping and switching to PTE mappings as
needed for allocated hugetlb pages.

When a hugetlb page is allocated, the vmemmap page tables are walked to
free vmemmap pages.  During this walk, split huge PMD mappings to PTE
mappings as required.  In the unlikely case PTE pages can not be
allocated, return error(ENOMEM) and do not optimize vmemmap of the hugetlb
page.

When HugeTLB pages are freed from the pool, we do not attempt to
coalesce and move back to a PMD mapping because it is much more complex.

[1] https://lkml.kernel.org/r/20210510030027.56044-8-songmuchun@bytedance.com

Link: https://lkml.kernel.org/r/20210616094915.34432-1-songmuchun@bytedance.com
Link: https://lkml.kernel.org/r/20210616094915.34432-2-songmuchun@bytedance.comSigned-off-by: default avatarMuchun Song <songmuchun@bytedance.com>
Reviewed-by: default avatarMike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 8cc5fcbb
...@@ -3076,8 +3076,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip) ...@@ -3076,8 +3076,8 @@ static inline void print_vma_addr(char *prefix, unsigned long rip)
} }
#endif #endif
void vmemmap_remap_free(unsigned long start, unsigned long end, int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse); unsigned long reuse);
int vmemmap_remap_alloc(unsigned long start, unsigned long end, int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask); unsigned long reuse, gfp_t gfp_mask);
......
...@@ -258,9 +258,8 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head) ...@@ -258,9 +258,8 @@ void free_huge_page_vmemmap(struct hstate *h, struct page *head)
* to the page which @vmemmap_reuse is mapped to, then free the pages * to the page which @vmemmap_reuse is mapped to, then free the pages
* which the range [@vmemmap_addr, @vmemmap_end] is mapped to. * which the range [@vmemmap_addr, @vmemmap_end] is mapped to.
*/ */
vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse); if (!vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse))
SetHPageVmemmapOptimized(head);
SetHPageVmemmapOptimized(head);
} }
void __init hugetlb_vmemmap_init(struct hstate *h) void __init hugetlb_vmemmap_init(struct hstate *h)
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
* struct vmemmap_remap_walk - walk vmemmap page table * struct vmemmap_remap_walk - walk vmemmap page table
* *
* @remap_pte: called for each lowest-level entry (PTE). * @remap_pte: called for each lowest-level entry (PTE).
* @nr_walked: the number of walked pte.
* @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_page: the page which is reused for the tail vmemmap pages.
* @reuse_addr: the virtual address of the @reuse_page page. * @reuse_addr: the virtual address of the @reuse_page page.
* @vmemmap_pages: the list head of the vmemmap pages that can be freed * @vmemmap_pages: the list head of the vmemmap pages that can be freed
...@@ -46,11 +47,44 @@ ...@@ -46,11 +47,44 @@
struct vmemmap_remap_walk { struct vmemmap_remap_walk {
void (*remap_pte)(pte_t *pte, unsigned long addr, void (*remap_pte)(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk); struct vmemmap_remap_walk *walk);
unsigned long nr_walked;
struct page *reuse_page; struct page *reuse_page;
unsigned long reuse_addr; unsigned long reuse_addr;
struct list_head *vmemmap_pages; struct list_head *vmemmap_pages;
}; };
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
struct vmemmap_remap_walk *walk)
{
pmd_t __pmd;
int i;
unsigned long addr = start;
struct page *page = pmd_page(*pmd);
pte_t *pgtable = pte_alloc_one_kernel(&init_mm);
if (!pgtable)
return -ENOMEM;
pmd_populate_kernel(&init_mm, &__pmd, pgtable);
for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
pte_t entry, *pte;
pgprot_t pgprot = PAGE_KERNEL;
entry = mk_pte(page + i, pgprot);
pte = pte_offset_kernel(&__pmd, addr);
set_pte_at(&init_mm, addr, pte, entry);
}
/* Make pte visible before pmd. See comment in __pte_alloc(). */
smp_wmb();
pmd_populate_kernel(&init_mm, pmd, pgtable);
flush_tlb_kernel_range(start, start + PMD_SIZE);
return 0;
}
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long end,
struct vmemmap_remap_walk *walk) struct vmemmap_remap_walk *walk)
...@@ -69,58 +103,80 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, ...@@ -69,58 +103,80 @@ static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
*/ */
addr += PAGE_SIZE; addr += PAGE_SIZE;
pte++; pte++;
walk->nr_walked++;
} }
for (; addr != end; addr += PAGE_SIZE, pte++) for (; addr != end; addr += PAGE_SIZE, pte++) {
walk->remap_pte(pte, addr, walk); walk->remap_pte(pte, addr, walk);
walk->nr_walked++;
}
} }
static void vmemmap_pmd_range(pud_t *pud, unsigned long addr, static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
unsigned long end, unsigned long end,
struct vmemmap_remap_walk *walk) struct vmemmap_remap_walk *walk)
{ {
pmd_t *pmd; pmd_t *pmd;
unsigned long next; unsigned long next;
pmd = pmd_offset(pud, addr); pmd = pmd_offset(pud, addr);
do { do {
BUG_ON(pmd_leaf(*pmd)); if (pmd_leaf(*pmd)) {
int ret;
ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
if (ret)
return ret;
}
next = pmd_addr_end(addr, end); next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk); vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end); } while (pmd++, addr = next, addr != end);
return 0;
} }
static void vmemmap_pud_range(p4d_t *p4d, unsigned long addr, static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
unsigned long end, unsigned long end,
struct vmemmap_remap_walk *walk) struct vmemmap_remap_walk *walk)
{ {
pud_t *pud; pud_t *pud;
unsigned long next; unsigned long next;
pud = pud_offset(p4d, addr); pud = pud_offset(p4d, addr);
do { do {
int ret;
next = pud_addr_end(addr, end); next = pud_addr_end(addr, end);
vmemmap_pmd_range(pud, addr, next, walk); ret = vmemmap_pmd_range(pud, addr, next, walk);
if (ret)
return ret;
} while (pud++, addr = next, addr != end); } while (pud++, addr = next, addr != end);
return 0;
} }
static void vmemmap_p4d_range(pgd_t *pgd, unsigned long addr, static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
unsigned long end, unsigned long end,
struct vmemmap_remap_walk *walk) struct vmemmap_remap_walk *walk)
{ {
p4d_t *p4d; p4d_t *p4d;
unsigned long next; unsigned long next;
p4d = p4d_offset(pgd, addr); p4d = p4d_offset(pgd, addr);
do { do {
int ret;
next = p4d_addr_end(addr, end); next = p4d_addr_end(addr, end);
vmemmap_pud_range(p4d, addr, next, walk); ret = vmemmap_pud_range(p4d, addr, next, walk);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end); } while (p4d++, addr = next, addr != end);
return 0;
} }
static void vmemmap_remap_range(unsigned long start, unsigned long end, static int vmemmap_remap_range(unsigned long start, unsigned long end,
struct vmemmap_remap_walk *walk) struct vmemmap_remap_walk *walk)
{ {
unsigned long addr = start; unsigned long addr = start;
unsigned long next; unsigned long next;
...@@ -131,8 +187,12 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end, ...@@ -131,8 +187,12 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
pgd = pgd_offset_k(addr); pgd = pgd_offset_k(addr);
do { do {
int ret;
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
vmemmap_p4d_range(pgd, addr, next, walk); ret = vmemmap_p4d_range(pgd, addr, next, walk);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
/* /*
...@@ -141,6 +201,8 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end, ...@@ -141,6 +201,8 @@ static void vmemmap_remap_range(unsigned long start, unsigned long end,
* belongs to the range. * belongs to the range.
*/ */
flush_tlb_kernel_range(start + PAGE_SIZE, end); flush_tlb_kernel_range(start + PAGE_SIZE, end);
return 0;
} }
/* /*
...@@ -179,10 +241,27 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, ...@@ -179,10 +241,27 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
pte_t entry = mk_pte(walk->reuse_page, pgprot); pte_t entry = mk_pte(walk->reuse_page, pgprot);
struct page *page = pte_page(*pte); struct page *page = pte_page(*pte);
list_add(&page->lru, walk->vmemmap_pages); list_add_tail(&page->lru, walk->vmemmap_pages);
set_pte_at(&init_mm, addr, pte, entry); set_pte_at(&init_mm, addr, pte, entry);
} }
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
struct vmemmap_remap_walk *walk)
{
pgprot_t pgprot = PAGE_KERNEL;
struct page *page;
void *to;
BUG_ON(pte_page(*pte) != walk->reuse_page);
page = list_first_entry(walk->vmemmap_pages, struct page, lru);
list_del(&page->lru);
to = page_to_virt(page);
copy_page(to, (void *)walk->reuse_addr);
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
/** /**
* vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
* to the page which @reuse is mapped to, then free vmemmap * to the page which @reuse is mapped to, then free vmemmap
...@@ -193,12 +272,12 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, ...@@ -193,12 +272,12 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* remap. * remap.
* @reuse: reuse address. * @reuse: reuse address.
* *
* Note: This function depends on vmemmap being base page mapped. Please make * Return: %0 on success, negative error code otherwise.
* sure that we disable PMD mapping of vmemmap pages when calling this function.
*/ */
void vmemmap_remap_free(unsigned long start, unsigned long end, int vmemmap_remap_free(unsigned long start, unsigned long end,
unsigned long reuse) unsigned long reuse)
{ {
int ret;
LIST_HEAD(vmemmap_pages); LIST_HEAD(vmemmap_pages);
struct vmemmap_remap_walk walk = { struct vmemmap_remap_walk walk = {
.remap_pte = vmemmap_remap_pte, .remap_pte = vmemmap_remap_pte,
...@@ -221,25 +300,31 @@ void vmemmap_remap_free(unsigned long start, unsigned long end, ...@@ -221,25 +300,31 @@ void vmemmap_remap_free(unsigned long start, unsigned long end,
*/ */
BUG_ON(start - reuse != PAGE_SIZE); BUG_ON(start - reuse != PAGE_SIZE);
vmemmap_remap_range(reuse, end, &walk); mmap_write_lock(&init_mm);
free_vmemmap_page_list(&vmemmap_pages); ret = vmemmap_remap_range(reuse, end, &walk);
} mmap_write_downgrade(&init_mm);
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, if (ret && walk.nr_walked) {
struct vmemmap_remap_walk *walk) end = reuse + walk.nr_walked * PAGE_SIZE;
{ /*
pgprot_t pgprot = PAGE_KERNEL; * vmemmap_pages contains pages from the previous
struct page *page; * vmemmap_remap_range call which failed. These
void *to; * are pages which were removed from the vmemmap.
* They will be restored in the following call.
*/
walk = (struct vmemmap_remap_walk) {
.remap_pte = vmemmap_restore_pte,
.reuse_addr = reuse,
.vmemmap_pages = &vmemmap_pages,
};
BUG_ON(pte_page(*pte) != walk->reuse_page); vmemmap_remap_range(reuse, end, &walk);
}
mmap_read_unlock(&init_mm);
page = list_first_entry(walk->vmemmap_pages, struct page, lru); free_vmemmap_page_list(&vmemmap_pages);
list_del(&page->lru);
to = page_to_virt(page);
copy_page(to, (void *)walk->reuse_addr);
set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); return ret;
} }
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
...@@ -273,6 +358,8 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, ...@@ -273,6 +358,8 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
* remap. * remap.
* @reuse: reuse address. * @reuse: reuse address.
* @gfp_mask: GFP flag for allocating vmemmap pages. * @gfp_mask: GFP flag for allocating vmemmap pages.
*
* Return: %0 on success, negative error code otherwise.
*/ */
int vmemmap_remap_alloc(unsigned long start, unsigned long end, int vmemmap_remap_alloc(unsigned long start, unsigned long end,
unsigned long reuse, gfp_t gfp_mask) unsigned long reuse, gfp_t gfp_mask)
...@@ -287,12 +374,12 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end, ...@@ -287,12 +374,12 @@ int vmemmap_remap_alloc(unsigned long start, unsigned long end,
/* See the comment in the vmemmap_remap_free(). */ /* See the comment in the vmemmap_remap_free(). */
BUG_ON(start - reuse != PAGE_SIZE); BUG_ON(start - reuse != PAGE_SIZE);
might_sleep_if(gfpflags_allow_blocking(gfp_mask));
if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages)) if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
return -ENOMEM; return -ENOMEM;
mmap_read_lock(&init_mm);
vmemmap_remap_range(reuse, end, &walk); vmemmap_remap_range(reuse, end, &walk);
mmap_read_unlock(&init_mm);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment