Commit 71a2c112 authored by Kirill A. Shutemov, committed by Linus Torvalds

khugepaged: introduce 'max_ptes_shared' tunable

'max_ptes_shared' specifies how many pages can be shared across multiple
processes.  Exceeding the number would block the collapse::

	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared

A higher value may increase memory footprint for some workloads.

By default, at least half of the pages have to be not shared.

[colin.king@canonical.com: fix several spelling mistakes]
  Link: http://lkml.kernel.org/r/20200420084241.65433-1-colin.king@canonical.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Zi Yan <ziy@nvidia.com>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Acked-by: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Link: http://lkml.kernel.org/r/20200416160026.16538-9-kirill.shutemov@linux.intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3917c802
...@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being ...@@ -220,6 +220,13 @@ memory. A lower value can prevent THPs from being
collapsed, resulting fewer pages being collapsed into collapsed, resulting fewer pages being collapsed into
THPs, and lower memory access performance. THPs, and lower memory access performance.
``max_ptes_shared`` specifies how many pages can be shared across multiple
processes. Exceeding the number would block the collapse::
/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_shared
A higher value may increase memory footprint for some workloads.
Boot parameter Boot parameter
============== ==============
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_SUCCEED, "succeeded") \
EM( SCAN_PMD_NULL, "pmd_null") \ EM( SCAN_PMD_NULL, "pmd_null") \
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \
EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \
EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_PAGE_RO, "no_writable_page") \
...@@ -31,7 +33,6 @@ ...@@ -31,7 +33,6 @@
EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\ EM( SCAN_DEL_PAGE_LRU, "could_not_delete_page_from_lru")\
EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \ EM( SCAN_ALLOC_HUGE_PAGE_FAIL, "alloc_huge_page_failed") \
EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \ EM( SCAN_CGROUP_CHARGE_FAIL, "ccgroup_charge_failed") \
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
EM( SCAN_TRUNCATED, "truncated") \ EM( SCAN_TRUNCATED, "truncated") \
EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \ EMe(SCAN_PAGE_HAS_PRIVATE, "page_has_private") \
......
...@@ -28,6 +28,8 @@ enum scan_result { ...@@ -28,6 +28,8 @@ enum scan_result {
SCAN_SUCCEED, SCAN_SUCCEED,
SCAN_PMD_NULL, SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_NONE_PTE,
SCAN_EXCEED_SWAP_PTE,
SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT, SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP, SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO, SCAN_PAGE_RO,
...@@ -47,7 +49,6 @@ enum scan_result { ...@@ -47,7 +49,6 @@ enum scan_result {
SCAN_DEL_PAGE_LRU, SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL, SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL, SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED, SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE, SCAN_PAGE_HAS_PRIVATE,
}; };
...@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); ...@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*/ */
static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10 #define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
...@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = ...@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show, __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
khugepaged_max_ptes_swap_store); khugepaged_max_ptes_swap_store);
/*
 * Read handler for the max_ptes_shared attribute.
 *
 * Formats the current khugepaged_max_ptes_shared value into @buf as an
 * unsigned decimal number followed by a newline and returns whatever
 * sprintf() reports. @kobj and @attr are not used.
 */
static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
					       struct kobj_attribute *attr,
					       char *buf)
{
	const unsigned int limit = khugepaged_max_ptes_shared;

	return sprintf(buf, "%u\n", limit);
}
/*
 * Write handler for the max_ptes_shared attribute.
 *
 * Parses @buf as a base-10 unsigned long. A parse failure, or a value
 * greater than HPAGE_PMD_NR-1, is rejected with -EINVAL and leaves
 * khugepaged_max_ptes_shared untouched. Otherwise the new limit is
 * stored and @count is returned. @kobj and @attr are not used.
 */
static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
						struct kobj_attribute *attr,
						const char *buf, size_t count)
{
	unsigned long requested;
	int rc = kstrtoul(buf, 10, &requested);

	/* Only look at the parsed value once parsing has succeeded. */
	if (rc)
		return -EINVAL;
	if (requested > HPAGE_PMD_NR-1)
		return -EINVAL;

	khugepaged_max_ptes_shared = requested;
	return count;
}
/*
 * Attribute descriptor for "max_ptes_shared": declared with mode 0644 and
 * wired to the khugepaged_max_ptes_shared_show/_store handlers.
 */
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
__ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
khugepaged_max_ptes_shared_store);
static struct attribute *khugepaged_attr[] = { static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr, &khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr, &khugepaged_max_ptes_none_attr.attr,
&khugepaged_max_ptes_swap_attr.attr,
&khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr, &pages_to_scan_attr.attr,
&pages_collapsed_attr.attr, &pages_collapsed_attr.attr,
&full_scans_attr.attr, &full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr, &scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr, &alloc_sleep_millisecs_attr.attr,
&khugepaged_max_ptes_swap_attr.attr,
NULL, NULL,
}; };
...@@ -359,6 +389,7 @@ int __init khugepaged_init(void) ...@@ -359,6 +389,7 @@ int __init khugepaged_init(void)
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8; khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0; return 0;
} }
...@@ -557,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, ...@@ -557,7 +588,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{ {
struct page *page = NULL; struct page *page = NULL;
pte_t *_pte; pte_t *_pte;
int none_or_zero = 0, result = 0, referenced = 0; int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false; bool writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR; for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
...@@ -585,6 +616,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, ...@@ -585,6 +616,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page);
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
if (PageCompound(page)) { if (PageCompound(page)) {
struct page *p; struct page *p;
page = compound_head(page); page = compound_head(page);
...@@ -1168,7 +1205,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ...@@ -1168,7 +1205,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{ {
pmd_t *pmd; pmd_t *pmd;
pte_t *pte, *_pte; pte_t *pte, *_pte;
int ret = 0, none_or_zero = 0, result = 0, referenced = 0; int ret = 0, result = 0, referenced = 0;
int none_or_zero = 0, shared = 0;
struct page *page = NULL; struct page *page = NULL;
unsigned long _address; unsigned long _address;
spinlock_t *ptl; spinlock_t *ptl;
...@@ -1240,6 +1278,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ...@@ -1240,6 +1278,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap; goto out_unmap;
} }
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
page = compound_head(page); page = compound_head(page);
/* /*
......
...@@ -78,6 +78,7 @@ struct khugepaged_settings { ...@@ -78,6 +78,7 @@ struct khugepaged_settings {
unsigned int scan_sleep_millisecs; unsigned int scan_sleep_millisecs;
unsigned int max_ptes_none; unsigned int max_ptes_none;
unsigned int max_ptes_swap; unsigned int max_ptes_swap;
unsigned int max_ptes_shared;
unsigned long pages_to_scan; unsigned long pages_to_scan;
}; };
...@@ -277,6 +278,7 @@ static void write_settings(struct settings *settings) ...@@ -277,6 +278,7 @@ static void write_settings(struct settings *settings)
khugepaged->scan_sleep_millisecs); khugepaged->scan_sleep_millisecs);
write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
} }
...@@ -313,6 +315,7 @@ static void save_settings(void) ...@@ -313,6 +315,7 @@ static void save_settings(void)
read_num("khugepaged/scan_sleep_millisecs"), read_num("khugepaged/scan_sleep_millisecs"),
.max_ptes_none = read_num("khugepaged/max_ptes_none"), .max_ptes_none = read_num("khugepaged/max_ptes_none"),
.max_ptes_swap = read_num("khugepaged/max_ptes_swap"), .max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
.pages_to_scan = read_num("khugepaged/pages_to_scan"), .pages_to_scan = read_num("khugepaged/pages_to_scan"),
}; };
success("OK"); success("OK");
...@@ -896,12 +899,90 @@ static void collapse_fork_compound(void) ...@@ -896,12 +899,90 @@ static void collapse_fork_compound(void)
fail("Fail"); fail("Fail");
fill_memory(p, 0, page_size); fill_memory(p, 0, page_size);
write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
fail("Timeout"); fail("Timeout");
else if (check_huge(p)) else if (check_huge(p))
success("OK"); success("OK");
else else
fail("Fail"); fail("Fail");
write_num("khugepaged/max_ptes_shared",
default_settings.khugepaged.max_ptes_shared);
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
exit(exit_status);
}
wait(&wstatus);
exit_status += WEXITSTATUS(wstatus);
printf("Check if parent still has huge page...");
if (check_huge(p))
success("OK");
else
fail("Fail");
validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size);
}
static void collapse_max_ptes_shared()
{
int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
int wstatus;
void *p;
p = alloc_mapping();
printf("Allocate huge page...");
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
fill_memory(p, 0, hpage_pmd_size);
if (check_huge(p))
success("OK");
else
fail("Fail");
printf("Share huge page over fork()...");
if (!fork()) {
/* Do not touch settings on child exit */
skip_settings_restore = true;
exit_status = 0;
if (check_huge(p))
success("OK");
else
fail("Fail");
printf("Trigger CoW on page %d of %d...",
hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
if (!check_huge(p))
success("OK");
else
fail("Fail");
if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
fail("Timeout");
else if (!check_huge(p))
success("OK");
else
fail("Fail");
printf("Trigger CoW on page %d of %d...",
hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
if (!check_huge(p))
success("OK");
else
fail("Fail");
if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
fail("Timeout");
else if (check_huge(p))
success("OK");
else
fail("Fail");
validate_memory(p, 0, hpage_pmd_size); validate_memory(p, 0, hpage_pmd_size);
munmap(p, hpage_pmd_size); munmap(p, hpage_pmd_size);
...@@ -930,6 +1011,7 @@ int main(void) ...@@ -930,6 +1011,7 @@ int main(void)
default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
save_settings(); save_settings();
...@@ -947,6 +1029,7 @@ int main(void) ...@@ -947,6 +1029,7 @@ int main(void)
collapse_compound_extreme(); collapse_compound_extreme();
collapse_fork(); collapse_fork();
collapse_fork_compound(); collapse_fork_compound();
collapse_max_ptes_shared();
restore_settings(0); restore_settings(0);
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment