Commit 074c2381 authored by Mel Gorman's avatar Mel Gorman Committed by Linus Torvalds

mm: numa: slow PTE scan rate if migration failures occur

Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226

  Across the board the 4.0-rc1 numbers are much slower, and the degradation
  is far worse when using the large memory footprint configs. Perf points
  straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config:

   -   56.07%    56.07%  [kernel]            [k] default_send_IPI_mask_sequence_phys
      - default_send_IPI_mask_sequence_phys
         - 99.99% physflat_send_IPI_mask
            - 99.37% native_send_call_func_ipi
                 smp_call_function_many
               - native_flush_tlb_others
                  - 99.85% flush_tlb_page
                       ptep_clear_flush
                       try_to_unmap_one
                       rmap_walk
                       try_to_unmap
                       migrate_pages
                       migrate_misplaced_page
                     - handle_mm_fault
                        - 99.73% __do_page_fault
                             trace_do_page_fault
                             do_async_page_fault
                           + async_page_fault
              0.63% native_send_call_func_single_ipi
                 generic_exec_single
                 smp_call_function_single

This is showing excessive migration activity even though excessive
migrations are meant to get throttled.  Normally, the scan rate is tuned
on a per-task basis depending on the locality of faults.  However, if
migrations fail for any reason then the PTE scanner may scan faster if
the faults continue to be remote.  This means there is higher system CPU
overhead and fault trapping at exactly the time we know that migrations
cannot happen.  This patch tracks when migration failures occur and
slows the PTE scanner.
Signed-off-by: default avatarMel Gorman <mgorman@suse.de>
Reported-by: default avatarDave Chinner <david@fromorbit.com>
Tested-by: default avatarDave Chinner <david@fromorbit.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent b191f9b1
...@@ -1625,11 +1625,11 @@ struct task_struct { ...@@ -1625,11 +1625,11 @@ struct task_struct {
/* /*
* numa_faults_locality tracks if faults recorded during the last * numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local. The task scan period is adapted * scan window were remote/local or failed to migrate. The task scan
* based on the locality of the faults with different weights * period is adapted based on the locality of the faults with different
* depending on whether they were shared or private faults * weights depending on whether they were shared or private faults
*/ */
unsigned long numa_faults_locality[2]; unsigned long numa_faults_locality[3];
unsigned long numa_pages_migrated; unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */
...@@ -1719,6 +1719,7 @@ struct task_struct { ...@@ -1719,6 +1719,7 @@ struct task_struct {
#define TNF_NO_GROUP 0x02 #define TNF_NO_GROUP 0x02
#define TNF_SHARED 0x04 #define TNF_SHARED 0x04
#define TNF_FAULT_LOCAL 0x08 #define TNF_FAULT_LOCAL 0x08
#define TNF_MIGRATE_FAIL 0x10
#ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags); extern void task_numa_fault(int last_node, int node, int pages, int flags);
......
...@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, ...@@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p,
/* /*
* If there were no record hinting faults then either the task is * If there were no record hinting faults then either the task is
* completely idle or all activity is areas that are not of interest * completely idle or all activity is areas that are not of interest
* to automatic numa balancing. Scan slower * to automatic numa balancing. Related to that, if there were failed
* migration then it implies we are migrating too quickly or the local
* node is overloaded. In either case, scan slower
*/ */
if (local + shared == 0) { if (local + shared == 0 || p->numa_faults_locality[2]) {
p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period = min(p->numa_scan_period_max,
p->numa_scan_period << 1); p->numa_scan_period << 1);
...@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) ...@@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated) if (migrated)
p->numa_pages_migrated += pages; p->numa_pages_migrated += pages;
if (flags & TNF_MIGRATE_FAIL)
p->numa_faults_locality[2] += pages;
p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
......
...@@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (migrated) { if (migrated) {
flags |= TNF_MIGRATED; flags |= TNF_MIGRATED;
page_nid = target_nid; page_nid = target_nid;
} } else
flags |= TNF_MIGRATE_FAIL;
goto out; goto out;
clear_pmdnuma: clear_pmdnuma:
......
...@@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (migrated) { if (migrated) {
page_nid = target_nid; page_nid = target_nid;
flags |= TNF_MIGRATED; flags |= TNF_MIGRATED;
} } else
flags |= TNF_MIGRATE_FAIL;
out: out:
if (page_nid != -1) if (page_nid != -1)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment