Commit cbee9f88 authored by Peter Zijlstra, committed by Mel Gorman

mm: numa: Add fault driven placement and migration

NOTE: This patch is based on "sched, numa, mm: Add fault driven
	placement and migration policy" but as it throws away all the policy
	to just leave a basic foundation I had to drop the signed-offs-by.

This patch creates a bare-bones method for setting PTEs pte_numa in the
context of the scheduler that when faulted later will be faulted onto the
node the CPU is running on.  In itself this does nothing useful but any
placement policy will fundamentally depend on receiving hints on placement
from fault context and doing something intelligent about it.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
parent a720094d
......@@ -111,6 +111,7 @@ config VSYSCALL
config NUMA
bool "Non Uniform Memory Access (NUMA) Support"
depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL
select ARCH_WANT_NUMA_VARIABLE_LOCALITY
default n
help
Some SH systems have many various memories scattered around
......
......@@ -22,6 +22,8 @@ config X86
def_bool y
select HAVE_AOUT if X86_32
select HAVE_UNSTABLE_SCHED_CLOCK
select ARCH_SUPPORTS_NUMA_BALANCING
select ARCH_WANTS_PROT_NUMA_PROT_NONE
select HAVE_IDE
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
......
......@@ -397,6 +397,17 @@ struct mm_struct {
#endif
#ifdef CONFIG_CPUMASK_OFFSTACK
struct cpumask cpumask_allocation;
#endif
#ifdef CONFIG_NUMA_BALANCING
/*
* numa_next_scan is the next time when the PTEs will be marked
* pte_numa to gather statistics and migrate pages to new nodes
* if necessary
*/
unsigned long numa_next_scan;
/* numa_scan_seq prevents two threads setting pte_numa */
int numa_scan_seq;
#endif
struct uprobes_state uprobes_state;
};
......
......@@ -1479,6 +1479,14 @@ struct task_struct {
short il_next;
short pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
int numa_migrate_seq;
unsigned int numa_scan_period;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
#endif /* CONFIG_NUMA_BALANCING */
struct rcu_head rcu;
/*
......@@ -1553,6 +1561,14 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int node, int pages);
#else
static inline void task_numa_fault(int node, int pages)
{
}
#endif
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
......@@ -1990,6 +2006,10 @@ enum sched_tunable_scaling {
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_settle_count;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
......
......@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
p->mm->numa_next_scan = jiffies;
p->mm->numa_scan_seq = 0;
}
p->node_stamp = 0ULL;
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
p->numa_work.next = &p->numa_work;
#endif /* CONFIG_NUMA_BALANCING */
}
/*
......
......@@ -26,6 +26,8 @@
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/task_work.h>
#include <trace/events/sched.h>
......@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
#ifdef CONFIG_NUMA_BALANCING
/*
* numa task sample period in ms: 5s
*
* Bounds for the per-task NUMA scan period, in milliseconds.  The minimum
* (5000 ms) is the value a task's numa_scan_period is set to at fork and
* whenever it is found to be zero; the maximum is 16 times that (80000 ms).
* Both are exported as the numa_balancing_scan_period_{min,max}_ms sysctls.
*/
unsigned int sysctl_numa_balancing_scan_period_min = 5000;
unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
/*
 * Bring the task's private scan sequence number in line with the one
 * kept in its mm.  Nothing is done when the two already agree, so the
 * body only runs once for each new value observed in the mm.
 */
static void task_numa_placement(struct task_struct *p)
{
	/* Sample the shared counter exactly once. */
	int mm_seq = ACCESS_ONCE(p->mm->numa_scan_seq);

	if (p->numa_scan_seq != mm_seq) {
		p->numa_scan_seq = mm_seq;

		/* FIXME: Scheduling placement policy hints go here */
	}
}
/*
 * Got a PROT_NONE fault for a page on @node.
 *
 * @node:  node id reported by the fault handler
 * @pages: number of pages covered by the fault
 *
 * Neither argument is consumed yet; the fault merely gives the current
 * task a chance to run task_numa_placement().
 */
void task_numa_fault(int node, int pages)
{
	/* FIXME: Allocate task-specific structure for placement policy here */

	task_numa_placement(current);
}
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * @work: the numa_work callback head embedded in the current task.
 *
 * Once mm->numa_next_scan has passed, exactly one caller wins the right
 * to push the deadline forward, bumps mm->numa_scan_seq and then calls
 * change_prot_numa() on every migratable VMA of the mm while holding
 * mmap_sem for reading.
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long now = jiffies;
	unsigned long scan_due, scan_next;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	/*
	 * A self-referencing ->next marks the work as idle again so that
	 * task_tick_numa() may queue it anew (protects against double add).
	 */
	work->next = work;

	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	scan_due = mm->numa_next_scan;
	if (time_before(now, scan_due))
		return;

	if (!p->numa_scan_period)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	/*
	 * Move the deadline two scan periods ahead.  If somebody else changed
	 * numa_next_scan since it was sampled above, they own this round.
	 */
	scan_next = now + 2*msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, scan_due, scan_next) != scan_due)
		return;

	ACCESS_ONCE(mm->numa_scan_seq)++;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma_migratable(vma))
			change_prot_numa(vma, vma->vm_start, vma->vm_end);
	}
	up_read(&mm->mmap_sem);
}
/*
 * Drive the periodic memory faults..
 *
 * @rq:   runqueue the tick fired on (not used here)
 * @curr: task that was running when the tick fired
 *
 * Queues task_numa_work() on @curr once the task has accumulated more
 * than numa_scan_period milliseconds of runtime since the previous stamp
 * and the mm's next-scan deadline has been reached.
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 runtime, period;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm)
		return;
	if (curr->flags & PF_EXITING)
		return;
	/* ->next only points back at the work itself while it is not queued. */
	if (work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	runtime = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
	if (runtime - curr->node_stamp <= period)
		return;

	curr->node_stamp = runtime;

	/* The stamp is refreshed even when the mm is not yet due for a scan. */
	if (time_before(jiffies, curr->mm->numa_next_scan))
		return;

	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
	task_work_add(curr, work, true);
}
#else
/*
 * Stub used when CONFIG_NUMA_BALANCING is not set: the tick hook does nothing.
 */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
......@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (sched_feat_numa(NUMA))
task_tick_numa(rq, curr);
}
/*
......
......@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
SCHED_FEAT(FORCE_SD_OVERLAP, false)
SCHED_FEAT(RT_RUNTIME_SHARE, true)
SCHED_FEAT(LB_MIN, false)
/*
* Apply the automatic NUMA scheduling policy
*/
#ifdef CONFIG_NUMA_BALANCING
SCHED_FEAT(NUMA, true)
#endif
......@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
#ifdef CONFIG_NUMA_BALANCING
#define sched_feat_numa(x) sched_feat(x)
#else
#define sched_feat_numa(x) (0)
#endif
static inline u64 global_rt_period(void)
{
return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
......
......@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
static int min_wakeup_granularity_ns; /* 0 usecs */
static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
#ifdef CONFIG_SMP
static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
#endif
#endif /* CONFIG_SMP */
#endif /* CONFIG_SCHED_DEBUG */
#ifdef CONFIG_COMPACTION
static int min_extfrag_threshold;
......@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",
.data = &sysctl_sched_tunable_scaling,
......@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
#endif
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
.procname = "numa_balancing_scan_period_min_ms",
.data = &sysctl_numa_balancing_scan_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
......
......@@ -1046,6 +1046,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
split_huge_page(page);
put_page(page);
return 0;
clear_pmdnuma:
......@@ -1060,8 +1061,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
out_unlock:
spin_unlock(&mm->page_table_lock);
if (page)
if (page) {
put_page(page);
task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
}
return 0;
}
......
......@@ -3454,7 +3454,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *page = NULL;
spinlock_t *ptl;
int current_nid, target_nid;
int current_nid = -1;
int target_nid;
/*
* The "pte" at this point cannot be used safely without
......@@ -3501,6 +3502,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
current_nid = target_nid;
out:
task_numa_fault(current_nid, 1);
return 0;
}
......@@ -3537,6 +3539,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
pte_t pteval = *pte;
struct page *page;
int curr_nid;
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
......@@ -3554,6 +3557,15 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page))
continue;
/* only check non-shared pages */
if (unlikely(page_mapcount(page) != 1))
continue;
pte_unmap_unlock(pte, ptl);
curr_nid = page_to_nid(page);
task_numa_fault(curr_nid, 1);
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
pte_unmap_unlock(orig_pte, ptl);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment