Commit c1c3443c, authored by David Rientjes, committed by H. Peter Anvin

x86, numa: Fake node-to-cpumask for NUMA emulation

It's necessary to fake the node-to-cpumask mapping so that an emulated
node ID returns a cpumask that includes all cpus that have affinity to
the memory it represents.

This is a little intrusive because it requires knowledge of the physical
topology of the system.  setup_physnodes() gives us that information, but
since NUMA emulation ends up altering the physnodes array, it's necessary
to reset it before cpus are brought online.

Accordingly, the physnodes array is moved out of init.data and into
cpuinit.data since it will be needed on cpuup callbacks.

This works regardless of whether numa=fake is used on the command line,
or the setup of the fake node succeeds or fails.  The physnodes array
always contains the physical topology of the machine if CONFIG_NUMA_EMU
is enabled and can be used to set up the correct node-to-cpumask mappings
in all cases since setup_physnodes() is called whenever the array needs
to be repopulated with the correct data.

To fake the actual mappings, numa_add_cpu() and numa_remove_cpu() are
rewritten for CONFIG_NUMA_EMU so that we first find the physical node to
which each cpu has local affinity, then iterate through all online nodes
to find the emulated nodes that have local affinity to that physical
node, and then finally map the cpu to each of those emulated nodes.
Signed-off-by: David Rientjes <rientjes@google.com>
LKML-Reference: <alpine.DEB.2.00.1012221701520.3701@chino.kir.corp.google.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
parent f51bf307
...@@ -260,7 +260,7 @@ void __init numa_init_array(void) ...@@ -260,7 +260,7 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU #ifdef CONFIG_NUMA_EMU
/* Numa emulation */ /* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata; static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode physnodes[MAX_NUMNODES] __initdata; static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata; static char *cmdline __initdata;
static int __init setup_physnodes(unsigned long start, unsigned long end, static int __init setup_physnodes(unsigned long start, unsigned long end,
...@@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, ...@@ -270,6 +270,7 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
int ret = 0; int ret = 0;
int i; int i;
memset(physnodes, 0, sizeof(physnodes));
#ifdef CONFIG_ACPI_NUMA #ifdef CONFIG_ACPI_NUMA
if (acpi) if (acpi)
nr_nodes = acpi_get_nodes(physnodes); nr_nodes = acpi_get_nodes(physnodes);
...@@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) ...@@ -370,8 +371,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated. * to max_addr. The return value is the number of nodes allocated.
*/ */
static int __init split_nodes_interleave(u64 addr, u64 max_addr, static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
int nr_phys_nodes, int nr_nodes)
{ {
nodemask_t physnode_mask = NODE_MASK_NONE; nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size; u64 size;
...@@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr, ...@@ -402,7 +402,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
return -1; return -1;
} }
for (i = 0; i < nr_phys_nodes; i++) for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end) if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask); node_set(i, physnode_mask);
...@@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn, ...@@ -571,11 +571,9 @@ static int __init numa_emulation(unsigned long start_pfn,
{ {
u64 addr = start_pfn << PAGE_SHIFT; u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT;
int num_phys_nodes;
int num_nodes; int num_nodes;
int i; int i;
num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd);
/* /*
* If the numa=fake command-line contains a 'M' or 'G', it represents * If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N, * the fixed node size. Otherwise, if it is just a single number N,
...@@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn, ...@@ -590,7 +588,7 @@ static int __init numa_emulation(unsigned long start_pfn,
unsigned long n; unsigned long n;
n = simple_strtoul(cmdline, NULL, 0); n = simple_strtoul(cmdline, NULL, 0);
num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n); num_nodes = split_nodes_interleave(addr, max_addr, n);
} }
if (num_nodes < 0) if (num_nodes < 0)
...@@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn, ...@@ -613,6 +611,7 @@ static int __init numa_emulation(unsigned long start_pfn,
nodes[i].end >> PAGE_SHIFT); nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end); setup_node_bootmem(i, nodes[i].start, nodes[i].end);
} }
setup_physnodes(addr, max_addr, acpi, amd);
fake_physnodes(acpi, amd, num_nodes); fake_physnodes(acpi, amd, num_nodes);
numa_init_array(); numa_init_array();
return 0; return 0;
...@@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, ...@@ -628,8 +627,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map); nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU #ifdef CONFIG_NUMA_EMU
setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
acpi, amd);
if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return; return;
setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
acpi, amd);
nodes_clear(node_possible_map); nodes_clear(node_possible_map);
nodes_clear(node_online_map); nodes_clear(node_online_map);
#endif #endif
...@@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu) ...@@ -785,6 +788,7 @@ void __cpuinit numa_clear_node(int cpu)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS #ifndef CONFIG_DEBUG_PER_CPU_MAPS
#ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu) void __cpuinit numa_add_cpu(int cpu)
{ {
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
...@@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu) ...@@ -794,6 +798,51 @@ void __cpuinit numa_remove_cpu(int cpu)
{ {
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]); cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
} }
#else
/*
 * numa_add_cpu - CONFIG_NUMA_EMU variant.
 *
 * Sets @cpu in node_to_cpumask_map[] for every online (emulated) node whose
 * starting address lies inside the same physnodes[] range as the node that
 * @cpu itself belongs to.
 *
 * The cpu's node is taken from apicid_to_node[] when the cpu has a valid
 * apic id, and from early_cpu_to_node() otherwise.  It is a BUG if no online
 * node can be determined.
 */
void __cpuinit numa_add_cpu(int cpu)
{
	unsigned long addr;
	u16 apicid;
	int physnid;
	int nid = NUMA_NO_NODE;

	apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
	if (apicid != BAD_APICID)
		nid = apicid_to_node[apicid];
	if (nid == NUMA_NO_NODE)
		nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	/*
	 * Use the starting address of the emulated node to find which physical
	 * node it is allocated on.
	 */
	addr = node_start_pfn(nid) << PAGE_SHIFT;
	for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
		if (addr >= physnodes[physnid].start &&
		    addr < physnodes[physnid].end)
			break;

	/*
	 * No physnodes[] entry covers the node's start address.  Indexing
	 * physnodes[MAX_NUMNODES] below would read past the end of the array,
	 * so fall back to mapping the cpu to its own node only.
	 */
	if (physnid == MAX_NUMNODES) {
		cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
		return;
	}

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid) {
		addr = node_start_pfn(nid) << PAGE_SHIFT;
		if (addr >= physnodes[physnid].start &&
		    addr < physnodes[physnid].end)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
	}
}
/*
 * numa_remove_cpu - CONFIG_NUMA_EMU variant.
 *
 * Clears @cpu from the node_to_cpumask_map[] entry of every online node.
 */
void __cpuinit numa_remove_cpu(int cpu)
{
	int nid;

	for_each_online_node(nid) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[nid]);
	}
}
#endif /* !CONFIG_NUMA_EMU */
#else /* CONFIG_DEBUG_PER_CPU_MAPS */ #else /* CONFIG_DEBUG_PER_CPU_MAPS */
...@@ -805,10 +854,18 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) ...@@ -805,10 +854,18 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
int node = early_cpu_to_node(cpu); int node = early_cpu_to_node(cpu);
struct cpumask *mask; struct cpumask *mask;
char buf[64]; char buf[64];
int i;
for_each_online_node(i) {
unsigned long addr;
addr = node_start_pfn(i) << PAGE_SHIFT;
if (addr < physnodes[node].start ||
addr >= physnodes[node].end)
continue;
mask = node_to_cpumask_map[node]; mask = node_to_cpumask_map[node];
if (mask == NULL) { if (mask == NULL) {
printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node); pr_err("node_to_cpumask_map[%i] NULL\n", i);
dump_stack(); dump_stack();
return; return;
} }
...@@ -820,7 +877,9 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable) ...@@ -820,7 +877,9 @@ static void __cpuinit numa_set_cpumask(int cpu, int enable)
cpulist_scnprintf(buf, sizeof(buf), mask); cpulist_scnprintf(buf, sizeof(buf), mask);
printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf); enable ? "numa_add_cpu" : "numa_remove_cpu",
cpu, node, buf);
}
} }
void __cpuinit numa_add_cpu(int cpu) void __cpuinit numa_add_cpu(int cpu)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment