Commit cc74ca30 authored by Jakub Kicinski

Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'

Yury Norov says:

====================
sched: cpumask: improve on cpumask_local_spread() locality

cpumask_local_spread() currently checks the local node for the presence of the
i'th CPU, and if it finds nothing there, makes a flat search among all non-local
CPUs. We can do better by walking CPUs per NUMA hop, in order of increasing
distance.

This has significant performance implications on NUMA machines, for example
when using NUMA-aware allocated memory together with NUMA-aware IRQ
affinity hints.

Performance tests from patch 8 of this series for the Mellanox network
driver show:

  TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on).
  Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121

  +-------------------------+-----------+------------------+------------------+
  |                         | BW (Gbps) | TX side CPU util | RX side CPU util |
  +-------------------------+-----------+------------------+------------------+
  | Baseline                | 52.3      | 6.4 %            | 17.9 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on TX side only | 52.6      | 5.2 %            | 18.5 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on RX side only | 94.9      | 11.9 %           | 27.2 %           |
  +-------------------------+-----------+------------------+------------------+
  | Applied on both sides   | 95.1      | 8.4 %            | 27.3 %           |
  +-------------------------+-----------+------------------+------------------+

  Bottleneck on the RX side is released, reaching line rate (~1.8x speedup).
  ~30% less CPU utilization on the TX side.
====================

Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 383d9f87 2ac4980c
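
The pattern the cover letter describes looks roughly like the sketch below: walk the hop
masks in order of increasing NUMA distance and, at each hop, visit only the CPUs not
already covered by a closer hop. The helper name, the 'nvecs' count and the 'cpus[]'
array are placeholders for illustration, not part of the series itself (the real driver
change is in the first hunk below).

static void spread_irqs_by_numa_hops(u16 *cpus, int nvecs, int numa_node)
{
        const struct cpumask *prev = cpu_none_mask;
        const struct cpumask *mask;
        int cpu, i = 0;

        rcu_read_lock();        /* for_each_numa_hop_mask() requires RCU */
        for_each_numa_hop_mask(mask, numa_node) {
                /* CPUs in this hop's mask that were not in a closer hop */
                for_each_cpu_andnot(cpu, mask, prev) {
                        cpus[i] = cpu;
                        if (++i == nvecs)
                                goto done;
                }
                prev = mask;
        }
done:
        rcu_read_unlock();
}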
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
 static int comp_irqs_request(struct mlx5_core_dev *dev)
 {
         struct mlx5_eq_table *table = dev->priv.eq_table;
+        const struct cpumask *prev = cpu_none_mask;
+        const struct cpumask *mask;
         int ncomp_eqs = table->num_comp_eqs;
         u16 *cpus;
         int ret;
+        int cpu;
         int i;

         ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
                 ret = -ENOMEM;
                 goto free_irqs;
         }
-        for (i = 0; i < ncomp_eqs; i++)
-                cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+
+        i = 0;
+        rcu_read_lock();
+        for_each_numa_hop_mask(mask, dev->priv.numa_node) {
+                for_each_cpu_andnot(cpu, mask, prev) {
+                        cpus[i] = cpu;
+                        if (++i == ncomp_eqs)
+                                goto spread_done;
+                }
+                prev = mask;
+        }
+spread_done:
+        rcu_read_unlock();
         ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
         kfree(cpus);
         if (ret < 0)
...
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                 nr_cpumask_bits, cpumask_check(cpu));
 }

+/**
+ * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
+ * @srcp1: the cpumask pointer
+ * @srcp2: the cpumask pointer
+ * @srcp3: the cpumask pointer
+ * @cpu: the N'th cpu to find, starting from 0
+ *
+ * Returns >= nr_cpu_ids if such cpu doesn't exist.
+ */
+static __always_inline
+unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
+                                    const struct cpumask *srcp2,
+                                    const struct cpumask *srcp3)
+{
+        return find_nth_and_andnot_bit(cpumask_bits(srcp1),
+                                       cpumask_bits(srcp2),
+                                       cpumask_bits(srcp3),
+                                       nr_cpumask_bits, cpumask_check(cpu));
+}
+
 #define CPU_BITS_NONE                                           \
 {                                                               \
         [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                  \
...
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
                                  unsigned long size, unsigned long n);
 unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                     unsigned long size, unsigned long n);
+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
+                                        const unsigned long *addr3, unsigned long size,
+                                        unsigned long n);
 extern unsigned long _find_first_and_bit(const unsigned long *addr1,
                                          const unsigned long *addr2, unsigned long size);
 extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
         return __find_nth_andnot_bit(addr1, addr2, size, n);
 }

+/**
+ * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
+ *                           excluding those set in 3rd region
+ * @addr1: The 1st address to start the search at
+ * @addr2: The 2nd address to start the search at
+ * @addr3: The 3rd address to start the search at
+ * @size: The maximum number of bits to search
+ * @n: The number of set bit, which position is needed, counting from 0
+ *
+ * Returns the bit number of the N'th set bit.
+ * If no such, returns @size.
+ */
+static __always_inline
+unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
+                                      const unsigned long *addr2,
+                                      const unsigned long *addr3,
+                                      unsigned long size, unsigned long n)
+{
+        if (n >= size)
+                return size;
+
+        if (small_const_nbits(size)) {
+                unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
+
+                return val ? fns(val, n) : size;
+        }
+
+        return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
+}
+
 #ifndef find_first_and_bit
 /**
  * find_first_and_bit - find the first set bit in both memory regions
...
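
In plain C terms, find_nth_and_andnot_bit() returns the position of the N'th bit set in
(addr1 & addr2 & ~addr3), or @size if there are fewer than N+1 such bits. The standalone
userspace snippet below models that semantic for a single word only; it is an
illustration, not the kernel implementation (which handles arbitrary bitmap lengths via
FIND_NTH_BIT).

#include <stdio.h>

/* Model of the semantics only: N'th set bit of (a & b & ~c) within one word. */
static unsigned long nth_and_andnot_word(unsigned long a, unsigned long b,
                                         unsigned long c, unsigned long n,
                                         unsigned long size)
{
        unsigned long val = a & b & ~c;

        for (unsigned long bit = 0; bit < size; bit++) {
                if (!(val & (1UL << bit)))
                        continue;
                if (n-- == 0)
                        return bit;
        }
        return size;    /* fewer than n + 1 such bits */
}

int main(void)
{
        /* a = bits {0,1,3}, b = bits {0,3,4}, exclude c = bit {0}: bit 3 remains */
        printf("%lu\n", nth_and_andnot_word(0x0b, 0x19, 0x01, 0, 64)); /* prints 3 */
        return 0;
}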
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
         return cpumask_of_node(cpu_to_node(cpu));
 }

+#ifdef CONFIG_NUMA
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
+extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
+#else
+static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+        return cpumask_nth(cpu, cpus);
+}
+
+static inline const struct cpumask *
+sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+        return ERR_PTR(-EOPNOTSUPP);
+}
+#endif  /* CONFIG_NUMA */
+
+/**
+ * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
+ *                          from a given node.
+ * @mask: the iteration variable.
+ * @node: the NUMA node to start the search from.
+ *
+ * Requires rcu_lock to be held.
+ *
+ * Yields cpu_online_mask for @node == NUMA_NO_NODE.
+ */
+#define for_each_numa_hop_mask(mask, node)                             \
+        for (unsigned int __hops = 0;                                  \
+             mask = (node != NUMA_NO_NODE || __hops) ?                 \
+                     sched_numa_hop_mask(node, __hops) :               \
+                     cpu_online_mask,                                  \
+             !IS_ERR_OR_NULL(mask);                                    \
+             __hops++)
+
 #endif /* _LINUX_TOPOLOGY_H */
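
Expanded by hand, the iterator behaves roughly like the open-coded loop below (an
illustration of what the macro does, under the same RCU requirement; 'node' is assumed to
be a valid node id or NUMA_NO_NODE). For NUMA_NO_NODE the first iteration yields
cpu_online_mask; otherwise each iteration asks sched_numa_hop_mask() for the next hop
until it returns an error pointer.

        const struct cpumask *mask;
        unsigned int hops;

        rcu_read_lock();
        for (hops = 0; ; hops++) {
                mask = (node != NUMA_NO_NODE || hops) ?
                        sched_numa_hop_mask(node, hops) : cpu_online_mask;
                if (IS_ERR_OR_NULL(mask))
                        break;
                /* use mask: CPUs at most 'hops' hops away from 'node' */
        }
        rcu_read_unlock();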
@@ -3,6 +3,8 @@
  * Scheduler topology setup/handling methods
  */

+#include <linux/bsearch.h>
+
 DEFINE_MUTEX(sched_domains_mutex);

 /* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
         return found;
 }

+struct __cmp_key {
+        const struct cpumask *cpus;
+        struct cpumask ***masks;
+        int node;
+        int cpu;
+        int w;
+};
+
+static int hop_cmp(const void *a, const void *b)
+{
+        struct cpumask **prev_hop = *((struct cpumask ***)b - 1);
+        struct cpumask **cur_hop = *(struct cpumask ***)b;
+        struct __cmp_key *k = (struct __cmp_key *)a;
+
+        if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
+                return 1;
+
+        k->w = (b == k->masks) ? 0 : cpumask_weight_and(k->cpus, prev_hop[k->node]);
+        if (k->w <= k->cpu)
+                return 0;
+
+        return -1;
+}
+
+/*
+ * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu
+ * closest to @cpu from @cpumask.
+ * cpumask: cpumask to find a cpu from
+ * cpu: Nth cpu to find
+ *
+ * returns: cpu, or nr_cpu_ids when nothing found.
+ */
+int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
+{
+        struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
+        struct cpumask ***hop_masks;
+        int hop, ret = nr_cpu_ids;
+
+        rcu_read_lock();
+
+        k.masks = rcu_dereference(sched_domains_numa_masks);
+        if (!k.masks)
+                goto unlock;
+
+        hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
+        hop = hop_masks - k.masks;
+
+        ret = hop ?
+                cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
+                cpumask_nth_and(cpu, cpus, k.masks[0][node]);
+unlock:
+        rcu_read_unlock();
+        return ret;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
+
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
+ *                         @node
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Return: On success, a pointer to a cpumask of CPUs at most @hops away from
+ * @node, an error value otherwise.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in distance; see sched_init_numa() for how
+ * distances and masks are handled.
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
+{
+        struct cpumask ***masks;
+
+        if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+                return ERR_PTR(-EINVAL);
+
+        masks = rcu_dereference(sched_domains_numa_masks);
+        if (!masks)
+                return ERR_PTR(-EBUSY);
+
+        return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
 #endif /* CONFIG_NUMA */

 static int __sdt_alloc(const struct cpumask *cpu_map)
...
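
The hop selection in sched_numa_find_nth_cpu() can be pictured with cumulative weights:
weight[h] is the number of requested CPUs within h hops of the node, the wanted hop is
the first one whose cumulative weight exceeds @cpu, and the index inside that hop is
@cpu minus the previous hop's weight (k->w). The kernel does this with bsearch() and
hop_cmp(); the standalone model below (assumed example weights, linear scan for clarity)
only illustrates the arithmetic.

#include <stdio.h>

/* Model only: weight[h] = CPUs of interest within 'h' hops, cumulative. */
static int pick_hop(const int *weight, int levels, int n, int *idx_in_hop)
{
        for (int hop = 0; hop < levels; hop++) {
                if (weight[hop] > n) {
                        *idx_in_hop = n - (hop ? weight[hop - 1] : 0);
                        return hop;
                }
        }
        return -1;      /* fewer than n + 1 matching CPUs exist */
}

int main(void)
{
        int weight[] = { 4, 12, 16 };   /* 4 local CPUs, 8 one hop away, 4 beyond */
        int idx, hop = pick_hop(weight, 3, 6, &idx);

        printf("hop %d, index %d\n", hop, idx); /* prints "hop 1, index 2" */
        return 0;
}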
@@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
 #endif

 /**
- * cpumask_local_spread - select the i'th cpu with local numa cpu's first
+ * cpumask_local_spread - select the i'th cpu based on NUMA distances
  * @i: index number
  * @node: local numa_node
  *
- * This function selects an online CPU according to a numa aware policy;
- * local cpus are returned first, followed by non-local ones, then it
- * wraps around.
+ * Returns online CPU according to a numa aware policy; local cpus are returned
+ * first, followed by non-local ones, then it wraps around.
  *
- * It's not very efficient, but useful for setup.
+ * For those who wants to enumerate all CPUs based on their NUMA distances,
+ * i.e. call this function in a loop, like:
+ *
+ *      for (i = 0; i < num_online_cpus(); i++) {
+ *              cpu = cpumask_local_spread(i, node);
+ *              do_something(cpu);
+ *      }
+ *
+ * There's a better alternative based on for_each()-like iterators:
+ *
+ *      for_each_numa_hop_mask(mask, node) {
+ *              for_each_cpu_andnot(cpu, mask, prev)
+ *                      do_something(cpu);
+ *              prev = mask;
+ *      }
+ *
+ * It's simpler and more verbose than above. Complexity of iterator-based
+ * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while
+ * cpumask_local_spread() when called for each cpu is
+ * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)).
  */
 unsigned int cpumask_local_spread(unsigned int i, int node)
 {
@@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
         /* Wrap: we always want a cpu. */
         i %= num_online_cpus();

-        if (node == NUMA_NO_NODE) {
-                cpu = cpumask_nth(i, cpu_online_mask);
-                if (cpu < nr_cpu_ids)
-                        return cpu;
-        } else {
-                /* NUMA first. */
-                cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
-                if (cpu < nr_cpu_ids)
-                        return cpu;
-
-                i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
-
-                /* Skip NUMA nodes, done above. */
-                cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
-                if (cpu < nr_cpu_ids)
-                        return cpu;
-        }
-        BUG();
+        cpu = (node == NUMA_NO_NODE) ?
+                cpumask_nth(i, cpu_online_mask) :
+                sched_numa_find_nth_cpu(cpu_online_mask, i, node);
+
+        WARN_ON(cpu >= nr_cpu_ids);
+        return cpu;
 }
 EXPORT_SYMBOL(cpumask_local_spread);
...
@@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l
 }
 EXPORT_SYMBOL(__find_nth_andnot_bit);

+unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
+                                        const unsigned long *addr2,
+                                        const unsigned long *addr3,
+                                        unsigned long size, unsigned long n)
+{
+        return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
+}
+EXPORT_SYMBOL(__find_nth_and_andnot_bit);
+
 #ifndef find_next_and_bit
 unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                  unsigned long nbits, unsigned long start)
...