Commit 4ba4f1af authored by Kan Liang's avatar Kan Liang Committed by Peter Zijlstra

perf: Generic hotplug support for a PMU with a scope

The perf subsystem assumes that the counters of a PMU are per-CPU. So
the user space tool reads a counter from each CPU in the system wide
mode. However, many PMUs don't have a per-CPU counter. The counter is
effective for a scope, e.g., a die or a socket. To address this, a
cpumask is exposed by the kernel driver to restrict to one CPU to stand
for a specific scope. In case the given CPU is removed,
the hotplug support has to be implemented for each such driver.

The codes to support the cpumask and hotplug are very similar.
- Expose a cpumask into sysfs
- Pickup another CPU in the same scope if the given CPU is removed.
- Invoke the perf_pmu_migrate_context() to migrate to a new CPU.
- In event init, always set the CPU in the cpumask to event->cpu

Similar duplicated codes are implemented for each such PMU driver. It
would be good to introduce a generic infrastructure to avoid such
duplication.

5 popular scopes are implemented here, core, die, cluster, pkg, and
the system-wide. The scope can be set when a PMU is registered. If so, a
"cpumask" is automatically exposed for the PMU.

The "cpumask" is from the perf_online_<scope>_mask, which is to track
the active CPU for each scope. They are set when the first CPU of the
scope is online via the generic perf hotplug support. When a
corresponding CPU is removed, the perf_online_<scope>_mask is updated
accordingly and the PMU will be moved to a new CPU from the same scope
if possible.
Signed-off-by: default avatarKan Liang <kan.liang@linux.intel.com>
Signed-off-by: default avatarPeter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20240802151643.1691631-2-kan.liang@linux.intel.com
parent cd7bdd9d
...@@ -295,6 +295,19 @@ struct perf_event_pmu_context; ...@@ -295,6 +295,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
/**
* pmu::scope
*/
enum perf_pmu_scope {
PERF_PMU_SCOPE_NONE = 0,
PERF_PMU_SCOPE_CORE,
PERF_PMU_SCOPE_DIE,
PERF_PMU_SCOPE_CLUSTER,
PERF_PMU_SCOPE_PKG,
PERF_PMU_SCOPE_SYS_WIDE,
PERF_PMU_MAX_SCOPE,
};
struct perf_output_handle; struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL)) #define PMU_NULL_DEV ((void *)(~0UL))
...@@ -318,6 +331,11 @@ struct pmu { ...@@ -318,6 +331,11 @@ struct pmu {
*/ */
int capabilities; int capabilities;
/*
* PMU scope
*/
unsigned int scope;
int __percpu *pmu_disable_count; int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context; struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
......
...@@ -436,6 +436,11 @@ static LIST_HEAD(pmus); ...@@ -436,6 +436,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock); static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu; static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask; static cpumask_var_t perf_online_mask;
static cpumask_var_t perf_online_core_mask;
static cpumask_var_t perf_online_die_mask;
static cpumask_var_t perf_online_cluster_mask;
static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache; static struct kmem_cache *perf_event_cache;
/* /*
...@@ -11578,10 +11583,60 @@ perf_event_mux_interval_ms_store(struct device *dev, ...@@ -11578,10 +11583,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
} }
static DEVICE_ATTR_RW(perf_event_mux_interval_ms); static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
{
switch (scope) {
case PERF_PMU_SCOPE_CORE:
return topology_sibling_cpumask(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_die_cpumask(cpu);
case PERF_PMU_SCOPE_CLUSTER:
return topology_cluster_cpumask(cpu);
case PERF_PMU_SCOPE_PKG:
return topology_core_cpumask(cpu);
case PERF_PMU_SCOPE_SYS_WIDE:
return cpu_online_mask;
}
return NULL;
}
static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
{
switch (scope) {
case PERF_PMU_SCOPE_CORE:
return perf_online_core_mask;
case PERF_PMU_SCOPE_DIE:
return perf_online_die_mask;
case PERF_PMU_SCOPE_CLUSTER:
return perf_online_cluster_mask;
case PERF_PMU_SCOPE_PKG:
return perf_online_pkg_mask;
case PERF_PMU_SCOPE_SYS_WIDE:
return perf_online_sys_mask;
}
return NULL;
}
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct pmu *pmu = dev_get_drvdata(dev);
struct cpumask *mask = perf_scope_cpumask(pmu->scope);
if (mask)
return cpumap_print_to_pagebuf(true, buf, mask);
return 0;
}
static DEVICE_ATTR_RO(cpumask);
static struct attribute *pmu_dev_attrs[] = { static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr, &dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr, &dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr, &dev_attr_nr_addr_filters.attr,
&dev_attr_cpumask.attr,
NULL, NULL,
}; };
...@@ -11593,6 +11648,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int ...@@ -11593,6 +11648,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
if (n == 2 && !pmu->nr_addr_filters) if (n == 2 && !pmu->nr_addr_filters)
return 0; return 0;
/* cpumask */
if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
return 0;
return a->mode; return a->mode;
} }
...@@ -11677,6 +11736,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type) ...@@ -11677,6 +11736,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc; goto free_pdc;
} }
if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
ret = -EINVAL;
goto free_pdc;
}
pmu->name = name; pmu->name = name;
if (type >= 0) if (type >= 0)
...@@ -11831,6 +11895,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) ...@@ -11831,6 +11895,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
event_has_any_exclude_flag(event)) event_has_any_exclude_flag(event))
ret = -EINVAL; ret = -EINVAL;
if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
int cpu;
if (pmu_cpumask && cpumask) {
cpu = cpumask_any_and(pmu_cpumask, cpumask);
if (cpu >= nr_cpu_ids)
ret = -ENODEV;
else
event->cpu = cpu;
} else {
ret = -ENODEV;
}
}
if (ret && event->destroy) if (ret && event->destroy)
event->destroy(event); event->destroy(event);
} }
...@@ -13784,6 +13864,12 @@ static void __init perf_event_init_all_cpus(void) ...@@ -13784,6 +13864,12 @@ static void __init perf_event_init_all_cpus(void)
int cpu; int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL); zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu); swhash = &per_cpu(swevent_htable, cpu);
...@@ -13833,6 +13919,40 @@ static void __perf_event_exit_context(void *__info) ...@@ -13833,6 +13919,40 @@ static void __perf_event_exit_context(void *__info)
raw_spin_unlock(&ctx->lock); raw_spin_unlock(&ctx->lock);
} }
static void perf_event_clear_cpumask(unsigned int cpu)
{
int target[PERF_PMU_MAX_SCOPE];
unsigned int scope;
struct pmu *pmu;
cpumask_clear_cpu(cpu, perf_online_mask);
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
target[scope] = -1;
if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
continue;
if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
continue;
target[scope] = cpumask_any_but(cpumask, cpu);
if (target[scope] < nr_cpu_ids)
cpumask_set_cpu(target[scope], pmu_cpumask);
}
/* migrate */
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
if (pmu->scope == PERF_PMU_SCOPE_NONE ||
WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
continue;
if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
}
}
static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu_context(int cpu)
{ {
struct perf_cpu_context *cpuctx; struct perf_cpu_context *cpuctx;
...@@ -13840,6 +13960,11 @@ static void perf_event_exit_cpu_context(int cpu) ...@@ -13840,6 +13960,11 @@ static void perf_event_exit_cpu_context(int cpu)
// XXX simplify cpuctx->online // XXX simplify cpuctx->online
mutex_lock(&pmus_lock); mutex_lock(&pmus_lock);
/*
* Clear the cpumasks, and migrate to other CPUs if possible.
* Must be invoked before the __perf_event_exit_context.
*/
perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx; ctx = &cpuctx->ctx;
...@@ -13847,7 +13972,6 @@ static void perf_event_exit_cpu_context(int cpu) ...@@ -13847,7 +13972,6 @@ static void perf_event_exit_cpu_context(int cpu)
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0; cpuctx->online = 0;
mutex_unlock(&ctx->mutex); mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock); mutex_unlock(&pmus_lock);
} }
#else #else
...@@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { } ...@@ -13856,6 +13980,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif #endif
static void perf_event_setup_cpumask(unsigned int cpu)
{
struct cpumask *pmu_cpumask;
unsigned int scope;
cpumask_set_cpu(cpu, perf_online_mask);
/*
* Early boot stage, the cpumask hasn't been set yet.
* The perf_online_<domain>_masks includes the first CPU of each domain.
* Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
*/
if (!topology_sibling_cpumask(cpu)) {
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
pmu_cpumask = perf_scope_cpumask(scope);
if (WARN_ON_ONCE(!pmu_cpumask))
continue;
cpumask_set_cpu(cpu, pmu_cpumask);
}
return;
}
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
pmu_cpumask = perf_scope_cpumask(scope);
if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
continue;
if (!cpumask_empty(cpumask) &&
cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
cpumask_set_cpu(cpu, pmu_cpumask);
}
}
int perf_event_init_cpu(unsigned int cpu) int perf_event_init_cpu(unsigned int cpu)
{ {
struct perf_cpu_context *cpuctx; struct perf_cpu_context *cpuctx;
...@@ -13864,7 +14024,7 @@ int perf_event_init_cpu(unsigned int cpu) ...@@ -13864,7 +14024,7 @@ int perf_event_init_cpu(unsigned int cpu)
perf_swevent_init_cpu(cpu); perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock); mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask); perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu); cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx; ctx = &cpuctx->ctx;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment