Commit 9f0c253d authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Implement per-PMU context rescheduling to significantly improve
   single-PMU performance, and related cleanups/fixes (Peter Zijlstra
   and Namhyung Kim)

 - Fix ancient bug resulting in a lot of events being dropped
   erroneously at higher sampling frequencies (Luo Gengkun)

 - uprobes enhancements:

     - Implement RCU-protected hot path optimizations for better
       performance:

         "For baseline vs SRCU, peak througput increased from 3.7 M/s
          (million uprobe triggerings per second) up to about 8 M/s. For
          uretprobes it's a bit more modest with bump from 2.4 M/s to
          5 M/s.

          For SRCU vs RCU Tasks Trace, peak throughput for uprobes
          increases further from 8 M/s to 10.3 M/s (+28%!), and for
          uretprobes from 5.3 M/s to 5.8 M/s (+11%), as we have more
          work to do on uretprobes side.

          Even single-thread (no contention) performance is slightly
          better: 3.276 M/s to 3.396 M/s (+3.5%) for uprobes, and 2.055
          M/s to 2.174 M/s (+5.8%) for uretprobes."

          (Andrii Nakryiko et al)

     - Document mmap_lock, don't abuse get_user_pages_remote() (Oleg
       Nesterov)

     - Cleanups & fixes to prepare for future work:
        - Remove uprobe_register_refctr()
	- Simplify error handling for alloc_uprobe()
        - Make uprobe_register() return struct uprobe *
        - Fold __uprobe_unregister() into uprobe_unregister()
        - Shift put_uprobe() from delete_uprobe() to uprobe_unregister()
        - BPF: Fix use-after-free in bpf_uprobe_multi_link_attach()
          (Oleg Nesterov)

 - New feature & ABI extension: allow events to use PERF_SAMPLE READ
   with inheritance, enabling sample based profiling of a group of
   counters over a hierarchy of processes or threads (Ben Gainey)

 - Intel uncore & power events updates:

      - Add Arrow Lake and Lunar Lake support
      - Add PERF_EV_CAP_READ_SCOPE
      - Clean up and enhance cpumask and hotplug support
        (Kan Liang)

      - Add LNL uncore iMC freerunning support
      - Use D0:F0 as a default device
        (Zhenyu Wang)

 - Intel PT: fix AUX snapshot handling race (Adrian Hunter)

 - Misc fixes and cleanups (James Clark, Jiri Olsa, Oleg Nesterov and
   Peter Zijlstra)

* tag 'perf-core-2024-09-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (40 commits)
  dmaengine: idxd: Clean up cpumask and hotplug for perfmon
  iommu/vt-d: Clean up cpumask and hotplug for perfmon
  perf/x86/intel/cstate: Clean up cpumask and hotplug
  perf: Add PERF_EV_CAP_READ_SCOPE
  perf: Generic hotplug support for a PMU with a scope
  uprobes: perform lockless SRCU-protected uprobes_tree lookup
  rbtree: provide rb_find_rcu() / rb_find_add_rcu()
  perf/uprobe: split uprobe_unregister()
  uprobes: travers uprobe's consumer list locklessly under SRCU protection
  uprobes: get rid of enum uprobe_filter_ctx in uprobe filter callbacks
  uprobes: protected uprobe lifetime with SRCU
  uprobes: revamp uprobe refcounting and lifetime management
  bpf: Fix use-after-free in bpf_uprobe_multi_link_attach()
  perf/core: Fix small negative period being ignored
  perf: Really fix event_function_call() locking
  perf: Optimize __pmu_ctx_sched_out()
  perf: Add context time freeze
  perf: Fix event_function_call() locking
  perf: Extract a few helpers
  perf: Optimize context reschedule for single PMU cases
  ...
parents 941c122d 5e645f31
......@@ -41,6 +41,8 @@
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>
#include <asm/uprobes.h>
#include <asm/ibt.h>
#include "perf_event.h"
......@@ -2816,6 +2818,46 @@ static unsigned long get_segment_base(unsigned int segment)
return get_desc_base(desc);
}
#ifdef CONFIG_UPROBES
/*
* Heuristic-based check if uprobe is installed at the function entry.
*
* Under assumption of user code being compiled with frame pointers,
* `push %rbp/%ebp` is a good indicator that we indeed are.
*
* Similarly, `endbr64` (assuming 64-bit mode) is also a common pattern.
* If we get this wrong, captured stack trace might have one extra bogus
* entry, but the rest of stack trace will still be meaningful.
*/
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
struct arch_uprobe *auprobe;
if (!current->utask)
return false;
auprobe = current->utask->auprobe;
if (!auprobe)
return false;
/* push %rbp/%ebp */
if (auprobe->insn[0] == 0x55)
return true;
/* endbr64 (64-bit only) */
if (user_64bit_mode(regs) && is_endbr(*(u32 *)auprobe->insn))
return true;
return false;
}
#else
static bool is_uprobe_at_func_entry(struct pt_regs *regs)
{
return false;
}
#endif /* CONFIG_UPROBES */
#ifdef CONFIG_IA32_EMULATION
#include <linux/compat.h>
......@@ -2827,6 +2869,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
unsigned long ss_base, cs_base;
struct stack_frame_ia32 frame;
const struct stack_frame_ia32 __user *fp;
u32 ret_addr;
if (user_64bit_mode(regs))
return 0;
......@@ -2836,6 +2879,12 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry_ctx *ent
fp = compat_ptr(ss_base + regs->bp);
pagefault_disable();
/* see perf_callchain_user() below for why we do this */
if (is_uprobe_at_func_entry(regs) &&
!get_user(ret_addr, (const u32 __user *)regs->sp))
perf_callchain_store(entry, ret_addr);
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
......@@ -2864,6 +2913,7 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
{
struct stack_frame frame;
const struct stack_frame __user *fp;
unsigned long ret_addr;
if (perf_guest_state()) {
/* TODO: We don't support guest os callchain now */
......@@ -2887,6 +2937,19 @@ perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs
return;
pagefault_disable();
/*
* If we are called from uprobe handler, and we are indeed at the very
* entry to user function (which is normally a `push %rbp` instruction,
* under assumption of application being compiled with frame pointers),
* we should read return address from *regs->sp before proceeding
* to follow frame pointers, otherwise we'll skip immediate caller
* as %rbp is not yet setup.
*/
if (is_uprobe_at_func_entry(regs) &&
!get_user(ret_addr, (const unsigned long __user *)regs->sp))
perf_callchain_store(entry, ret_addr);
while (entry->nr < entry->max_stack) {
if (!valid_user_frame(fp, sizeof(frame)))
break;
......
......@@ -557,9 +557,6 @@ static int bts_event_init(struct perf_event *event)
* disabled, so disallow intel_bts driver for unprivileged
* users on paranoid systems since it provides trace data
* to the user in a zero-copy fashion.
*
* Note that the default paranoia setting permits unprivileged
* users to profile the kernel.
*/
if (event->attr.exclude_kernel) {
ret = perf_allow_kernel(&event->attr);
......
......@@ -128,10 +128,6 @@ static ssize_t __cstate_##_var##_show(struct device *dev, \
static struct device_attribute format_attr_##_var = \
__ATTR(_name, 0444, __cstate_##_var##_show, NULL)
static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf);
/* Model -> events mapping */
struct cstate_model {
unsigned long core_events;
......@@ -206,22 +202,9 @@ static struct attribute_group cstate_format_attr_group = {
.attrs = cstate_format_attrs,
};
static cpumask_t cstate_core_cpu_mask;
static DEVICE_ATTR(cpumask, S_IRUGO, cstate_get_attr_cpumask, NULL);
static struct attribute *cstate_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group cpumask_attr_group = {
.attrs = cstate_cpumask_attrs,
};
static const struct attribute_group *cstate_attr_groups[] = {
&cstate_events_attr_group,
&cstate_format_attr_group,
&cpumask_attr_group,
NULL,
};
......@@ -269,8 +252,6 @@ static struct perf_msr pkg_msr[] = {
[PERF_CSTATE_PKG_C10_RES] = { MSR_PKG_C10_RESIDENCY, &group_cstate_pkg_c10, test_msr },
};
static cpumask_t cstate_pkg_cpu_mask;
/* cstate_module PMU */
static struct pmu cstate_module_pmu;
static bool has_cstate_module;
......@@ -291,28 +272,9 @@ static struct perf_msr module_msr[] = {
[PERF_CSTATE_MODULE_C6_RES] = { MSR_MODULE_C6_RES_MS, &group_cstate_module_c6, test_msr },
};
static cpumask_t cstate_module_cpu_mask;
static ssize_t cstate_get_attr_cpumask(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct pmu *pmu = dev_get_drvdata(dev);
if (pmu == &cstate_core_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_core_cpu_mask);
else if (pmu == &cstate_pkg_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_pkg_cpu_mask);
else if (pmu == &cstate_module_pmu)
return cpumap_print_to_pagebuf(true, buf, &cstate_module_cpu_mask);
else
return 0;
}
static int cstate_pmu_event_init(struct perf_event *event)
{
u64 cfg = event->attr.config;
int cpu;
if (event->attr.type != event->pmu->type)
return -ENOENT;
......@@ -331,20 +293,13 @@ static int cstate_pmu_event_init(struct perf_event *event)
if (!(core_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = core_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_core_cpu_mask,
topology_sibling_cpumask(event->cpu));
} else if (event->pmu == &cstate_pkg_pmu) {
if (cfg >= PERF_CSTATE_PKG_EVENT_MAX)
return -EINVAL;
cfg = array_index_nospec((unsigned long)cfg, PERF_CSTATE_PKG_EVENT_MAX);
if (!(pkg_msr_mask & (1 << cfg)))
return -EINVAL;
event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;
event->hw.event_base = pkg_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(event->cpu));
} else if (event->pmu == &cstate_module_pmu) {
if (cfg >= PERF_CSTATE_MODULE_EVENT_MAX)
return -EINVAL;
......@@ -352,16 +307,10 @@ static int cstate_pmu_event_init(struct perf_event *event)
if (!(module_msr_mask & (1 << cfg)))
return -EINVAL;
event->hw.event_base = module_msr[cfg].msr;
cpu = cpumask_any_and(&cstate_module_cpu_mask,
topology_cluster_cpumask(event->cpu));
} else {
return -ENOENT;
}
if (cpu >= nr_cpu_ids)
return -ENODEV;
event->cpu = cpu;
event->hw.config = cfg;
event->hw.idx = -1;
return 0;
......@@ -412,84 +361,6 @@ static int cstate_pmu_event_add(struct perf_event *event, int mode)
return 0;
}
/*
* Check if exiting cpu is the designated reader. If so migrate the
* events when there is a valid target available
*/
static int cstate_cpu_exit(unsigned int cpu)
{
unsigned int target;
if (has_cstate_core &&
cpumask_test_and_clear_cpu(cpu, &cstate_core_cpu_mask)) {
target = cpumask_any_but(topology_sibling_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_core_cpu_mask);
perf_pmu_migrate_context(&cstate_core_pmu, cpu, target);
}
}
if (has_cstate_pkg &&
cpumask_test_and_clear_cpu(cpu, &cstate_pkg_cpu_mask)) {
target = cpumask_any_but(topology_die_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_pkg_cpu_mask);
perf_pmu_migrate_context(&cstate_pkg_pmu, cpu, target);
}
}
if (has_cstate_module &&
cpumask_test_and_clear_cpu(cpu, &cstate_module_cpu_mask)) {
target = cpumask_any_but(topology_cluster_cpumask(cpu), cpu);
/* Migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &cstate_module_cpu_mask);
perf_pmu_migrate_context(&cstate_module_pmu, cpu, target);
}
}
return 0;
}
static int cstate_cpu_init(unsigned int cpu)
{
unsigned int target;
/*
* If this is the first online thread of that core, set it in
* the core cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_core_cpu_mask,
topology_sibling_cpumask(cpu));
if (has_cstate_core && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_core_cpu_mask);
/*
* If this is the first online thread of that package, set it
* in the package cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_pkg_cpu_mask,
topology_die_cpumask(cpu));
if (has_cstate_pkg && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_pkg_cpu_mask);
/*
* If this is the first online thread of that cluster, set it
* in the cluster cpu mask as the designated reader.
*/
target = cpumask_any_and(&cstate_module_cpu_mask,
topology_cluster_cpumask(cpu));
if (has_cstate_module && target >= nr_cpu_ids)
cpumask_set_cpu(cpu, &cstate_module_cpu_mask);
return 0;
}
static const struct attribute_group *core_attr_update[] = {
&group_cstate_core_c1,
&group_cstate_core_c3,
......@@ -526,6 +397,7 @@ static struct pmu cstate_core_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
.scope = PERF_PMU_SCOPE_CORE,
.module = THIS_MODULE,
};
......@@ -541,6 +413,7 @@ static struct pmu cstate_pkg_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
.scope = PERF_PMU_SCOPE_PKG,
.module = THIS_MODULE,
};
......@@ -556,6 +429,7 @@ static struct pmu cstate_module_pmu = {
.stop = cstate_pmu_event_stop,
.read = cstate_pmu_event_update,
.capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE,
.scope = PERF_PMU_SCOPE_CLUSTER,
.module = THIS_MODULE,
};
......@@ -810,9 +684,6 @@ static int __init cstate_probe(const struct cstate_model *cm)
static inline void cstate_cleanup(void)
{
cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_ONLINE);
cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_CSTATE_STARTING);
if (has_cstate_core)
perf_pmu_unregister(&cstate_core_pmu);
......@@ -827,11 +698,6 @@ static int __init cstate_init(void)
{
int err;
cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_STARTING,
"perf/x86/cstate:starting", cstate_cpu_init, NULL);
cpuhp_setup_state(CPUHP_AP_PERF_X86_CSTATE_ONLINE,
"perf/x86/cstate:online", NULL, cstate_cpu_exit);
if (has_cstate_core) {
err = perf_pmu_register(&cstate_core_pmu, cstate_core_pmu.name, -1);
if (err) {
......@@ -844,6 +710,8 @@ static int __init cstate_init(void)
if (has_cstate_pkg) {
if (topology_max_dies_per_package() > 1) {
/* CLX-AP is multi-die and the cstate is die-scope */
cstate_pkg_pmu.scope = PERF_PMU_SCOPE_DIE;
err = perf_pmu_register(&cstate_pkg_pmu,
"cstate_die", -1);
} else {
......
......@@ -416,7 +416,7 @@ static bool pt_event_valid(struct perf_event *event)
static void pt_config_start(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 ctl = event->hw.config;
u64 ctl = event->hw.aux_config;
ctl |= RTIT_CTL_TRACEEN;
if (READ_ONCE(pt->vmx_on))
......@@ -424,7 +424,7 @@ static void pt_config_start(struct perf_event *event)
else
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
WRITE_ONCE(event->hw.config, ctl);
WRITE_ONCE(event->hw.aux_config, ctl);
}
/* Address ranges and their corresponding msr configuration registers */
......@@ -503,7 +503,7 @@ static void pt_config(struct perf_event *event)
u64 reg;
/* First round: clear STATUS, in particular the PSB byte counter. */
if (!event->hw.config) {
if (!event->hw.aux_config) {
perf_event_itrace_started(event);
wrmsrl(MSR_IA32_RTIT_STATUS, 0);
}
......@@ -533,14 +533,14 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);
event->hw.config = reg;
event->hw.aux_config = reg;
pt_config_start(event);
}
static void pt_config_stop(struct perf_event *event)
{
struct pt *pt = this_cpu_ptr(&pt_ctx);
u64 ctl = READ_ONCE(event->hw.config);
u64 ctl = READ_ONCE(event->hw.aux_config);
/* may be already stopped by a PMI */
if (!(ctl & RTIT_CTL_TRACEEN))
......@@ -550,7 +550,7 @@ static void pt_config_stop(struct perf_event *event)
if (!READ_ONCE(pt->vmx_on))
wrmsrl(MSR_IA32_RTIT_CTL, ctl);
WRITE_ONCE(event->hw.config, ctl);
WRITE_ONCE(event->hw.aux_config, ctl);
/*
* A wrmsr that disables trace generation serializes other PT
......@@ -1557,7 +1557,7 @@ void intel_pt_handle_vmx(int on)
/* Turn PTs back on */
if (!on && event)
wrmsrl(MSR_IA32_RTIT_CTL, event->hw.config);
wrmsrl(MSR_IA32_RTIT_CTL, event->hw.aux_config);
local_irq_restore(flags);
}
......@@ -1606,6 +1606,7 @@ static void pt_event_stop(struct perf_event *event, int mode)
* see comment in intel_pt_interrupt().
*/
WRITE_ONCE(pt->handle_nmi, 0);
barrier();
pt_config_stop(event);
......@@ -1657,11 +1658,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
return 0;
/*
* Here, handle_nmi tells us if the tracing is on
* There is no PT interrupt in this mode, so stop the trace and it will
* remain stopped while the buffer is copied.
*/
if (READ_ONCE(pt->handle_nmi))
pt_config_stop(event);
pt_config_stop(event);
pt_read_offset(buf);
pt_update_head(pt);
......@@ -1673,11 +1673,10 @@ static long pt_event_snapshot_aux(struct perf_event *event,
ret = perf_output_copy_aux(&pt->handle, handle, from, to);
/*
* If the tracing was on when we turned up, restart it.
* Compiler barrier not needed as we couldn't have been
* preempted by anything that touches pt->handle_nmi.
* Here, handle_nmi tells us if the tracing was on.
* If the tracing was on, restart it.
*/
if (pt->handle_nmi)
if (READ_ONCE(pt->handle_nmi))
pt_config_start(event);
return ret;
......
......@@ -1816,6 +1816,11 @@ static const struct intel_uncore_init_fun mtl_uncore_init __initconst = {
.mmio_init = adl_uncore_mmio_init,
};
static const struct intel_uncore_init_fun lnl_uncore_init __initconst = {
.cpu_init = lnl_uncore_cpu_init,
.mmio_init = lnl_uncore_mmio_init,
};
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
......@@ -1893,6 +1898,10 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
X86_MATCH_VFM(INTEL_RAPTORLAKE_S, &adl_uncore_init),
X86_MATCH_VFM(INTEL_METEORLAKE, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_METEORLAKE_L, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE_U, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_ARROWLAKE_H, &mtl_uncore_init),
X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_uncore_init),
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_uncore_init),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_uncore_init),
......
......@@ -611,10 +611,12 @@ void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
void adl_uncore_cpu_init(void);
void lnl_uncore_cpu_init(void);
void mtl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
void adl_uncore_mmio_init(void);
void lnl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
......
......@@ -252,6 +252,7 @@ DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
DEFINE_UNCORE_FORMAT_ATTR(threshold2, threshold, "config:24-31");
/* Sandy Bridge uncore support */
static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event)
......@@ -746,6 +747,34 @@ void mtl_uncore_cpu_init(void)
uncore_msr_uncores = mtl_msr_uncores;
}
static struct intel_uncore_type *lnl_msr_uncores[] = {
&mtl_uncore_cbox,
&mtl_uncore_arb,
NULL
};
#define LNL_UNC_MSR_GLOBAL_CTL 0x240e
static void lnl_uncore_msr_init_box(struct intel_uncore_box *box)
{
if (box->pmu->pmu_idx == 0)
wrmsrl(LNL_UNC_MSR_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
}
static struct intel_uncore_ops lnl_uncore_msr_ops = {
.init_box = lnl_uncore_msr_init_box,
.disable_event = snb_uncore_msr_disable_event,
.enable_event = snb_uncore_msr_enable_event,
.read_counter = uncore_msr_read_counter,
};
void lnl_uncore_cpu_init(void)
{
mtl_uncore_cbox.num_boxes = 4;
mtl_uncore_cbox.ops = &lnl_uncore_msr_ops;
uncore_msr_uncores = lnl_msr_uncores;
}
enum {
SNB_PCI_UNCORE_IMC,
};
......@@ -1475,39 +1504,45 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
ids++;
}
/* Just try to grab 00:00.0 device */
if (!mc_dev)
mc_dev = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
return mc_dev;
}
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
static void __uncore_imc_init_box(struct intel_uncore_box *box,
unsigned int base_offset)
static void
uncore_get_box_mmio_addr(struct intel_uncore_box *box,
unsigned int base_offset,
int bar_offset, int step)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
struct intel_uncore_type *type = pmu->type;
resource_size_t addr;
u32 mch_bar;
u32 bar;
if (!pdev) {
pr_warn("perf uncore: Cannot find matched IMC device.\n");
return;
}
pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET, &mch_bar);
/* MCHBAR is disabled */
if (!(mch_bar & BIT(0))) {
pr_warn("perf uncore: MCHBAR is disabled. Failed to map IMC free-running counters.\n");
pci_read_config_dword(pdev, bar_offset, &bar);
if (!(bar & BIT(0))) {
pr_warn("perf uncore: BAR 0x%x is disabled. Failed to map %s counters.\n",
bar_offset, type->name);
pci_dev_put(pdev);
return;
}
mch_bar &= ~BIT(0);
addr = (resource_size_t)(mch_bar + TGL_UNCORE_MMIO_IMC_MEM_OFFSET * pmu->pmu_idx);
bar &= ~BIT(0);
addr = (resource_size_t)(bar + step * pmu->pmu_idx);
#ifdef CONFIG_PHYS_ADDR_T_64BIT
pci_read_config_dword(pdev, SNB_UNCORE_PCI_IMC_BAR_OFFSET + 4, &mch_bar);
addr |= ((resource_size_t)mch_bar << 32);
pci_read_config_dword(pdev, bar_offset + 4, &bar);
addr |= ((resource_size_t)bar << 32);
#endif
addr += base_offset;
......@@ -1518,6 +1553,14 @@ static void __uncore_imc_init_box(struct intel_uncore_box *box,
pci_dev_put(pdev);
}
static void __uncore_imc_init_box(struct intel_uncore_box *box,
unsigned int base_offset)
{
uncore_get_box_mmio_addr(box, base_offset,
SNB_UNCORE_PCI_IMC_BAR_OFFSET,
TGL_UNCORE_MMIO_IMC_MEM_OFFSET);
}
static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
{
__uncore_imc_init_box(box, 0);
......@@ -1612,14 +1655,17 @@ static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
writel(0, box->io_addr + uncore_mmio_box_ctl(box));
}
#define MMIO_UNCORE_COMMON_OPS() \
.exit_box = uncore_mmio_exit_box, \
.disable_box = adl_uncore_mmio_disable_box, \
.enable_box = adl_uncore_mmio_enable_box, \
.disable_event = intel_generic_uncore_mmio_disable_event, \
.enable_event = intel_generic_uncore_mmio_enable_event, \
.read_counter = uncore_mmio_read_counter,
static struct intel_uncore_ops adl_uncore_mmio_ops = {
.init_box = adl_uncore_imc_init_box,
.exit_box = uncore_mmio_exit_box,
.disable_box = adl_uncore_mmio_disable_box,
.enable_box = adl_uncore_mmio_enable_box,
.disable_event = intel_generic_uncore_mmio_disable_event,
.enable_event = intel_generic_uncore_mmio_enable_event,
.read_counter = uncore_mmio_read_counter,
MMIO_UNCORE_COMMON_OPS()
};
#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
......@@ -1703,3 +1749,108 @@ void adl_uncore_mmio_init(void)
}
/* end of Alder Lake MMIO uncore support */
/* Lunar Lake MMIO uncore support */
#define LNL_UNCORE_PCI_SAFBAR_OFFSET 0x68
#define LNL_UNCORE_MAP_SIZE 0x1000
#define LNL_UNCORE_SNCU_BASE 0xE4B000
#define LNL_UNCORE_SNCU_CTR 0x390
#define LNL_UNCORE_SNCU_CTRL 0x398
#define LNL_UNCORE_SNCU_BOX_CTL 0x380
#define LNL_UNCORE_GLOBAL_CTL 0x700
#define LNL_UNCORE_HBO_BASE 0xE54000
#define LNL_UNCORE_HBO_OFFSET -4096
#define LNL_UNCORE_HBO_CTR 0x570
#define LNL_UNCORE_HBO_CTRL 0x550
#define LNL_UNCORE_HBO_BOX_CTL 0x548
#define LNL_UNC_CTL_THRESHOLD 0xff000000
#define LNL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
SNB_UNC_CTL_UMASK_MASK | \
SNB_UNC_CTL_EDGE_DET | \
SNB_UNC_CTL_INVERT | \
LNL_UNC_CTL_THRESHOLD)
static struct attribute *lnl_uncore_formats_attr[] = {
&format_attr_event.attr,
&format_attr_umask.attr,
&format_attr_edge.attr,
&format_attr_inv.attr,
&format_attr_threshold2.attr,
NULL
};
static const struct attribute_group lnl_uncore_format_group = {
.name = "format",
.attrs = lnl_uncore_formats_attr,
};
static void lnl_uncore_hbo_init_box(struct intel_uncore_box *box)
{
uncore_get_box_mmio_addr(box, LNL_UNCORE_HBO_BASE,
LNL_UNCORE_PCI_SAFBAR_OFFSET,
LNL_UNCORE_HBO_OFFSET);
}
static struct intel_uncore_ops lnl_uncore_hbo_ops = {
.init_box = lnl_uncore_hbo_init_box,
MMIO_UNCORE_COMMON_OPS()
};
static struct intel_uncore_type lnl_uncore_hbo = {
.name = "hbo",
.num_counters = 4,
.num_boxes = 2,
.perf_ctr_bits = 64,
.perf_ctr = LNL_UNCORE_HBO_CTR,
.event_ctl = LNL_UNCORE_HBO_CTRL,
.event_mask = LNL_UNC_RAW_EVENT_MASK,
.box_ctl = LNL_UNCORE_HBO_BOX_CTL,
.mmio_map_size = LNL_UNCORE_MAP_SIZE,
.ops = &lnl_uncore_hbo_ops,
.format_group = &lnl_uncore_format_group,
};
static void lnl_uncore_sncu_init_box(struct intel_uncore_box *box)
{
uncore_get_box_mmio_addr(box, LNL_UNCORE_SNCU_BASE,
LNL_UNCORE_PCI_SAFBAR_OFFSET,
0);
if (box->io_addr)
writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + LNL_UNCORE_GLOBAL_CTL);
}
static struct intel_uncore_ops lnl_uncore_sncu_ops = {
.init_box = lnl_uncore_sncu_init_box,
MMIO_UNCORE_COMMON_OPS()
};
static struct intel_uncore_type lnl_uncore_sncu = {
.name = "sncu",
.num_counters = 2,
.num_boxes = 1,
.perf_ctr_bits = 64,
.perf_ctr = LNL_UNCORE_SNCU_CTR,
.event_ctl = LNL_UNCORE_SNCU_CTRL,
.event_mask = LNL_UNC_RAW_EVENT_MASK,
.box_ctl = LNL_UNCORE_SNCU_BOX_CTL,
.mmio_map_size = LNL_UNCORE_MAP_SIZE,
.ops = &lnl_uncore_sncu_ops,
.format_group = &lnl_uncore_format_group,
};
static struct intel_uncore_type *lnl_mmio_uncores[] = {
&adl_uncore_imc,
&lnl_uncore_hbo,
&lnl_uncore_sncu,
&adl_uncore_imc_free_running,
NULL
};
void lnl_uncore_mmio_init(void)
{
uncore_mmio_uncores = lnl_mmio_uncores;
}
/* end of Lunar Lake MMIO uncore support */
......@@ -124,7 +124,6 @@ struct idxd_pmu {
struct pmu pmu;
char name[IDXD_NAME_SIZE];
int cpu;
int n_counters;
int counter_width;
......@@ -135,8 +134,6 @@ struct idxd_pmu {
unsigned long supported_filters;
int n_filters;
struct hlist_node cpuhp_node;
};
#define IDXD_MAX_PRIORITY 0xf
......@@ -803,14 +800,10 @@ void idxd_user_counter_increment(struct idxd_wq *wq, u32 pasid, int index);
int perfmon_pmu_init(struct idxd_device *idxd);
void perfmon_pmu_remove(struct idxd_device *idxd);
void perfmon_counter_overflow(struct idxd_device *idxd);
void perfmon_init(void);
void perfmon_exit(void);
#else
static inline int perfmon_pmu_init(struct idxd_device *idxd) { return 0; }
static inline void perfmon_pmu_remove(struct idxd_device *idxd) {}
static inline void perfmon_counter_overflow(struct idxd_device *idxd) {}
static inline void perfmon_init(void) {}
static inline void perfmon_exit(void) {}
#endif
/* debugfs */
......
......@@ -878,8 +878,6 @@ static int __init idxd_init_module(void)
else
support_enqcmd = true;
perfmon_init();
err = idxd_driver_register(&idxd_drv);
if (err < 0)
goto err_idxd_driver_register;
......@@ -928,7 +926,6 @@ static void __exit idxd_exit_module(void)
idxd_driver_unregister(&idxd_drv);
pci_unregister_driver(&idxd_pci_driver);
idxd_cdev_remove();
perfmon_exit();
idxd_remove_debugfs();
}
module_exit(idxd_exit_module);
......@@ -6,29 +6,6 @@
#include "idxd.h"
#include "perfmon.h"
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
char *buf);
static cpumask_t perfmon_dsa_cpu_mask;
static bool cpuhp_set_up;
static enum cpuhp_state cpuhp_slot;
/*
* perf userspace reads this attribute to determine which cpus to open
* counters on. It's connected to perfmon_dsa_cpu_mask, which is
* maintained by the cpu hotplug handlers.
*/
static DEVICE_ATTR_RO(cpumask);
static struct attribute *perfmon_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL,
};
static struct attribute_group cpumask_attr_group = {
.attrs = perfmon_cpumask_attrs,
};
/*
* These attributes specify the bits in the config word that the perf
* syscall uses to pass the event ids and categories to perfmon.
......@@ -67,16 +44,9 @@ static struct attribute_group perfmon_format_attr_group = {
static const struct attribute_group *perfmon_attr_groups[] = {
&perfmon_format_attr_group,
&cpumask_attr_group,
NULL,
};
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
return cpumap_print_to_pagebuf(true, buf, &perfmon_dsa_cpu_mask);
}
static bool is_idxd_event(struct idxd_pmu *idxd_pmu, struct perf_event *event)
{
return &idxd_pmu->pmu == event->pmu;
......@@ -217,7 +187,6 @@ static int perfmon_pmu_event_init(struct perf_event *event)
return -EINVAL;
event->hw.event_base = ioread64(PERFMON_TABLE_OFFSET(idxd));
event->cpu = idxd->idxd_pmu->cpu;
event->hw.config = event->attr.config;
if (event->group_leader != event)
......@@ -488,6 +457,7 @@ static void idxd_pmu_init(struct idxd_pmu *idxd_pmu)
idxd_pmu->pmu.stop = perfmon_pmu_event_stop;
idxd_pmu->pmu.read = perfmon_pmu_event_update;
idxd_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
idxd_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
idxd_pmu->pmu.module = THIS_MODULE;
}
......@@ -496,59 +466,17 @@ void perfmon_pmu_remove(struct idxd_device *idxd)
if (!idxd->idxd_pmu)
return;
cpuhp_state_remove_instance(cpuhp_slot, &idxd->idxd_pmu->cpuhp_node);
perf_pmu_unregister(&idxd->idxd_pmu->pmu);
kfree(idxd->idxd_pmu);
idxd->idxd_pmu = NULL;
}
static int perf_event_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct idxd_pmu *idxd_pmu;
idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
/* select the first online CPU as the designated reader */
if (cpumask_empty(&perfmon_dsa_cpu_mask)) {
cpumask_set_cpu(cpu, &perfmon_dsa_cpu_mask);
idxd_pmu->cpu = cpu;
}
return 0;
}
static int perf_event_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
struct idxd_pmu *idxd_pmu;
unsigned int target;
idxd_pmu = hlist_entry_safe(node, typeof(*idxd_pmu), cpuhp_node);
if (!cpumask_test_and_clear_cpu(cpu, &perfmon_dsa_cpu_mask))
return 0;
target = cpumask_any_but(cpu_online_mask, cpu);
/* migrate events if there is a valid target */
if (target < nr_cpu_ids) {
cpumask_set_cpu(target, &perfmon_dsa_cpu_mask);
perf_pmu_migrate_context(&idxd_pmu->pmu, cpu, target);
}
return 0;
}
int perfmon_pmu_init(struct idxd_device *idxd)
{
union idxd_perfcap perfcap;
struct idxd_pmu *idxd_pmu;
int rc = -ENODEV;
/*
* perfmon module initialization failed, nothing to do
*/
if (!cpuhp_set_up)
return -ENODEV;
/*
* If perfmon_offset or num_counters is 0, it means perfmon is
* not supported on this hardware.
......@@ -624,11 +552,6 @@ int perfmon_pmu_init(struct idxd_device *idxd)
if (rc)
goto free;
rc = cpuhp_state_add_instance(cpuhp_slot, &idxd_pmu->cpuhp_node);
if (rc) {
perf_pmu_unregister(&idxd->idxd_pmu->pmu);
goto free;
}
out:
return rc;
free:
......@@ -637,22 +560,3 @@ int perfmon_pmu_init(struct idxd_device *idxd)
goto out;
}
void __init perfmon_init(void)
{
int rc = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
"driver/dma/idxd/perf:online",
perf_event_cpu_online,
perf_event_cpu_offline);
if (WARN_ON(rc < 0))
return;
cpuhp_slot = rc;
cpuhp_set_up = true;
}
void __exit perfmon_exit(void)
{
if (cpuhp_set_up)
cpuhp_remove_multi_state(cpuhp_slot);
}
......@@ -700,8 +700,6 @@ struct iommu_pmu {
DECLARE_BITMAP(used_mask, IOMMU_PMU_IDX_MAX);
struct perf_event *event_list[IOMMU_PMU_IDX_MAX];
unsigned char irq_name[16];
struct hlist_node cpuhp_node;
int cpu;
};
#define IOMMU_IRQ_ID_OFFSET_PRQ (DMAR_UNITS_SUPPORTED)
......
......@@ -34,28 +34,9 @@ static struct attribute_group iommu_pmu_events_attr_group = {
.attrs = attrs_empty,
};
static cpumask_t iommu_pmu_cpu_mask;
static ssize_t
cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
{
return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
}
static DEVICE_ATTR_RO(cpumask);
static struct attribute *iommu_pmu_cpumask_attrs[] = {
&dev_attr_cpumask.attr,
NULL
};
static struct attribute_group iommu_pmu_cpumask_attr_group = {
.attrs = iommu_pmu_cpumask_attrs,
};
static const struct attribute_group *iommu_pmu_attr_groups[] = {
&iommu_pmu_format_attr_group,
&iommu_pmu_events_attr_group,
&iommu_pmu_cpumask_attr_group,
NULL
};
......@@ -565,6 +546,7 @@ static int __iommu_pmu_register(struct intel_iommu *iommu)
iommu_pmu->pmu.attr_groups = iommu_pmu_attr_groups;
iommu_pmu->pmu.attr_update = iommu_pmu_attr_update;
iommu_pmu->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
iommu_pmu->pmu.scope = PERF_PMU_SCOPE_SYS_WIDE;
iommu_pmu->pmu.module = THIS_MODULE;
return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
......@@ -773,89 +755,6 @@ static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
iommu->perf_irq = 0;
}
static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
{
struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
if (cpumask_empty(&iommu_pmu_cpu_mask))
cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);
if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
iommu_pmu->cpu = cpu;
return 0;
}
static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
int target = cpumask_first(&iommu_pmu_cpu_mask);
/*
* The iommu_pmu_cpu_mask has been updated when offline the CPU
* for the first iommu_pmu. Migrate the other iommu_pmu to the
* new target.
*/
if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
iommu_pmu->cpu = target;
return 0;
}
if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
return 0;
target = cpumask_any_but(cpu_online_mask, cpu);
if (target < nr_cpu_ids)
cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
else
return 0;
perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
iommu_pmu->cpu = target;
return 0;
}
static int nr_iommu_pmu;
static enum cpuhp_state iommu_cpuhp_slot;
static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
{
int ret;
if (!nr_iommu_pmu) {
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
"driver/iommu/intel/perfmon:online",
iommu_pmu_cpu_online,
iommu_pmu_cpu_offline);
if (ret < 0)
return ret;
iommu_cpuhp_slot = ret;
}
ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
if (ret) {
if (!nr_iommu_pmu)
cpuhp_remove_multi_state(iommu_cpuhp_slot);
return ret;
}
nr_iommu_pmu++;
return 0;
}
static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
{
cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
if (--nr_iommu_pmu)
return;
cpuhp_remove_multi_state(iommu_cpuhp_slot);
}
void iommu_pmu_register(struct intel_iommu *iommu)
{
struct iommu_pmu *iommu_pmu = iommu->pmu;
......@@ -866,17 +765,12 @@ void iommu_pmu_register(struct intel_iommu *iommu)
if (__iommu_pmu_register(iommu))
goto err;
if (iommu_pmu_cpuhp_setup(iommu_pmu))
goto unregister;
/* Set interrupt for overflow */
if (iommu_pmu_set_interrupt(iommu))
goto cpuhp_free;
goto unregister;
return;
cpuhp_free:
iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
perf_pmu_unregister(&iommu_pmu->pmu);
err:
......@@ -892,6 +786,5 @@ void iommu_pmu_unregister(struct intel_iommu *iommu)
return;
iommu_pmu_unset_interrupt(iommu);
iommu_pmu_cpuhp_free(iommu_pmu);
perf_pmu_unregister(&iommu_pmu->pmu);
}
......@@ -153,7 +153,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING,
CPUHP_AP_PERF_X86_STARTING,
CPUHP_AP_PERF_X86_AMD_IBS_STARTING,
CPUHP_AP_PERF_X86_CSTATE_STARTING,
CPUHP_AP_PERF_XTENSA_STARTING,
CPUHP_AP_ARM_VFP_STARTING,
CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
......@@ -210,7 +209,6 @@ enum cpuhp_state {
CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE,
CPUHP_AP_PERF_X86_AMD_POWER_ONLINE,
CPUHP_AP_PERF_X86_RAPL_ONLINE,
CPUHP_AP_PERF_X86_CSTATE_ONLINE,
CPUHP_AP_PERF_S390_CF_ONLINE,
CPUHP_AP_PERF_S390_SF_ONLINE,
CPUHP_AP_PERF_ARM_CCI_ONLINE,
......
......@@ -168,6 +168,9 @@ struct hw_perf_event {
struct hw_perf_event_extra extra_reg;
struct hw_perf_event_extra branch_reg;
};
struct { /* aux / Intel-PT */
u64 aux_config;
};
struct { /* software */
struct hrtimer hrtimer;
};
......@@ -292,6 +295,19 @@ struct perf_event_pmu_context;
#define PERF_PMU_CAP_AUX_OUTPUT 0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100
/**
* pmu::scope
*/
enum perf_pmu_scope {
PERF_PMU_SCOPE_NONE = 0,
PERF_PMU_SCOPE_CORE,
PERF_PMU_SCOPE_DIE,
PERF_PMU_SCOPE_CLUSTER,
PERF_PMU_SCOPE_PKG,
PERF_PMU_SCOPE_SYS_WIDE,
PERF_PMU_MAX_SCOPE,
};
struct perf_output_handle;
#define PMU_NULL_DEV ((void *)(~0UL))
......@@ -315,6 +331,11 @@ struct pmu {
*/
int capabilities;
/*
* PMU scope
*/
unsigned int scope;
int __percpu *pmu_disable_count;
struct perf_cpu_pmu_context __percpu *cpu_pmu_context;
atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */
......@@ -615,10 +636,13 @@ typedef void (*perf_overflow_handler_t)(struct perf_event *,
* PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
* cannot be a group leader. If an event with this flag is detached from the
* group it is scheduled out and moved into an unrecoverable ERROR state.
* PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
* PMU scope where it is active.
*/
#define PERF_EV_CAP_SOFTWARE BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG BIT(1)
#define PERF_EV_CAP_SIBLING BIT(2)
#define PERF_EV_CAP_READ_SCOPE BIT(3)
#define SWEVENT_HLIST_BITS 8
#define SWEVENT_HLIST_SIZE (1 << SWEVENT_HLIST_BITS)
......@@ -963,12 +987,16 @@ struct perf_event_context {
struct rcu_head rcu_head;
/*
* Sum (event->pending_work + event->pending_work)
* The count of events for which using the switch-out fast path
* should be avoided.
*
* Sum (event->pending_work + events with
* (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
*
* The SIGTRAP is targeted at ctx->task, as such it won't do changing
* that until the signal is delivered.
*/
local_t nr_pending;
local_t nr_no_switch_fast;
};
struct perf_cpu_pmu_context {
......
......@@ -244,6 +244,42 @@ rb_find_add(struct rb_node *node, struct rb_root *tree,
return NULL;
}
/**
* rb_find_add_rcu() - find equivalent @node in @tree, or add @node
* @node: node to look-for / insert
* @tree: tree to search / modify
* @cmp: operator defining the node order
*
* Adds a Store-Release for link_node.
*
* Returns the rb_node matching @node, or NULL when no match is found and @node
* is inserted.
*/
static __always_inline struct rb_node *
rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
int (*cmp)(struct rb_node *, const struct rb_node *))
{
struct rb_node **link = &tree->rb_node;
struct rb_node *parent = NULL;
int c;
while (*link) {
parent = *link;
c = cmp(node, parent);
if (c < 0)
link = &parent->rb_left;
else if (c > 0)
link = &parent->rb_right;
else
return parent;
}
rb_link_node_rcu(node, parent, link);
rb_insert_color(node, tree);
return NULL;
}
/**
* rb_find() - find @key in tree @tree
* @key: key to match
......@@ -272,6 +308,37 @@ rb_find(const void *key, const struct rb_root *tree,
return NULL;
}
/**
* rb_find_rcu() - find @key in tree @tree
* @key: key to match
* @tree: tree to search
* @cmp: operator defining the node order
*
* Notably, tree descent vs concurrent tree rotations is unsound and can result
* in false-negatives.
*
* Returns the rb_node matching @key or NULL.
*/
static __always_inline struct rb_node *
rb_find_rcu(const void *key, const struct rb_root *tree,
int (*cmp)(const void *key, const struct rb_node *))
{
struct rb_node *node = tree->rb_node;
while (node) {
int c = cmp(key, node);
if (c < 0)
node = rcu_dereference_raw(node->rb_left);
else if (c > 0)
node = rcu_dereference_raw(node->rb_right);
else
return node;
}
return NULL;
}
/**
* rb_find_first() - find the first @key in @tree
* @key: key to match
......
......@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/wait.h>
struct uprobe;
struct vm_area_struct;
struct mm_struct;
struct inode;
......@@ -27,22 +28,22 @@ struct page;
#define MAX_URETPROBE_DEPTH 64
enum uprobe_filter_ctx {
UPROBE_FILTER_REGISTER,
UPROBE_FILTER_UNREGISTER,
UPROBE_FILTER_MMAP,
};
struct uprobe_consumer {
/*
* handler() can return UPROBE_HANDLER_REMOVE to signal the need to
* unregister uprobe for current process. If UPROBE_HANDLER_REMOVE is
* returned, filter() callback has to be implemented as well and it
* should return false to "confirm" the decision to uninstall uprobe
* for the current process. If filter() is omitted or returns true,
* UPROBE_HANDLER_REMOVE is effectively ignored.
*/
int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
int (*ret_handler)(struct uprobe_consumer *self,
unsigned long func,
struct pt_regs *regs);
bool (*filter)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm);
struct uprobe_consumer *next;
struct list_head cons_node;
};
#ifdef CONFIG_UPROBES
......@@ -76,6 +77,8 @@ struct uprobe_task {
struct uprobe *active_uprobe;
unsigned long xol_vaddr;
struct arch_uprobe *auprobe;
struct return_instance *return_instances;
unsigned int depth;
};
......@@ -110,10 +113,10 @@ extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t);
extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
extern void uprobe_unregister_sync(void);
extern int uprobe_mmap(struct vm_area_struct *vma);
extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end);
extern void uprobe_start_dup_mmap(void);
......@@ -151,22 +154,21 @@ static inline void uprobes_init(void)
#define uprobe_get_trap_addr(regs) instruction_pointer(regs)
static inline int
uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
return -ENOSYS;
}
static inline int uprobe_register_refctr(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
static inline struct uprobe *
uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
return -ENOSYS;
return ERR_PTR(-ENOSYS);
}
static inline int
uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool add)
uprobe_apply(struct uprobe* uprobe, struct uprobe_consumer *uc, bool add)
{
return -ENOSYS;
}
static inline void
uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
}
static inline void uprobe_unregister_sync(void)
{
}
static inline int uprobe_mmap(struct vm_area_struct *vma)
......
......@@ -155,20 +155,55 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
enum event_type_t {
EVENT_FLEXIBLE = 0x01,
EVENT_PINNED = 0x02,
EVENT_TIME = 0x04,
EVENT_FROZEN = 0x08,
/* see ctx_resched() for details */
EVENT_CPU = 0x10,
EVENT_CGROUP = 0x20,
/* compound helpers */
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
};
static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
raw_spin_lock(&ctx->lock);
WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
raw_spin_lock(&cpuctx->ctx.lock);
__perf_ctx_lock(&cpuctx->ctx);
if (ctx)
raw_spin_lock(&ctx->lock);
__perf_ctx_lock(ctx);
}
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
/*
* If ctx_sched_in() didn't again set any ALL flags, clean up
* after ctx_sched_out() by clearing is_active.
*/
if (ctx->is_active & EVENT_FROZEN) {
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
else
ctx->is_active &= ~EVENT_FROZEN;
}
raw_spin_unlock(&ctx->lock);
}
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
if (ctx)
raw_spin_unlock(&ctx->lock);
raw_spin_unlock(&cpuctx->ctx.lock);
__perf_ctx_unlock(ctx);
__perf_ctx_unlock(&cpuctx->ctx);
}
#define TASK_TOMBSTONE ((void *)-1L)
......@@ -264,6 +299,7 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
struct perf_cpu_context *cpuctx;
struct event_function_struct efs = {
.event = event,
.func = func,
......@@ -291,22 +327,25 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
if (!task_function_call(task, event_function, &efs))
return;
raw_spin_lock_irq(&ctx->lock);
local_irq_disable();
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
/*
* Reload the task pointer, it might have been changed by
* a concurrent perf_event_context_sched_out().
*/
task = ctx->task;
if (task == TASK_TOMBSTONE) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
if (task == TASK_TOMBSTONE)
goto unlock;
if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable();
goto again;
}
func(event, NULL, ctx, data);
raw_spin_unlock_irq(&ctx->lock);
unlock:
perf_ctx_unlock(cpuctx, ctx);
local_irq_enable();
}
/*
......@@ -369,16 +408,6 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
(PERF_SAMPLE_BRANCH_KERNEL |\
PERF_SAMPLE_BRANCH_HV)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_TIME = 0x4,
/* see ctx_resched() for details */
EVENT_CPU = 0x8,
EVENT_CGROUP = 0x10,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
/*
* perf_sched_events : >0 events exist
*/
......@@ -407,6 +436,11 @@ static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
static cpumask_var_t perf_online_mask;
static cpumask_var_t perf_online_core_mask;
static cpumask_var_t perf_online_die_mask;
static cpumask_var_t perf_online_cluster_mask;
static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
static struct kmem_cache *perf_event_cache;
/*
......@@ -685,30 +719,32 @@ do { \
___p; \
})
#define for_each_epc(_epc, _ctx, _pmu, _cgroup) \
list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
if (_cgroup && !_epc->nr_cgroups) \
continue; \
else if (_pmu && _epc->pmu != _pmu) \
continue; \
else
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_disable(pmu_ctx->pmu);
}
}
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, NULL, cgroup)
perf_pmu_enable(pmu_ctx->pmu);
}
}
static void ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type);
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
#ifdef CONFIG_CGROUP_PERF
......@@ -865,7 +901,7 @@ static void perf_cgroup_switch(struct task_struct *task)
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_ctx_disable(&cpuctx->ctx, true);
ctx_sched_out(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
/*
* must not be done before ctxswout due
* to update_cgrp_time_from_cpuctx() in
......@@ -877,7 +913,7 @@ static void perf_cgroup_switch(struct task_struct *task)
* perf_cgroup_set_timestamp() in ctx_sched_in()
* to not have to pass task around
*/
ctx_sched_in(&cpuctx->ctx, EVENT_ALL|EVENT_CGROUP);
ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
perf_ctx_enable(&cpuctx->ctx, true);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
......@@ -1768,6 +1804,14 @@ perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
event = rb_entry_safe(rb_next(&event->group_node), \
typeof(*event), group_node))
/*
* Does the event attribute request inherit with PERF_SAMPLE_READ
*/
static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
{
return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
}
/*
* Add an event from the lists for its context.
* Must be called with ctx->mutex and ctx->lock held.
......@@ -1798,6 +1842,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
if (has_inherit_and_sample_read(&event->attr))
local_inc(&ctx->nr_no_switch_fast);
if (event->state > PERF_EVENT_STATE_OFF)
perf_cgroup_event_enable(event, ctx);
......@@ -2022,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
if (has_inherit_and_sample_read(&event->attr))
local_dec(&ctx->nr_no_switch_fast);
list_del_rcu(&event->event_entry);
......@@ -2317,6 +2365,45 @@ group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
event_sched_out(event, ctx);
}
static inline void
__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
{
if (ctx->is_active & EVENT_TIME) {
if (ctx->is_active & EVENT_FROZEN)
return;
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, final);
}
}
static inline void
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
__ctx_time_update(cpuctx, ctx, false);
}
/*
* To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
*/
static inline void
ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
ctx_time_update(cpuctx, ctx);
if (ctx->is_active & EVENT_TIME)
ctx->is_active |= EVENT_FROZEN;
}
static inline void
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
{
if (ctx->is_active & EVENT_TIME) {
if (ctx->is_active & EVENT_FROZEN)
return;
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
}
#define DETACH_GROUP 0x01UL
#define DETACH_CHILD 0x02UL
#define DETACH_DEAD 0x04UL
......@@ -2336,10 +2423,7 @@ __perf_remove_from_context(struct perf_event *event,
struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
unsigned long flags = (unsigned long)info;
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, false);
}
ctx_time_update(cpuctx, ctx);
/*
* Ensure event_sched_out() switches to OFF, at the very least
......@@ -2424,12 +2508,8 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
perf_pmu_disable(event->pmu_ctx->pmu);
ctx_time_update_event(ctx, event);
if (event == event->group_leader)
group_sched_out(event, ctx);
......@@ -2645,7 +2725,8 @@ static void add_event_to_ctx(struct perf_event *event,
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
enum event_type_t event_type)
struct pmu *pmu,
enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
......@@ -2655,18 +2736,19 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
return;
ctx_sched_out(ctx, event_type);
ctx_sched_out(ctx, pmu, event_type);
}
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
struct perf_event_context *ctx,
struct pmu *pmu)
{
ctx_sched_in(&cpuctx->ctx, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
if (ctx)
ctx_sched_in(ctx, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_in(ctx, pmu, EVENT_PINNED);
ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
if (ctx)
ctx_sched_in(ctx, EVENT_FLEXIBLE);
ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}
/*
......@@ -2684,16 +2766,12 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
* event_type is a bit mask of the types of events involved. For CPU events,
* event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
*/
/*
* XXX: ctx_resched() reschedule entire perf_event_context while adding new
* event to the context or enabling existing event in the context. We can
* probably optimize it by rescheduling only affected pmu_ctx.
*/
static void ctx_resched(struct perf_cpu_context *cpuctx,
struct perf_event_context *task_ctx,
enum event_type_t event_type)
struct pmu *pmu, enum event_type_t event_type)
{
bool cpu_event = !!(event_type & EVENT_CPU);
struct perf_event_pmu_context *epc;
/*
* If pinned groups are involved, flexible groups also need to be
......@@ -2704,10 +2782,14 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
event_type &= EVENT_ALL;
perf_ctx_disable(&cpuctx->ctx, false);
for_each_epc(epc, &cpuctx->ctx, pmu, false)
perf_pmu_disable(epc->pmu);
if (task_ctx) {
perf_ctx_disable(task_ctx, false);
task_ctx_sched_out(task_ctx, event_type);
for_each_epc(epc, task_ctx, pmu, false)
perf_pmu_disable(epc->pmu);
task_ctx_sched_out(task_ctx, pmu, event_type);
}
/*
......@@ -2718,15 +2800,19 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
* - otherwise, do nothing more.
*/
if (cpu_event)
ctx_sched_out(&cpuctx->ctx, event_type);
ctx_sched_out(&cpuctx->ctx, pmu, event_type);
else if (event_type & EVENT_PINNED)
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, task_ctx, pmu);
perf_event_sched_in(cpuctx, task_ctx);
for_each_epc(epc, &cpuctx->ctx, pmu, false)
perf_pmu_enable(epc->pmu);
perf_ctx_enable(&cpuctx->ctx, false);
if (task_ctx)
perf_ctx_enable(task_ctx, false);
if (task_ctx) {
for_each_epc(epc, task_ctx, pmu, false)
perf_pmu_enable(epc->pmu);
}
}
void perf_pmu_resched(struct pmu *pmu)
......@@ -2735,7 +2821,7 @@ void perf_pmu_resched(struct pmu *pmu)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
perf_ctx_lock(cpuctx, task_ctx);
ctx_resched(cpuctx, task_ctx, EVENT_ALL|EVENT_CPU);
ctx_resched(cpuctx, task_ctx, pmu, EVENT_ALL|EVENT_CPU);
perf_ctx_unlock(cpuctx, task_ctx);
}
......@@ -2791,9 +2877,10 @@ static int __perf_install_in_context(void *info)
#endif
if (reprogram) {
ctx_sched_out(ctx, EVENT_TIME);
ctx_time_freeze(cpuctx, ctx);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
get_event_type(event));
} else {
add_event_to_ctx(event, ctx);
}
......@@ -2936,8 +3023,7 @@ static void __perf_event_enable(struct perf_event *event,
event->state <= PERF_EVENT_STATE_ERROR)
return;
if (ctx->is_active)
ctx_sched_out(ctx, EVENT_TIME);
ctx_time_freeze(cpuctx, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
perf_cgroup_event_enable(event, ctx);
......@@ -2945,25 +3031,21 @@ static void __perf_event_enable(struct perf_event *event,
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
ctx_sched_in(ctx, EVENT_TIME);
if (!event_filter_match(event))
return;
}
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
ctx_sched_in(ctx, EVENT_TIME);
if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
return;
}
task_ctx = cpuctx->task_ctx;
if (ctx->task)
WARN_ON_ONCE(task_ctx != ctx);
ctx_resched(cpuctx, task_ctx, get_event_type(event));
ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu, get_event_type(event));
}
/*
......@@ -3231,7 +3313,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
struct perf_event *event, *tmp;
struct pmu *pmu = pmu_ctx->pmu;
if (ctx->task && !ctx->is_active) {
if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
struct perf_cpu_pmu_context *cpc;
cpc = this_cpu_ptr(pmu->cpu_pmu_context);
......@@ -3239,7 +3321,7 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
cpc->task_epc = NULL;
}
if (!event_type)
if (!(event_type & EVENT_ALL))
return;
perf_pmu_disable(pmu);
......@@ -3265,8 +3347,17 @@ static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
perf_pmu_enable(pmu);
}
/*
* Be very careful with the @pmu argument since this will change ctx state.
* The @pmu argument works for ctx_resched(), because that is symmetric in
* ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
*
* However, if you were to be asymmetrical, you could end up with messed up
* state, eg. ctx->is_active cleared even though most EPCs would still actually
* be active.
*/
static void
ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
......@@ -3297,34 +3388,36 @@ ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type)
*
* would only update time for the pinned events.
*/
if (is_active & EVENT_TIME) {
/* update (and stop) ctx time */
update_context_time(ctx);
update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx);
__ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
*/
barrier();
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL)) {
/*
* CPU-release for the below ->is_active store,
* see __load_acquire() in perf_event_time_now()
* For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
* does not observe a hole. perf_ctx_unlock() will clean up.
*/
barrier();
if (ctx->is_active & EVENT_FROZEN)
ctx->is_active &= EVENT_TIME_FROZEN;
else
ctx->is_active = 0;
}
ctx->is_active &= ~event_type;
if (!(ctx->is_active & EVENT_ALL))
ctx->is_active = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
if (!ctx->is_active)
if (!(ctx->is_active & EVENT_ALL))
cpuctx->task_ctx = NULL;
}
is_active ^= ctx->is_active; /* changed bits */
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_out(pmu_ctx, is_active);
}
}
/*
......@@ -3517,12 +3610,17 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
perf_ctx_disable(ctx, false);
/* PMIs are disabled; ctx->nr_pending is stable. */
if (local_read(&ctx->nr_pending) ||
local_read(&next_ctx->nr_pending)) {
/* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
if (local_read(&ctx->nr_no_switch_fast) ||
local_read(&next_ctx->nr_no_switch_fast)) {
/*
* Must not swap out ctx when there's pending
* events that rely on the ctx->task relation.
*
* Likewise, when a context contains inherit +
* SAMPLE_READ events they should be switched
* out using the slow path so that they are
* treated as if they were distinct contexts.
*/
raw_spin_unlock(&next_ctx->lock);
rcu_read_unlock();
......@@ -3563,7 +3661,7 @@ perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
inside_switch:
perf_ctx_sched_task_cb(ctx, false);
task_ctx_sched_out(ctx, EVENT_ALL);
task_ctx_sched_out(ctx, NULL, EVENT_ALL);
perf_ctx_enable(ctx, false);
raw_spin_unlock(&ctx->lock);
......@@ -3861,29 +3959,22 @@ static void pmu_groups_sched_in(struct perf_event_context *ctx,
merge_sched_in, &can_add_hw);
}
static void ctx_groups_sched_in(struct perf_event_context *ctx,
struct perf_event_groups *groups,
bool cgroup)
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
enum event_type_t event_type)
{
struct perf_event_pmu_context *pmu_ctx;
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (cgroup && !pmu_ctx->nr_cgroups)
continue;
pmu_groups_sched_in(ctx, groups, pmu_ctx->pmu);
}
}
struct perf_event_context *ctx = pmu_ctx->ctx;
static void __pmu_ctx_sched_in(struct perf_event_context *ctx,
struct pmu *pmu)
{
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
if (event_type & EVENT_PINNED)
pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu);
if (event_type & EVENT_FLEXIBLE)
pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu);
}
static void
ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
struct perf_event_pmu_context *pmu_ctx;
int is_active = ctx->is_active;
bool cgroup = event_type & EVENT_CGROUP;
......@@ -3907,7 +3998,7 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
ctx->is_active |= (event_type | EVENT_TIME);
if (ctx->task) {
if (!is_active)
if (!(is_active & EVENT_ALL))
cpuctx->task_ctx = ctx;
else
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
......@@ -3919,12 +4010,16 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type)
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
if (is_active & EVENT_PINNED)
ctx_groups_sched_in(ctx, &ctx->pinned_groups, cgroup);
if (is_active & EVENT_PINNED) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
}
/* Then walk through the lower prio flexible groups */
if (is_active & EVENT_FLEXIBLE)
ctx_groups_sched_in(ctx, &ctx->flexible_groups, cgroup);
if (is_active & EVENT_FLEXIBLE) {
for_each_epc(pmu_ctx, ctx, pmu, cgroup)
__pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
}
}
static void perf_event_context_sched_in(struct task_struct *task)
......@@ -3967,10 +4062,10 @@ static void perf_event_context_sched_in(struct task_struct *task)
*/
if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
perf_ctx_disable(&cpuctx->ctx, false);
ctx_sched_out(&cpuctx->ctx, EVENT_FLEXIBLE);
ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
}
perf_event_sched_in(cpuctx, ctx);
perf_event_sched_in(cpuctx, ctx, NULL);
perf_ctx_sched_task_cb(cpuctx->task_ctx, true);
......@@ -4093,7 +4188,11 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
period = perf_calculate_period(event, nsec, count);
delta = (s64)(period - hwc->sample_period);
delta = (delta + 7) / 8; /* low pass filter */
if (delta >= 0)
delta += 7;
else
delta -= 7;
delta /= 8; /* low pass filter */
sample_period = hwc->sample_period + delta;
......@@ -4311,14 +4410,14 @@ static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
update_context_time(&cpuctx->ctx);
__pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
rotate_ctx(&cpuctx->ctx, cpu_event);
__pmu_ctx_sched_in(&cpuctx->ctx, pmu);
__pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
}
if (task_event)
rotate_ctx(task_epc->ctx, task_event);
if (task_event || (task_epc && cpu_event))
__pmu_ctx_sched_in(task_epc->ctx, pmu);
__pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);
perf_pmu_enable(pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
......@@ -4384,7 +4483,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
cpuctx = this_cpu_ptr(&perf_cpu_context);
perf_ctx_lock(cpuctx, ctx);
ctx_sched_out(ctx, EVENT_TIME);
ctx_time_freeze(cpuctx, ctx);
list_for_each_entry(event, &ctx->event_list, event_entry) {
enabled |= event_enable_on_exec(event, ctx);
......@@ -4396,9 +4495,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
*/
if (enabled) {
clone_ctx = unclone_ctx(ctx);
ctx_resched(cpuctx, ctx, event_type);
} else {
ctx_sched_in(ctx, EVENT_TIME);
ctx_resched(cpuctx, ctx, NULL, event_type);
}
perf_ctx_unlock(cpuctx, ctx);
......@@ -4459,16 +4556,24 @@ struct perf_read_data {
int ret;
};
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
int local_cpu = smp_processor_id();
u16 local_pkg, event_pkg;
if ((unsigned)event_cpu >= nr_cpu_ids)
return event_cpu;
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
int local_cpu = smp_processor_id();
if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
if (cpumask && cpumask_test_cpu(local_cpu, cpumask))
return local_cpu;
}
if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
event_pkg = topology_physical_package_id(event_cpu);
local_pkg = topology_physical_package_id(local_cpu);
......@@ -4501,10 +4606,7 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (data->group)
......@@ -4539,8 +4641,11 @@ static void __perf_event_read(void *info)
raw_spin_unlock(&ctx->lock);
}
static inline u64 perf_event_count(struct perf_event *event)
static inline u64 perf_event_count(struct perf_event *event, bool self)
{
if (self)
return local64_read(&event->count);
return local64_read(&event->count) + atomic64_read(&event->child_count);
}
......@@ -4701,10 +4806,7 @@ static int perf_event_read(struct perf_event *event, bool group)
* May read while context is not active (e.g., thread is
* blocked), in that case we cannot update context time
*/
if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
ctx_time_update_event(ctx, event);
perf_event_update_time(event);
if (group)
......@@ -5205,7 +5307,7 @@ static void perf_pending_task_sync(struct perf_event *event)
*/
if (task_work_cancel(current, head)) {
event->pending_work = 0;
local_dec(&event->ctx->nr_pending);
local_dec(&event->ctx->nr_no_switch_fast);
return;
}
......@@ -5499,7 +5601,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
mutex_lock(&event->child_mutex);
(void)perf_event_read(event, false);
total += perf_event_count(event);
total += perf_event_count(event, false);
*enabled += event->total_time_enabled +
atomic64_read(&event->child_total_time_enabled);
......@@ -5508,7 +5610,7 @@ static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *
list_for_each_entry(child, &event->child_list, child_list) {
(void)perf_event_read(child, false);
total += perf_event_count(child);
total += perf_event_count(child, false);
*enabled += child->total_time_enabled;
*running += child->total_time_running;
}
......@@ -5590,14 +5692,14 @@ static int __perf_read_group_add(struct perf_event *leader,
/*
* Write {count,id} tuples for every sibling.
*/
values[n++] += perf_event_count(leader);
values[n++] += perf_event_count(leader, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
values[n++] += perf_event_count(sub, false);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
......@@ -6177,7 +6279,7 @@ void perf_event_update_userpage(struct perf_event *event)
++userpg->lock;
barrier();
userpg->index = perf_event_index(event);
userpg->offset = perf_event_count(event);
userpg->offset = perf_event_count(event, false);
if (userpg->index)
userpg->offset -= local64_read(&event->hw.prev_count);
......@@ -6874,7 +6976,7 @@ static void perf_pending_task(struct callback_head *head)
if (event->pending_work) {
event->pending_work = 0;
perf_sigtrap(event);
local_dec(&event->ctx->nr_pending);
local_dec(&event->ctx->nr_no_switch_fast);
rcuwait_wake_up(&event->pending_work_wait);
}
rcu_read_unlock();
......@@ -7256,7 +7358,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
values[n++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
......@@ -7274,14 +7376,15 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
static void perf_output_read_group(struct perf_output_handle *handle,
struct perf_event *event,
u64 enabled, u64 running)
struct perf_event *event,
u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
unsigned long flags;
u64 values[6];
int n = 0;
bool self = has_inherit_and_sample_read(&event->attr);
/*
* Disabling interrupts avoids all counter scheduling
......@@ -7301,7 +7404,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(leader->state == PERF_EVENT_STATE_ACTIVE))
leader->pmu->read(leader);
values[n++] = perf_event_count(leader);
values[n++] = perf_event_count(leader, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
......@@ -7316,7 +7419,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
(sub->state == PERF_EVENT_STATE_ACTIVE))
sub->pmu->read(sub);
values[n++] = perf_event_count(sub);
values[n++] = perf_event_count(sub, self);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
......@@ -7337,6 +7440,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
* The problem is that its both hard and excessively expensive to iterate the
* child list, not to mention that its impossible to IPI the children running
* on another CPU, from interrupt/NMI context.
*
* Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
* counts rather than attempting to accumulate some value across all children on
* all cores.
*/
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
......@@ -9747,7 +9854,7 @@ static int __perf_event_overflow(struct perf_event *event,
if (!event->pending_work &&
!task_work_add(current, &event->pending_task, notify_mode)) {
event->pending_work = pending_id;
local_inc(&event->ctx->nr_pending);
local_inc(&event->ctx->nr_no_switch_fast);
event->pending_addr = 0;
if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
......@@ -11484,10 +11591,60 @@ perf_event_mux_interval_ms_store(struct device *dev,
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
{
switch (scope) {
case PERF_PMU_SCOPE_CORE:
return topology_sibling_cpumask(cpu);
case PERF_PMU_SCOPE_DIE:
return topology_die_cpumask(cpu);
case PERF_PMU_SCOPE_CLUSTER:
return topology_cluster_cpumask(cpu);
case PERF_PMU_SCOPE_PKG:
return topology_core_cpumask(cpu);
case PERF_PMU_SCOPE_SYS_WIDE:
return cpu_online_mask;
}
return NULL;
}
static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
{
switch (scope) {
case PERF_PMU_SCOPE_CORE:
return perf_online_core_mask;
case PERF_PMU_SCOPE_DIE:
return perf_online_die_mask;
case PERF_PMU_SCOPE_CLUSTER:
return perf_online_cluster_mask;
case PERF_PMU_SCOPE_PKG:
return perf_online_pkg_mask;
case PERF_PMU_SCOPE_SYS_WIDE:
return perf_online_sys_mask;
}
return NULL;
}
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct pmu *pmu = dev_get_drvdata(dev);
struct cpumask *mask = perf_scope_cpumask(pmu->scope);
if (mask)
return cpumap_print_to_pagebuf(true, buf, mask);
return 0;
}
static DEVICE_ATTR_RO(cpumask);
static struct attribute *pmu_dev_attrs[] = {
&dev_attr_type.attr,
&dev_attr_perf_event_mux_interval_ms.attr,
&dev_attr_nr_addr_filters.attr,
&dev_attr_cpumask.attr,
NULL,
};
......@@ -11499,6 +11656,10 @@ static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int
if (n == 2 && !pmu->nr_addr_filters)
return 0;
/* cpumask */
if (n == 3 && pmu->scope == PERF_PMU_SCOPE_NONE)
return 0;
return a->mode;
}
......@@ -11583,6 +11744,11 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
goto free_pdc;
}
if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE, "Can not register a pmu with an invalid scope.\n")) {
ret = -EINVAL;
goto free_pdc;
}
pmu->name = name;
if (type >= 0)
......@@ -11737,6 +11903,22 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
event_has_any_exclude_flag(event))
ret = -EINVAL;
if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
struct cpumask *pmu_cpumask = perf_scope_cpumask(pmu->scope);
int cpu;
if (pmu_cpumask && cpumask) {
cpu = cpumask_any_and(pmu_cpumask, cpumask);
if (cpu >= nr_cpu_ids)
ret = -ENODEV;
else
event->event_caps |= PERF_EV_CAP_READ_SCOPE;
} else {
ret = -ENODEV;
}
}
if (ret && event->destroy)
event->destroy(event);
}
......@@ -12064,10 +12246,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
local64_set(&hwc->period_left, hwc->sample_period);
/*
* We currently do not support PERF_SAMPLE_READ on inherited events.
* We do not support PERF_SAMPLE_READ on inherited events unless
* PERF_SAMPLE_TID is also selected, which allows inherited events to
* collect per-thread samples.
* See perf_output_read().
*/
if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
goto err_ns;
if (!has_branch_stack(event))
......@@ -13091,7 +13275,7 @@ static void sync_child_event(struct perf_event *child_event)
perf_event_read_event(child_event, task);
}
child_val = perf_event_count(child_event);
child_val = perf_event_count(child_event, false);
/*
* Add back the child's count to the parent's count:
......@@ -13182,7 +13366,7 @@ static void perf_event_exit_task_context(struct task_struct *child)
* in.
*/
raw_spin_lock_irq(&child_ctx->lock);
task_ctx_sched_out(child_ctx, EVENT_ALL);
task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);
/*
* Now that the context is inactive, destroy the task <-> ctx relation
......@@ -13697,6 +13881,12 @@ static void __init perf_event_init_all_cpus(void)
int cpu;
zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);
for_each_possible_cpu(cpu) {
swhash = &per_cpu(swevent_htable, cpu);
......@@ -13740,12 +13930,46 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
ctx_sched_out(ctx, EVENT_TIME);
ctx_sched_out(ctx, NULL, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
}
static void perf_event_clear_cpumask(unsigned int cpu)
{
int target[PERF_PMU_MAX_SCOPE];
unsigned int scope;
struct pmu *pmu;
cpumask_clear_cpu(cpu, perf_online_mask);
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
struct cpumask *pmu_cpumask = perf_scope_cpumask(scope);
target[scope] = -1;
if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
continue;
if (!cpumask_test_and_clear_cpu(cpu, pmu_cpumask))
continue;
target[scope] = cpumask_any_but(cpumask, cpu);
if (target[scope] < nr_cpu_ids)
cpumask_set_cpu(target[scope], pmu_cpumask);
}
/* migrate */
list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
if (pmu->scope == PERF_PMU_SCOPE_NONE ||
WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
continue;
if (target[pmu->scope] >= 0 && target[pmu->scope] < nr_cpu_ids)
perf_pmu_migrate_context(pmu, cpu, target[pmu->scope]);
}
}
static void perf_event_exit_cpu_context(int cpu)
{
struct perf_cpu_context *cpuctx;
......@@ -13753,6 +13977,11 @@ static void perf_event_exit_cpu_context(int cpu)
// XXX simplify cpuctx->online
mutex_lock(&pmus_lock);
/*
* Clear the cpumasks, and migrate to other CPUs if possible.
* Must be invoked before the __perf_event_exit_context.
*/
perf_event_clear_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
......@@ -13760,7 +13989,6 @@ static void perf_event_exit_cpu_context(int cpu)
smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
cpuctx->online = 0;
mutex_unlock(&ctx->mutex);
cpumask_clear_cpu(cpu, perf_online_mask);
mutex_unlock(&pmus_lock);
}
#else
......@@ -13769,6 +13997,42 @@ static void perf_event_exit_cpu_context(int cpu) { }
#endif
static void perf_event_setup_cpumask(unsigned int cpu)
{
struct cpumask *pmu_cpumask;
unsigned int scope;
cpumask_set_cpu(cpu, perf_online_mask);
/*
* Early boot stage, the cpumask hasn't been set yet.
* The perf_online_<domain>_masks includes the first CPU of each domain.
* Always uncondifionally set the boot CPU for the perf_online_<domain>_masks.
*/
if (!topology_sibling_cpumask(cpu)) {
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
pmu_cpumask = perf_scope_cpumask(scope);
if (WARN_ON_ONCE(!pmu_cpumask))
continue;
cpumask_set_cpu(cpu, pmu_cpumask);
}
return;
}
for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
const struct cpumask *cpumask = perf_scope_cpu_topology_cpumask(scope, cpu);
pmu_cpumask = perf_scope_cpumask(scope);
if (WARN_ON_ONCE(!pmu_cpumask || !cpumask))
continue;
if (!cpumask_empty(cpumask) &&
cpumask_any_and(pmu_cpumask, cpumask) >= nr_cpu_ids)
cpumask_set_cpu(cpu, pmu_cpumask);
}
}
int perf_event_init_cpu(unsigned int cpu)
{
struct perf_cpu_context *cpuctx;
......@@ -13777,7 +14041,7 @@ int perf_event_init_cpu(unsigned int cpu)
perf_swevent_init_cpu(cpu);
mutex_lock(&pmus_lock);
cpumask_set_cpu(cpu, perf_online_mask);
perf_event_setup_cpumask(cpu);
cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
ctx = &cpuctx->ctx;
......
......@@ -40,6 +40,9 @@ static struct rb_root uprobes_tree = RB_ROOT;
#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */
static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock);
DEFINE_STATIC_SRCU(uprobes_srcu);
#define UPROBES_HASH_SZ 13
/* serialize uprobe->pending_list */
......@@ -57,8 +60,9 @@ struct uprobe {
struct rw_semaphore register_rwsem;
struct rw_semaphore consumer_rwsem;
struct list_head pending_list;
struct uprobe_consumer *consumers;
struct list_head consumers;
struct inode *inode; /* Also hold a ref to inode */
struct rcu_head rcu;
loff_t offset;
loff_t ref_ctr_offset;
unsigned long flags;
......@@ -109,6 +113,11 @@ struct xol_area {
unsigned long vaddr; /* Page(s) of instruction slots */
};
static void uprobe_warn(struct task_struct *t, const char *msg)
{
pr_warn("uprobe: %s:%d failed to %s\n", current->comm, current->pid, msg);
}
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
......@@ -453,7 +462,7 @@ static int update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm,
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
* Called with mm->mmap_lock held for write.
* Called with mm->mmap_lock held for read or write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct mm_struct *mm,
......@@ -587,25 +596,63 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
*(uprobe_opcode_t *)&auprobe->insn);
}
/* uprobe should have guaranteed positive refcount */
static struct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref);
return uprobe;
}
/*
* uprobe should have guaranteed lifetime, which can be either of:
* - caller already has refcount taken (and wants an extra one);
* - uprobe is RCU protected and won't be freed until after grace period;
* - we are holding uprobes_treelock (for read or write, doesn't matter).
*/
static struct uprobe *try_get_uprobe(struct uprobe *uprobe)
{
if (refcount_inc_not_zero(&uprobe->ref))
return uprobe;
return NULL;
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
static void uprobe_free_rcu(struct rcu_head *rcu)
{
struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu);
kfree(uprobe);
}
static void put_uprobe(struct uprobe *uprobe)
{
if (refcount_dec_and_test(&uprobe->ref)) {
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
kfree(uprobe);
if (!refcount_dec_and_test(&uprobe->ref))
return;
write_lock(&uprobes_treelock);
if (uprobe_is_active(uprobe)) {
write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_seqcount_end(&uprobes_seqcount);
}
write_unlock(&uprobes_treelock);
/*
* If application munmap(exec_vma) before uprobe_unregister()
* gets called, we don't get a chance to remove uprobe from
* delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu);
}
static __always_inline
......@@ -647,62 +694,86 @@ static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b)
return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b));
}
static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
/*
* Assumes being inside RCU protected region.
* No refcount is taken on returned uprobe.
*/
static struct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{
struct __uprobe_key key = {
.inode = inode,
.offset = offset,
};
struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key);
struct rb_node *node;
unsigned int seq;
if (node)
return get_uprobe(__node_2_uprobe(node));
lockdep_assert(srcu_read_lock_held(&uprobes_srcu));
do {
seq = read_seqcount_begin(&uprobes_seqcount);
node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key);
/*
* Lockless RB-tree lookups can result only in false negatives.
* If the element is found, it is correct and can be returned
* under RCU protection. If we find nothing, we need to
* validate that seqcount didn't change. If it did, we have to
* try again as we might have missed the element (false
* negative). If seqcount is unchanged, search truly failed.
*/
if (node)
return __node_2_uprobe(node);
} while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL;
}
/*
* Find a uprobe corresponding to a given inode:offset
* Acquires uprobes_treelock
* Attempt to insert a new uprobe into uprobes_tree.
*
* If uprobe already exists (for given inode+offset), we just increment
* refcount of previously existing uprobe.
*
* If not, a provided new instance of uprobe is inserted into the tree (with
* assumed initial refcount == 1).
*
* In any case, we return a uprobe instance that ends up being in uprobes_tree.
* Caller has to clean up new uprobe instance, if it ended up not being
* inserted into the tree.
*
* We assume that uprobes_treelock is held for writing.
*/
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe;
read_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
read_unlock(&uprobes_treelock);
return uprobe;
}
static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
{
struct rb_node *node;
again:
node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node) {
struct uprobe *u = __node_2_uprobe(node);
if (!try_get_uprobe(u)) {
rb_erase(node, &uprobes_tree);
RB_CLEAR_NODE(&u->rb_node);
goto again;
}
node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp);
if (node)
return get_uprobe(__node_2_uprobe(node));
return u;
}
/* get access + creation ref */
refcount_set(&uprobe->ref, 2);
return NULL;
return uprobe;
}
/*
* Acquire uprobes_treelock.
* Matching uprobe already exists in rbtree;
* increment (access refcount) and return the matching uprobe.
*
* No matching uprobe; insert the uprobe in rb_tree;
* get a double refcount (access + creation) and return NULL.
* Acquire uprobes_treelock and insert uprobe into uprobes_tree
* (or reuse existing one, see __insert_uprobe() comments above).
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
struct uprobe *u;
write_lock(&uprobes_treelock);
write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
return u;
......@@ -725,18 +796,21 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
uprobe = kzalloc(sizeof(struct uprobe), GFP_KERNEL);
if (!uprobe)
return NULL;
return ERR_PTR(-ENOMEM);
uprobe->inode = inode;
uprobe->offset = offset;
uprobe->ref_ctr_offset = ref_ctr_offset;
INIT_LIST_HEAD(&uprobe->consumers);
init_rwsem(&uprobe->register_rwsem);
init_rwsem(&uprobe->consumer_rwsem);
RB_CLEAR_NODE(&uprobe->rb_node);
refcount_set(&uprobe->ref, 1);
/* add to uprobes_tree, sorted on inode:offset */
cur_uprobe = insert_uprobe(uprobe);
/* a uprobe exists for this inode:offset combination */
if (cur_uprobe) {
if (cur_uprobe != uprobe) {
if (cur_uprobe->ref_ctr_offset != uprobe->ref_ctr_offset) {
ref_ctr_mismatch_warn(cur_uprobe, uprobe);
put_uprobe(cur_uprobe);
......@@ -753,32 +827,19 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset,
static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
down_write(&uprobe->consumer_rwsem);
uc->next = uprobe->consumers;
uprobe->consumers = uc;
list_add_rcu(&uc->cons_node, &uprobe->consumers);
up_write(&uprobe->consumer_rwsem);
}
/*
* For uprobe @uprobe, delete the consumer @uc.
* Return true if the @uc is deleted successfully
* or return false.
* Should never be called with consumer that's not part of @uprobe->consumers.
*/
static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
struct uprobe_consumer **con;
bool ret = false;
down_write(&uprobe->consumer_rwsem);
for (con = &uprobe->consumers; *con; con = &(*con)->next) {
if (*con == uc) {
*con = uc->next;
ret = true;
break;
}
}
list_del_rcu(&uc->cons_node);
up_write(&uprobe->consumer_rwsem);
return ret;
}
static int __copy_insn(struct address_space *mapping, struct file *filp,
......@@ -863,21 +924,20 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
return ret;
}
static inline bool consumer_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static inline bool consumer_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
return !uc->filter || uc->filter(uc, ctx, mm);
return !uc->filter || uc->filter(uc, mm);
}
static bool filter_chain(struct uprobe *uprobe,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static bool filter_chain(struct uprobe *uprobe, struct mm_struct *mm)
{
struct uprobe_consumer *uc;
bool ret = false;
down_read(&uprobe->consumer_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
ret = consumer_filter(uc, ctx, mm);
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
ret = consumer_filter(uc, mm);
if (ret)
break;
}
......@@ -921,27 +981,6 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
return set_orig_insn(&uprobe->arch, mm, vaddr);
}
static inline bool uprobe_is_active(struct uprobe *uprobe)
{
return !RB_EMPTY_NODE(&uprobe->rb_node);
}
/*
* There could be threads that have already hit the breakpoint. They
* will recheck the current insn and restart if find_uprobe() fails.
* See find_active_uprobe().
*/
static void delete_uprobe(struct uprobe *uprobe)
{
if (WARN_ON(!uprobe_is_active(uprobe)))
return;
write_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_unlock(&uprobes_treelock);
RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
put_uprobe(uprobe);
}
struct map_info {
struct map_info *next;
struct mm_struct *mm;
......@@ -1046,7 +1085,13 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (err && is_register)
goto free;
/*
* We take mmap_lock for writing to avoid the race with
* find_active_uprobe_rcu() which takes mmap_lock for reading.
* Thus this install_breakpoint() can not make
* is_trap_at_addr() true right after find_uprobe_rcu()
* returns NULL in find_active_uprobe_rcu().
*/
mmap_write_lock(mm);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
......@@ -1059,12 +1104,10 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
if (is_register) {
/* consult only the "caller", new consumer. */
if (consumer_filter(new,
UPROBE_FILTER_REGISTER, mm))
if (consumer_filter(new, mm))
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
} else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
if (!filter_chain(uprobe,
UPROBE_FILTER_UNREGISTER, mm))
if (!filter_chain(uprobe, mm))
err |= remove_breakpoint(uprobe, mm, info->vaddr);
}
......@@ -1079,152 +1122,140 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
return err;
}
static void
__uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
/**
* uprobe_unregister_nosync - unregister an already registered probe.
* @uprobe: uprobe to remove
* @uc: identify which probe if multiple probes are colocated.
*/
void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
int err;
if (WARN_ON(!consumer_del(uprobe, uc)))
return;
down_write(&uprobe->register_rwsem);
consumer_del(uprobe, uc);
err = register_for_each_vma(uprobe, NULL);
/* TODO : cant unregister? schedule a worker thread */
if (!uprobe->consumers && !err)
delete_uprobe(uprobe);
}
/*
* uprobe_unregister - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
* @uc: identify which probe if multiple probes are colocated.
*/
void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
up_write(&uprobe->register_rwsem);
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
/* TODO : cant unregister? schedule a worker thread */
if (unlikely(err)) {
uprobe_warn(current, "unregister, leaking uprobe");
return;
}
down_write(&uprobe->register_rwsem);
__uprobe_unregister(uprobe, uc);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
}
EXPORT_SYMBOL_GPL(uprobe_unregister);
EXPORT_SYMBOL_GPL(uprobe_unregister_nosync);
/*
* __uprobe_register - register a probe
void uprobe_unregister_sync(void)
{
/*
* Now that handler_chain() and handle_uretprobe_chain() iterate over
* uprobe->consumers list under RCU protection without holding
* uprobe->register_rwsem, we need to wait for RCU grace period to
* make sure that we can't call into just unregistered
* uprobe_consumer's callbacks anymore. If we don't do that, fast and
* unlucky enough caller can free consumer's memory and cause
* handler_chain() or handle_uretprobe_chain() to do an use-after-free.
*/
synchronize_srcu(&uprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
/**
* uprobe_register - register a probe
* @inode: the file in which the probe has to be placed.
* @offset: offset from the start of the file.
* @ref_ctr_offset: offset of SDT marker / reference counter
* @uc: information on howto handle the probe..
*
* Apart from the access refcount, __uprobe_register() takes a creation
* Apart from the access refcount, uprobe_register() takes a creation
* refcount (thro alloc_uprobe) if and only if this @uprobe is getting
* inserted into the rbtree (i.e first consumer for a @inode:@offset
* tuple). Creation refcount stops uprobe_unregister from freeing the
* @uprobe even before the register operation is complete. Creation
* refcount is released when the last @uc for the @uprobe
* unregisters. Caller of __uprobe_register() is required to keep @inode
* unregisters. Caller of uprobe_register() is required to keep @inode
* (and the containing mount) referenced.
*
* Return errno if it cannot successully install probes
* else return 0 (success)
* Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/
static int __uprobe_register(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
struct uprobe *uprobe_register(struct inode *inode,
loff_t offset, loff_t ref_ctr_offset,
struct uprobe_consumer *uc)
{
struct uprobe *uprobe;
int ret;
/* Uprobe must have at least one set consumer */
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
if (!inode->i_mapping->a_ops->read_folio &&
!shmem_mapping(inode->i_mapping))
return -EIO;
return ERR_PTR(-EIO);
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
return -EINVAL;
return ERR_PTR(-EINVAL);
if (!IS_ALIGNED(ref_ctr_offset, sizeof(short)))
return -EINVAL;
return ERR_PTR(-EINVAL);
retry:
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset);
if (!uprobe)
return -ENOMEM;
if (IS_ERR(uprobe))
return PTR_ERR(uprobe);
return uprobe;
/*
* We can race with uprobe_unregister()->delete_uprobe().
* Check uprobe_is_active() and retry if it is false.
*/
down_write(&uprobe->register_rwsem);
ret = -EAGAIN;
if (likely(uprobe_is_active(uprobe))) {
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
if (ret)
__uprobe_unregister(uprobe, uc);
}
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
if (unlikely(ret == -EAGAIN))
goto retry;
return ret;
}
if (ret) {
uprobe_unregister_nosync(uprobe, uc);
/*
* Registration might have partially succeeded, so we can have
* this consumer being called right at this time. We need to
* sync here. It's ok, it's unlikely slow path.
*/
uprobe_unregister_sync();
return ERR_PTR(ret);
}
int uprobe_register(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, 0, uc);
return uprobe;
}
EXPORT_SYMBOL_GPL(uprobe_register);
int uprobe_register_refctr(struct inode *inode, loff_t offset,
loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{
return __uprobe_register(inode, offset, ref_ctr_offset, uc);
}
EXPORT_SYMBOL_GPL(uprobe_register_refctr);
/*
* uprobe_apply - unregister an already registered probe.
* @inode: the file in which the probe has to be removed.
* @offset: offset from the start of the file.
/**
* uprobe_apply - add or remove the breakpoints according to @uc->filter
* @uprobe: uprobe which "owns" the breakpoint
* @uc: consumer which wants to add more or remove some breakpoints
* @add: add or remove the breakpoints
* Return: 0 on success or negative error code.
*/
int uprobe_apply(struct inode *inode, loff_t offset,
struct uprobe_consumer *uc, bool add)
int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{
struct uprobe *uprobe;
struct uprobe_consumer *con;
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
if (WARN_ON(!uprobe))
return ret;
int ret = -ENOENT, srcu_idx;
down_write(&uprobe->register_rwsem);
for (con = uprobe->consumers; con && con != uc ; con = con->next)
;
if (con)
ret = register_for_each_vma(uprobe, add ? uc : NULL);
srcu_idx = srcu_read_lock(&uprobes_srcu);
list_for_each_entry_srcu(con, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
if (con == uc) {
ret = register_for_each_vma(uprobe, add ? uc : NULL);
break;
}
}
srcu_read_unlock(&uprobes_srcu, srcu_idx);
up_write(&uprobe->register_rwsem);
put_uprobe(uprobe);
return ret;
}
......@@ -1305,15 +1336,17 @@ static void build_probe_list(struct inode *inode,
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset < min)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
/* if uprobe went away, it's safe to ignore it */
if (try_get_uprobe(u))
list_add(&u->pending_list, head);
}
for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node);
if (u->inode != inode || u->offset > max)
break;
list_add(&u->pending_list, head);
get_uprobe(u);
/* if uprobe went away, it's safe to ignore it */
if (try_get_uprobe(u))
list_add(&u->pending_list, head);
}
}
read_unlock(&uprobes_treelock);
......@@ -1384,7 +1417,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
filter_chain(uprobe, vma->vm_mm)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
......@@ -1770,6 +1803,12 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return -ENOMEM;
*n = *o;
/*
* uprobe's refcnt has to be positive at this point, kept by
* utask->return_instances items; return_instances can't be
* removed right now, as task is blocked due to duping; so
* get_uprobe() is safe to use here.
*/
get_uprobe(n->uprobe);
n->next = NULL;
......@@ -1781,12 +1820,6 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
return 0;
}
static void uprobe_warn(struct task_struct *t, const char *msg)
{
pr_warn("uprobe: %s:%d failed to %s\n",
current->comm, current->pid, msg);
}
static void dup_xol_work(struct callback_head *work)
{
if (current->flags & PF_EXITING)
......@@ -1883,9 +1916,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
return;
}
/* we need to bump refcount to store uprobe in utask */
if (!try_get_uprobe(uprobe))
return;
ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
if (!ri)
return;
goto fail;
trampoline_vaddr = uprobe_get_trampoline_vaddr();
orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
......@@ -1912,8 +1949,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
}
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
ri->uprobe = get_uprobe(uprobe);
ri->uprobe = uprobe;
ri->func = instruction_pointer(regs);
ri->stack = user_stack_pointer(regs);
ri->orig_ret_vaddr = orig_ret_vaddr;
......@@ -1924,8 +1960,9 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
utask->return_instances = ri;
return;
fail:
fail:
kfree(ri);
put_uprobe(uprobe);
}
/* Prepare to single-step probed instruction out of line. */
......@@ -1940,9 +1977,14 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
if (!utask)
return -ENOMEM;
if (!try_get_uprobe(uprobe))
return -EINVAL;
xol_vaddr = xol_get_insn_slot(uprobe);
if (!xol_vaddr)
return -ENOMEM;
if (!xol_vaddr) {
err = -ENOMEM;
goto err_out;
}
utask->xol_vaddr = xol_vaddr;
utask->vaddr = bp_vaddr;
......@@ -1950,12 +1992,15 @@ pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
err = arch_uprobe_pre_xol(&uprobe->arch, regs);
if (unlikely(err)) {
xol_free_insn_slot(current);
return err;
goto err_out;
}
utask->active_uprobe = uprobe;
utask->state = UTASK_SSTEP;
return 0;
err_out:
put_uprobe(uprobe);
return err;
}
/*
......@@ -2028,13 +2073,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (likely(result == 0))
goto out;
/*
* The NULL 'tsk' here ensures that any faults that occur here
* will not be accounted to the task. 'mm' *is* current->mm,
* but we treat this as a 'remote' access since it is
* essentially a kernel access to the memory.
*/
result = get_user_pages_remote(mm, vaddr, 1, FOLL_FORCE, &page, NULL);
result = get_user_pages(vaddr, 1, FOLL_FORCE, &page);
if (result < 0)
return result;
......@@ -2045,7 +2084,8 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
struct uprobe *uprobe = NULL;
......@@ -2058,7 +2098,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
uprobe = find_uprobe_rcu(inode, offset);
}
if (!uprobe)
......@@ -2079,9 +2119,12 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
bool need_prep = false; /* prepare return uprobe, when needed */
bool has_consumers = false;
current->utask->auprobe = &uprobe->arch;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
int rc = 0;
if (uc->handler) {
......@@ -2094,16 +2137,24 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
need_prep = true;
remove &= rc;
has_consumers = true;
}
current->utask->auprobe = NULL;
if (need_prep && !remove)
prepare_uretprobe(uprobe, regs); /* put bp at return */
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
if (remove && has_consumers) {
down_read(&uprobe->register_rwsem);
/* re-check that removal is still required, this time under lock */
if (!filter_chain(uprobe, current->mm)) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
}
up_read(&uprobe->register_rwsem);
}
up_read(&uprobe->register_rwsem);
}
static void
......@@ -2111,13 +2162,15 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
{
struct uprobe *uprobe = ri->uprobe;
struct uprobe_consumer *uc;
int srcu_idx;
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
srcu_idx = srcu_read_lock(&uprobes_srcu);
list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node,
srcu_read_lock_held(&uprobes_srcu)) {
if (uc->ret_handler)
uc->ret_handler(uc, ri->func, regs);
}
up_read(&uprobe->register_rwsem);
srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
static struct return_instance *find_next_ret_chain(struct return_instance *ri)
......@@ -2202,13 +2255,15 @@ static void handle_swbp(struct pt_regs *regs)
{
struct uprobe *uprobe;
unsigned long bp_vaddr;
int is_swbp;
int is_swbp, srcu_idx;
bp_vaddr = uprobe_get_swbp_addr(regs);
if (bp_vaddr == uprobe_get_trampoline_vaddr())
return uprobe_handle_trampoline(regs);
uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
srcu_idx = srcu_read_lock(&uprobes_srcu);
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
......@@ -2224,7 +2279,7 @@ static void handle_swbp(struct pt_regs *regs)
*/
instruction_pointer_set(regs, bp_vaddr);
}
return;
goto out;
}
/* change it in advance for ->handler() and restart */
......@@ -2259,12 +2314,12 @@ static void handle_swbp(struct pt_regs *regs)
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
if (!pre_ssout(uprobe, regs, bp_vaddr))
return;
if (pre_ssout(uprobe, regs, bp_vaddr))
goto out;
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
out:
put_uprobe(uprobe);
/* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */
srcu_read_unlock(&uprobes_srcu, srcu_idx);
}
/*
......
......@@ -3160,6 +3160,7 @@ struct bpf_uprobe {
loff_t offset;
unsigned long ref_ctr_offset;
u64 cookie;
struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
......@@ -3178,15 +3179,15 @@ struct bpf_uprobe_multi_run_ctx {
struct bpf_uprobe *uprobe;
};
static void bpf_uprobe_unregister(struct path *path, struct bpf_uprobe *uprobes,
u32 cnt)
static void bpf_uprobe_unregister(struct bpf_uprobe *uprobes, u32 cnt)
{
u32 i;
for (i = 0; i < cnt; i++) {
uprobe_unregister(d_real_inode(path->dentry), uprobes[i].offset,
&uprobes[i].consumer);
}
for (i = 0; i < cnt; i++)
uprobe_unregister_nosync(uprobes[i].uprobe, &uprobes[i].consumer);
if (cnt)
uprobe_unregister_sync();
}
static void bpf_uprobe_multi_link_release(struct bpf_link *link)
......@@ -3194,7 +3195,7 @@ static void bpf_uprobe_multi_link_release(struct bpf_link *link)
struct bpf_uprobe_multi_link *umulti_link;
umulti_link = container_of(link, struct bpf_uprobe_multi_link, link);
bpf_uprobe_unregister(&umulti_link->path, umulti_link->uprobes, umulti_link->cnt);
bpf_uprobe_unregister(umulti_link->uprobes, umulti_link->cnt);
if (umulti_link->task)
put_task_struct(umulti_link->task);
path_put(&umulti_link->path);
......@@ -3322,8 +3323,7 @@ static int uprobe_prog_run(struct bpf_uprobe *uprobe,
}
static bool
uprobe_multi_link_filter(struct uprobe_consumer *con, enum uprobe_filter_ctx ctx,
struct mm_struct *mm)
uprobe_multi_link_filter(struct uprobe_consumer *con, struct mm_struct *mm)
{
struct bpf_uprobe *uprobe;
......@@ -3480,22 +3480,26 @@ int bpf_uprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *pr
&bpf_uprobe_multi_link_lops, prog);
for (i = 0; i < cnt; i++) {
err = uprobe_register_refctr(d_real_inode(link->path.dentry),
uprobes[i].offset,
uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (err) {
bpf_uprobe_unregister(&path, uprobes, i);
goto error_free;
uprobes[i].uprobe = uprobe_register(d_real_inode(link->path.dentry),
uprobes[i].offset,
uprobes[i].ref_ctr_offset,
&uprobes[i].consumer);
if (IS_ERR(uprobes[i].uprobe)) {
err = PTR_ERR(uprobes[i].uprobe);
link->cnt = i;
goto error_unregister;
}
}
err = bpf_link_prime(&link->link, &link_primer);
if (err)
goto error_free;
goto error_unregister;
return bpf_link_settle(&link_primer);
error_unregister:
bpf_uprobe_unregister(uprobes, link->cnt);
error_free:
kvfree(uprobes);
kfree(link);
......
......@@ -58,8 +58,8 @@ struct trace_uprobe {
struct dyn_event devent;
struct uprobe_consumer consumer;
struct path path;
struct inode *inode;
char *filename;
struct uprobe *uprobe;
unsigned long offset;
unsigned long ref_ctr_offset;
unsigned long nhit;
......@@ -1078,43 +1078,40 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e
return trace_handle_return(s);
}
typedef bool (*filter_func_t)(struct uprobe_consumer *self,
enum uprobe_filter_ctx ctx,
struct mm_struct *mm);
typedef bool (*filter_func_t)(struct uprobe_consumer *self, struct mm_struct *mm);
static int trace_uprobe_enable(struct trace_uprobe *tu, filter_func_t filter)
{
int ret;
struct inode *inode = d_real_inode(tu->path.dentry);
struct uprobe *uprobe;
tu->consumer.filter = filter;
tu->inode = d_real_inode(tu->path.dentry);
if (tu->ref_ctr_offset)
ret = uprobe_register_refctr(tu->inode, tu->offset,
tu->ref_ctr_offset, &tu->consumer);
else
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
uprobe = uprobe_register(inode, tu->offset, tu->ref_ctr_offset, &tu->consumer);
if (IS_ERR(uprobe))
return PTR_ERR(uprobe);
if (ret)
tu->inode = NULL;
return ret;
tu->uprobe = uprobe;
return 0;
}
static void __probe_event_disable(struct trace_probe *tp)
{
struct trace_uprobe *tu;
bool sync = false;
tu = container_of(tp, struct trace_uprobe, tp);
WARN_ON(!uprobe_filter_is_empty(tu->tp.event->filter));
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
if (!tu->inode)
if (!tu->uprobe)
continue;
uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
tu->inode = NULL;
uprobe_unregister_nosync(tu->uprobe, &tu->consumer);
sync = true;
tu->uprobe = NULL;
}
if (sync)
uprobe_unregister_sync();
}
static int probe_event_enable(struct trace_event_call *call,
......@@ -1310,7 +1307,7 @@ static int uprobe_perf_close(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
ret = uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
ret = uprobe_apply(tu->uprobe, &tu->consumer, false);
if (ret)
break;
}
......@@ -1334,7 +1331,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(tu, trace_probe_probe_list(tp), tp.list) {
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
err = uprobe_apply(tu->uprobe, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
break;
......@@ -1344,8 +1341,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
enum uprobe_filter_ctx ctx, struct mm_struct *mm)
static bool uprobe_perf_filter(struct uprobe_consumer *uc, struct mm_struct *mm)
{
struct trace_uprobe_filter *filter;
struct trace_uprobe *tu;
......@@ -1431,7 +1427,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs,
struct uprobe_cpu_buffer **ucbp)
{
if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
if (!uprobe_perf_filter(&tu->consumer, current->mm))
return UPROBE_HANDLER_REMOVE;
if (!is_ret_probe(tu))
......
......@@ -434,7 +434,7 @@ uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
struct testmod_uprobe {
struct path path;
loff_t offset;
struct uprobe *uprobe;
struct uprobe_consumer consumer;
};
......@@ -448,25 +448,25 @@ static int testmod_register_uprobe(loff_t offset)
{
int err = -EBUSY;
if (uprobe.offset)
if (uprobe.uprobe)
return -EBUSY;
mutex_lock(&testmod_uprobe_mutex);
if (uprobe.offset)
if (uprobe.uprobe)
goto out;
err = kern_path("/proc/self/exe", LOOKUP_FOLLOW, &uprobe.path);
if (err)
goto out;
err = uprobe_register_refctr(d_real_inode(uprobe.path.dentry),
offset, 0, &uprobe.consumer);
if (err)
uprobe.uprobe = uprobe_register(d_real_inode(uprobe.path.dentry),
offset, 0, &uprobe.consumer);
if (IS_ERR(uprobe.uprobe)) {
err = PTR_ERR(uprobe.uprobe);
path_put(&uprobe.path);
else
uprobe.offset = offset;
uprobe.uprobe = NULL;
}
out:
mutex_unlock(&testmod_uprobe_mutex);
return err;
......@@ -476,10 +476,11 @@ static void testmod_unregister_uprobe(void)
{
mutex_lock(&testmod_uprobe_mutex);
if (uprobe.offset) {
uprobe_unregister(d_real_inode(uprobe.path.dentry),
uprobe.offset, &uprobe.consumer);
uprobe.offset = 0;
if (uprobe.uprobe) {
uprobe_unregister_nosync(uprobe.uprobe, &uprobe.consumer);
uprobe_unregister_sync();
path_put(&uprobe.path);
uprobe.uprobe = NULL;
}
mutex_unlock(&testmod_uprobe_mutex);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment