Commit 17ca7fc2 authored by Linus Torvalds

Merge tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Combine perf and BPF for fast evaluation of HW breakpoint
   conditions

 - Add LBR capture support outside of hardware events

 - Trigger IO signals for watermark_wakeup

 - Add RAPL support for Intel Arrow Lake and Lunar Lake

 - Optimize frequency-throttling

 - Miscellaneous cleanups & fixes

* tag 'perf-core-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (21 commits)
  perf/bpf: Mark perf_event_set_bpf_handler() and perf_event_free_bpf_handler() as inline too
  selftests/perf_events: Test FASYNC with watermark wakeups
  perf/ring_buffer: Trigger IO signals for watermark_wakeup
  perf: Move perf_event_fasync() to perf_event.h
  perf/bpf: Change the !CONFIG_BPF_SYSCALL stubs to static inlines
  selftest/bpf: Test a perf BPF program that suppresses side effects
  perf/bpf: Allow a BPF program to suppress all sample side effects
  perf/bpf: Remove unneeded uses_default_overflow_handler()
  perf/bpf: Call BPF handler directly, not through overflow machinery
  perf/bpf: Remove #ifdef CONFIG_BPF_SYSCALL from struct perf_event members
  perf/bpf: Create bpf_overflow_handler() stub for !CONFIG_BPF_SYSCALL
  perf/bpf: Reorder bpf_overflow_handler() ahead of __perf_event_overflow()
  perf/x86/rapl: Add support for Intel Lunar Lake
  perf/x86/rapl: Add support for Intel Arrow Lake
  perf/core: Reduce PMU access to adjust sample freq
  perf/core: Optimize perf_adjust_freq_unthr_context()
  perf/x86/amd: Don't reject non-sampling events with configured LBR
  perf/x86/amd: Support capturing LBR from software events
  perf/x86/amd: Avoid taking branches before disabling LBR
  perf/x86/amd: Ensure amd_pmu_core_disable_all() is always inlined
  ...
parents 48fc82c4 854dd99b
...@@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp, ...@@ -626,7 +626,7 @@ int hw_breakpoint_arch_parse(struct perf_event *bp,
hw->address &= ~alignment_mask; hw->address &= ~alignment_mask;
hw->ctrl.len <<= offset; hw->ctrl.len <<= offset;
if (uses_default_overflow_handler(bp)) { if (is_default_overflow_handler(bp)) {
/* /*
* Mismatch breakpoints are required for single-stepping * Mismatch breakpoints are required for single-stepping
* breakpoints. * breakpoints.
...@@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, ...@@ -798,7 +798,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
* Otherwise, insert a temporary mismatch breakpoint so that * Otherwise, insert a temporary mismatch breakpoint so that
* we can single-step over the watchpoint trigger. * we can single-step over the watchpoint trigger.
*/ */
if (!uses_default_overflow_handler(wp)) if (!is_default_overflow_handler(wp))
continue; continue;
step: step:
enable_single_step(wp, instruction_pointer(regs)); enable_single_step(wp, instruction_pointer(regs));
...@@ -811,7 +811,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr, ...@@ -811,7 +811,7 @@ static void watchpoint_handler(unsigned long addr, unsigned int fsr,
info->trigger = addr; info->trigger = addr;
pr_debug("watchpoint fired: address = 0x%x\n", info->trigger); pr_debug("watchpoint fired: address = 0x%x\n", info->trigger);
perf_bp_event(wp, regs); perf_bp_event(wp, regs);
if (uses_default_overflow_handler(wp)) if (is_default_overflow_handler(wp))
enable_single_step(wp, instruction_pointer(regs)); enable_single_step(wp, instruction_pointer(regs));
} }
...@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs) ...@@ -886,7 +886,7 @@ static void breakpoint_handler(unsigned long unknown, struct pt_regs *regs)
info->trigger = addr; info->trigger = addr;
pr_debug("breakpoint fired: address = 0x%x\n", addr); pr_debug("breakpoint fired: address = 0x%x\n", addr);
perf_bp_event(bp, regs); perf_bp_event(bp, regs);
if (uses_default_overflow_handler(bp)) if (is_default_overflow_handler(bp))
enable_single_step(bp, addr); enable_single_step(bp, addr);
goto unlock; goto unlock;
} }
......
...@@ -655,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr, ...@@ -655,7 +655,7 @@ static int breakpoint_handler(unsigned long unused, unsigned long esr,
perf_bp_event(bp, regs); perf_bp_event(bp, regs);
/* Do we need to handle the stepping? */ /* Do we need to handle the stepping? */
if (uses_default_overflow_handler(bp)) if (is_default_overflow_handler(bp))
step = 1; step = 1;
unlock: unlock:
rcu_read_unlock(); rcu_read_unlock();
...@@ -734,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, ...@@ -734,7 +734,7 @@ static u64 get_distance_from_watchpoint(unsigned long addr, u64 val,
static int watchpoint_report(struct perf_event *wp, unsigned long addr, static int watchpoint_report(struct perf_event *wp, unsigned long addr,
struct pt_regs *regs) struct pt_regs *regs)
{ {
int step = uses_default_overflow_handler(wp); int step = is_default_overflow_handler(wp);
struct arch_hw_breakpoint *info = counter_arch_bp(wp); struct arch_hw_breakpoint *info = counter_arch_bp(wp);
info->trigger = addr; info->trigger = addr;
......
...@@ -647,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu) ...@@ -647,7 +647,7 @@ static void amd_pmu_cpu_dead(int cpu)
} }
} }
static inline void amd_pmu_set_global_ctl(u64 ctl) static __always_inline void amd_pmu_set_global_ctl(u64 ctl)
{ {
wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl); wrmsrl(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, ctl);
} }
...@@ -907,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) ...@@ -907,6 +907,37 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
return amd_pmu_adjust_nmi_window(handled); return amd_pmu_adjust_nmi_window(handled);
} }
/*
* AMD-specific callback invoked through perf_snapshot_branch_stack static
* call, defined in include/linux/perf_event.h. See its definition for API
* details. It's up to caller to provide enough space in *entries* to fit all
* LBR records, otherwise returned result will be truncated to *cnt* entries.
*
* @entries: caller-provided buffer that receives the copied branch records.
* @cnt: capacity of @entries, counted in struct perf_branch_entry elements.
*
* Returns the number of entries copied, i.e. min(@cnt, x86_pmu.lbr_nr).
*/
static int amd_pmu_v2_snapshot_branch_stack(struct perf_branch_entry *entries, unsigned int cnt)
{
struct cpu_hw_events *cpuc;
unsigned long flags;
/*
* The sequence of steps to freeze LBR should be completely inlined
* and contain no branches to minimize contamination of LBR snapshot
*/
local_irq_save(flags);
amd_pmu_core_disable_all();
__amd_pmu_lbr_disable();
/* Everything below runs after LBR was disabled by the calls above. */
cpuc = this_cpu_ptr(&cpu_hw_events);
amd_pmu_lbr_read();
/* Never copy more records than x86_pmu.lbr_nr, nor more than the caller asked for. */
cnt = min(cnt, x86_pmu.lbr_nr);
memcpy(entries, cpuc->lbr_entries, sizeof(struct perf_branch_entry) * cnt);
/* Undo the disable sequence before interrupts are restored. */
amd_pmu_v2_enable_all(0);
local_irq_restore(flags);
return cnt;
}
static int amd_pmu_v2_handle_irq(struct pt_regs *regs) static int amd_pmu_v2_handle_irq(struct pt_regs *regs)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
...@@ -1443,6 +1474,10 @@ static int __init amd_core_pmu_init(void) ...@@ -1443,6 +1474,10 @@ static int __init amd_core_pmu_init(void)
static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset); static_call_update(amd_pmu_branch_reset, amd_pmu_lbr_reset);
static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add); static_call_update(amd_pmu_branch_add, amd_pmu_lbr_add);
static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del); static_call_update(amd_pmu_branch_del, amd_pmu_lbr_del);
/* Only support branch_stack snapshot on perfmon v2 */
if (x86_pmu.handle_irq == amd_pmu_v2_handle_irq)
static_call_update(perf_snapshot_branch_stack, amd_pmu_v2_snapshot_branch_stack);
} else if (!amd_brs_init()) { } else if (!amd_brs_init()) {
/* /*
* BRS requires special event constraints and flushing on ctxsw. * BRS requires special event constraints and flushing on ctxsw.
......
...@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event) ...@@ -310,10 +310,6 @@ int amd_pmu_lbr_hw_config(struct perf_event *event)
{ {
int ret = 0; int ret = 0;
/* LBR is not recommended in counting mode */
if (!is_sampling_event(event))
return -EINVAL;
ret = amd_pmu_lbr_setup_filter(event); ret = amd_pmu_lbr_setup_filter(event);
if (!ret) if (!ret)
event->attach_state |= PERF_ATTACH_SCHED_CB; event->attach_state |= PERF_ATTACH_SCHED_CB;
...@@ -414,18 +410,11 @@ void amd_pmu_lbr_enable_all(void) ...@@ -414,18 +410,11 @@ void amd_pmu_lbr_enable_all(void)
void amd_pmu_lbr_disable_all(void) void amd_pmu_lbr_disable_all(void)
{ {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
u64 dbg_ctl, dbg_extn_cfg;
if (!cpuc->lbr_users || !x86_pmu.lbr_nr) if (!cpuc->lbr_users || !x86_pmu.lbr_nr)
return; return;
rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); __amd_pmu_lbr_disable();
wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
}
} }
__init int amd_pmu_lbr_init(void) __init int amd_pmu_lbr_init(void)
......
...@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void); ...@@ -1329,6 +1329,19 @@ void amd_pmu_lbr_enable_all(void);
void amd_pmu_lbr_disable_all(void); void amd_pmu_lbr_disable_all(void);
int amd_pmu_lbr_hw_config(struct perf_event *event); int amd_pmu_lbr_hw_config(struct perf_event *event);
/*
* Clear DBG_EXTN_CFG_LBRV2EN in MSR_AMD_DBG_EXTN_CFG and, when the
* X86_FEATURE_AMD_LBR_PMC_FREEZE feature is enabled, also clear
* DEBUGCTLMSR_FREEZE_LBRS_ON_PMI in MSR_IA32_DEBUGCTLMSR.
*
* Unlike amd_pmu_lbr_disable_all(), this helper performs no lbr_users or
* lbr_nr checks; callers are responsible for any such gating.
* NOTE(review): presumably __always_inline so that the snapshot path can
* keep its LBR freeze sequence free of calls -- confirm against callers.
*/
static __always_inline void __amd_pmu_lbr_disable(void)
{
u64 dbg_ctl, dbg_extn_cfg;
/* Read-modify-write: only the LBRV2EN bit is cleared. */
rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg);
wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN);
if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) {
/* Read-modify-write: only the FREEZE_LBRS_ON_PMI bit is cleared. */
rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl);
wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
}
}
#ifdef CONFIG_PERF_EVENTS_AMD_BRS #ifdef CONFIG_PERF_EVENTS_AMD_BRS
#define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */ #define AMD_FAM19H_BRS_EVENT 0xc4 /* RETIRED_TAKEN_BRANCH_INSTRUCTIONS */
......
...@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = { ...@@ -675,10 +675,8 @@ static const struct attribute_group *rapl_attr_update[] = {
static int __init init_rapl_pmus(void) static int __init init_rapl_pmus(void)
{ {
int maxdie = topology_max_packages() * topology_max_dies_per_package(); int maxdie = topology_max_packages() * topology_max_dies_per_package();
size_t size;
size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *); rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, maxdie), GFP_KERNEL);
rapl_pmus = kzalloc(size, GFP_KERNEL);
if (!rapl_pmus) if (!rapl_pmus)
return -ENOMEM; return -ENOMEM;
...@@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { ...@@ -808,6 +806,9 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE_H, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(ARROWLAKE, &model_skl),
X86_MATCH_INTEL_FAM6_MODEL(LUNARLAKE_M, &model_skl),
{}, {},
}; };
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);
......
...@@ -809,11 +809,8 @@ struct perf_event { ...@@ -809,11 +809,8 @@ struct perf_event {
u64 (*clock)(void); u64 (*clock)(void);
perf_overflow_handler_t overflow_handler; perf_overflow_handler_t overflow_handler;
void *overflow_handler_context; void *overflow_handler_context;
#ifdef CONFIG_BPF_SYSCALL
perf_overflow_handler_t orig_overflow_handler;
struct bpf_prog *prog; struct bpf_prog *prog;
u64 bpf_cookie; u64 bpf_cookie;
#endif
#ifdef CONFIG_EVENT_TRACING #ifdef CONFIG_EVENT_TRACING
struct trace_event_call *tp_event; struct trace_event_call *tp_event;
...@@ -883,6 +880,7 @@ struct perf_event_pmu_context { ...@@ -883,6 +880,7 @@ struct perf_event_pmu_context {
unsigned int nr_events; unsigned int nr_events;
unsigned int nr_cgroups; unsigned int nr_cgroups;
unsigned int nr_freq;
atomic_t refcount; /* event <-> epc */ atomic_t refcount; /* event <-> epc */
struct rcu_head rcu_head; struct rcu_head rcu_head;
...@@ -897,6 +895,11 @@ struct perf_event_pmu_context { ...@@ -897,6 +895,11 @@ struct perf_event_pmu_context {
int rotate_necessary; int rotate_necessary;
}; };
static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
}
struct perf_event_groups { struct perf_event_groups {
struct rb_root tree; struct rb_root tree;
u64 index; u64 index;
...@@ -1342,8 +1345,10 @@ extern int perf_event_output(struct perf_event *event, ...@@ -1342,8 +1345,10 @@ extern int perf_event_output(struct perf_event *event,
struct pt_regs *regs); struct pt_regs *regs);
static inline bool static inline bool
__is_default_overflow_handler(perf_overflow_handler_t overflow_handler) is_default_overflow_handler(struct perf_event *event)
{ {
perf_overflow_handler_t overflow_handler = event->overflow_handler;
if (likely(overflow_handler == perf_event_output_forward)) if (likely(overflow_handler == perf_event_output_forward))
return true; return true;
if (unlikely(overflow_handler == perf_event_output_backward)) if (unlikely(overflow_handler == perf_event_output_backward))
...@@ -1351,22 +1356,6 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler) ...@@ -1351,22 +1356,6 @@ __is_default_overflow_handler(perf_overflow_handler_t overflow_handler)
return false; return false;
} }
#define is_default_overflow_handler(event) \
__is_default_overflow_handler((event)->overflow_handler)
#ifdef CONFIG_BPF_SYSCALL
static inline bool uses_default_overflow_handler(struct perf_event *event)
{
if (likely(is_default_overflow_handler(event)))
return true;
return __is_default_overflow_handler(event->orig_overflow_handler);
}
#else
#define uses_default_overflow_handler(event) \
is_default_overflow_handler(event)
#endif
extern void extern void
perf_event_header__init_id(struct perf_event_header *header, perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data, struct perf_sample_data *data,
...@@ -1697,6 +1686,14 @@ perf_event_addr_filters(struct perf_event *event) ...@@ -1697,6 +1686,14 @@ perf_event_addr_filters(struct perf_event *event)
return ifh; return ifh;
} }
/*
 * Return the address of the fasync pointer that applies to @event.
 * Only the parent has fasync state, so an event with a parent is
 * redirected to that parent; otherwise the event's own field is used.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
	struct perf_event *owner = event->parent ? event->parent : event;

	return &owner->fasync;
}
extern void perf_event_addr_filters_sync(struct perf_event *event); extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id); extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);
......
...@@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx) ...@@ -2302,8 +2302,10 @@ event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event)) if (!is_software_event(event))
cpc->active_oncpu--; cpc->active_oncpu--;
if (event->attr.freq && event->attr.sample_freq) if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq--; ctx->nr_freq--;
epc->nr_freq--;
}
if (event->attr.exclusive || !cpc->active_oncpu) if (event->attr.exclusive || !cpc->active_oncpu)
cpc->exclusive = 0; cpc->exclusive = 0;
...@@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx) ...@@ -2558,9 +2560,10 @@ event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
if (!is_software_event(event)) if (!is_software_event(event))
cpc->active_oncpu++; cpc->active_oncpu++;
if (event->attr.freq && event->attr.sample_freq) if (event->attr.freq && event->attr.sample_freq) {
ctx->nr_freq++; ctx->nr_freq++;
epc->nr_freq++;
}
if (event->attr.exclusive) if (event->attr.exclusive)
cpc->exclusive = 1; cpc->exclusive = 1;
...@@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo ...@@ -4123,30 +4126,14 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bo
} }
} }
/* static void perf_adjust_freq_unthr_events(struct list_head *event_list)
* combine freq adjustment with unthrottling to avoid two passes over the
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
static void
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{ {
struct perf_event *event; struct perf_event *event;
struct hw_perf_event *hwc; struct hw_perf_event *hwc;
u64 now, period = TICK_NSEC; u64 now, period = TICK_NSEC;
s64 delta; s64 delta;
/* list_for_each_entry(event, event_list, active_list) {
* only need to iterate over all events iff:
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE) if (event->state != PERF_EVENT_STATE_ACTIVE)
continue; continue;
...@@ -4154,18 +4141,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) ...@@ -4154,18 +4141,17 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
if (!event_filter_match(event)) if (!event_filter_match(event))
continue; continue;
perf_pmu_disable(event->pmu);
hwc = &event->hw; hwc = &event->hw;
if (hwc->interrupts == MAX_INTERRUPTS) { if (hwc->interrupts == MAX_INTERRUPTS) {
hwc->interrupts = 0; hwc->interrupts = 0;
perf_log_throttle(event, 1); perf_log_throttle(event, 1);
if (!event->attr.freq || !event->attr.sample_freq)
event->pmu->start(event, 0); event->pmu->start(event, 0);
} }
if (!event->attr.freq || !event->attr.sample_freq) if (!event->attr.freq || !event->attr.sample_freq)
goto next; continue;
/* /*
* stop the event and update event->count * stop the event and update event->count
...@@ -4187,8 +4173,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle) ...@@ -4187,8 +4173,41 @@ perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
perf_adjust_period(event, period, delta, false); perf_adjust_period(event, period, delta, false);
event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
next: }
perf_pmu_enable(event->pmu); }
/*
* combine freq adjustment with unthrottling to avoid two passes over the
* events. At the same time, make sure, having freq events does not change
* the rate of unthrottling as that would introduce bias.
*/
static void
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
struct perf_event_pmu_context *pmu_ctx;
/*
* only need to iterate over all events iff:
* - context have events in frequency mode (needs freq adjust)
* - there are events to unthrottle on this cpu
*/
if (!(ctx->nr_freq || unthrottle))
return;
raw_spin_lock(&ctx->lock);
list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
if (!(pmu_ctx->nr_freq || unthrottle))
continue;
if (!perf_pmu_ctx_is_active(pmu_ctx))
continue;
if (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT)
continue;
perf_pmu_disable(pmu_ctx->pmu);
perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
perf_pmu_enable(pmu_ctx->pmu);
} }
raw_spin_unlock(&ctx->lock); raw_spin_unlock(&ctx->lock);
...@@ -6684,14 +6703,6 @@ static const struct file_operations perf_fops = { ...@@ -6684,14 +6703,6 @@ static const struct file_operations perf_fops = {
* to user-space before waking everybody up. * to user-space before waking everybody up.
*/ */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
/* only the parent has fasync state */
if (event->parent)
event = event->parent;
return &event->fasync;
}
void perf_event_wakeup(struct perf_event *event) void perf_event_wakeup(struct perf_event *event)
{ {
ring_buffer_wakeup(event); ring_buffer_wakeup(event);
...@@ -9544,6 +9555,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r ...@@ -9544,6 +9555,100 @@ static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *r
return true; return true;
} }
#ifdef CONFIG_BPF_SYSCALL
/*
* Run the BPF program attached to @event for one overflow.
*
* @event: event that overflowed; its ->prog is the program to run.
* @data: sample data, prepared via perf_prepare_sample() before the run.
* @regs: register state, converted with perf_arch_bpf_user_pt_regs().
*
* Returns the value returned by bpf_prog_run(), or 0 when the program was
* not run (the per-CPU bpf_prog_active counter was already non-zero, or
* event->prog was NULL when read). __perf_event_overflow() treats a zero
* return as "skip the rest of overflow handling for this sample".
*/
static int bpf_overflow_handler(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct bpf_perf_event_data_kern ctx = {
.data = data,
.event = event,
};
struct bpf_prog *prog;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
/* Bail out unless this is the only active user of bpf_prog_active on this CPU. */
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
/* Single read of the pointer; it may be NULL if no program is attached. */
prog = READ_ONCE(event->prog);
if (prog) {
perf_prepare_sample(data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
rcu_read_unlock();
out:
/* Balances the increment above on every path, including the early exit. */
__this_cpu_dec(bpf_prog_active);
return ret;
}
/*
 * Attach BPF program @prog (with @bpf_cookie) to @event.
 *
 * Returns 0 on success, or:
 *   -EINVAL  @event has an overflow_handler_context, or @prog is not of
 *            type BPF_PROG_TYPE_PERF_EVENT;
 *   -EEXIST  @event already has a program attached;
 *   -EPROTO  @event uses precise_ip and @prog calls bpf_get_[stack|stackid]
 *            but the event does not sample a full callchain.
 */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
					     struct bpf_prog *prog,
					     u64 bpf_cookie)
{
	bool full_callchain;

	/* hw breakpoint or kernel counter */
	if (event->overflow_handler_context)
		return -EINVAL;

	if (event->prog)
		return -EEXIST;

	if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
		return -EINVAL;

	full_callchain = (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
			 !event->attr.exclude_callchain_kernel &&
			 !event->attr.exclude_callchain_user;

	/*
	 * On perf_event with precise_ip, calling bpf_get_stack()
	 * may trigger unwinder warnings and occasional crashes.
	 * bpf_get_[stack|stackid] works around this issue by using
	 * callchain attached to perf_sample_data. If the
	 * perf_event does not have a full (kernel and user) callchain
	 * attached to perf_sample_data, do not allow attaching a BPF
	 * program that calls bpf_get_[stack|stackid].
	 */
	if (event->attr.precise_ip && prog->call_get_stack && !full_callchain)
		return -EPROTO;

	event->prog = prog;
	event->bpf_cookie = bpf_cookie;

	return 0;
}
/*
 * Detach the BPF program from @event, if one is attached: clear
 * event->prog first, then drop the reference with bpf_prog_put().
 * Does nothing when no program is attached.
 */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
	struct bpf_prog *attached = event->prog;

	if (attached) {
		event->prog = NULL;
		bpf_prog_put(attached);
	}
}
#else
/*
* Stubs used when CONFIG_BPF_SYSCALL is not set.
*/

/*
* Always returns 1. __perf_event_overflow() only skips the rest of overflow
* handling on a zero return, so this stub never suppresses a sample.
*/
static inline int bpf_overflow_handler(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
return 1;
}
/* Attaching a BPF program is not available in this configuration. */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
struct bpf_prog *prog,
u64 bpf_cookie)
{
return -EOPNOTSUPP;
}
/* Nothing can have been attached, so there is nothing to release. */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif
/* /*
* Generic event overflow handling, sampling. * Generic event overflow handling, sampling.
*/ */
...@@ -9564,6 +9669,9 @@ static int __perf_event_overflow(struct perf_event *event, ...@@ -9564,6 +9669,9 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle); ret = __perf_event_account_interrupt(event, throttle);
if (event->prog && !bpf_overflow_handler(event, data, regs))
return ret;
/* /*
* XXX event_limit might not quite work as expected on inherited * XXX event_limit might not quite work as expected on inherited
* events * events
...@@ -10422,97 +10530,6 @@ static void perf_event_free_filter(struct perf_event *event) ...@@ -10422,97 +10530,6 @@ static void perf_event_free_filter(struct perf_event *event)
ftrace_profile_free_filter(event); ftrace_profile_free_filter(event);
} }
#ifdef CONFIG_BPF_SYSCALL
static void bpf_overflow_handler(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
struct bpf_perf_event_data_kern ctx = {
.data = data,
.event = event,
};
struct bpf_prog *prog;
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
prog = READ_ONCE(event->prog);
if (prog) {
perf_prepare_sample(data, event, regs);
ret = bpf_prog_run(prog, &ctx);
}
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
if (!ret)
return;
event->orig_overflow_handler(event, data, regs);
}
static int perf_event_set_bpf_handler(struct perf_event *event,
struct bpf_prog *prog,
u64 bpf_cookie)
{
if (event->overflow_handler_context)
/* hw breakpoint or kernel counter */
return -EINVAL;
if (event->prog)
return -EEXIST;
if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
return -EINVAL;
if (event->attr.precise_ip &&
prog->call_get_stack &&
(!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) ||
event->attr.exclude_callchain_kernel ||
event->attr.exclude_callchain_user)) {
/*
* On perf_event with precise_ip, calling bpf_get_stack()
* may trigger unwinder warnings and occasional crashes.
* bpf_get_[stack|stackid] works around this issue by using
* callchain attached to perf_sample_data. If the
* perf_event does not full (kernel and user) callchain
* attached to perf_sample_data, do not allow attaching BPF
* program that calls bpf_get_[stack|stackid].
*/
return -EPROTO;
}
event->prog = prog;
event->bpf_cookie = bpf_cookie;
event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
return 0;
}
static void perf_event_free_bpf_handler(struct perf_event *event)
{
struct bpf_prog *prog = event->prog;
if (!prog)
return;
WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
event->prog = NULL;
bpf_prog_put(prog);
}
#else
static int perf_event_set_bpf_handler(struct perf_event *event,
struct bpf_prog *prog,
u64 bpf_cookie)
{
return -EOPNOTSUPP;
}
static void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif
/* /*
* returns true if the event is a tracepoint, or a kprobe/uprobe created * returns true if the event is a tracepoint, or a kprobe/uprobe created
* with perf_event_open() * with perf_event_open()
...@@ -11971,13 +11988,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, ...@@ -11971,13 +11988,11 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
overflow_handler = parent_event->overflow_handler; overflow_handler = parent_event->overflow_handler;
context = parent_event->overflow_handler_context; context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
if (overflow_handler == bpf_overflow_handler) { if (parent_event->prog) {
struct bpf_prog *prog = parent_event->prog; struct bpf_prog *prog = parent_event->prog;
bpf_prog_inc(prog); bpf_prog_inc(prog);
event->prog = prog; event->prog = prog;
event->orig_overflow_handler =
parent_event->orig_overflow_handler;
} }
#endif #endif
} }
......
...@@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle) ...@@ -22,6 +22,10 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
atomic_set(&handle->rb->poll, EPOLLIN); atomic_set(&handle->rb->poll, EPOLLIN);
handle->event->pending_wakeup = 1; handle->event->pending_wakeup = 1;
if (*perf_event_fasync(handle->event) && !handle->event->pending_kill)
handle->event->pending_kill = POLL_IN;
irq_work_queue(&handle->event->pending_irq); irq_work_queue(&handle->event->pending_irq);
} }
......
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <test_progs.h>
#include "test_perf_skip.skel.h"
#include <linux/compiler.h>
#include <linux/hw_breakpoint.h>
#include <sys/mman.h>
#ifndef TRAP_PERF
#define TRAP_PERF 6
#endif
int sigio_count, sigtrap_count;
/* SIGIO handler: records one more delivery in sigio_count. */
static void handle_sigio(int sig __always_unused)
{
	sigio_count += 1;
}
/*
 * SIGTRAP handler (SA_SIGINFO form): checks that the signal carries the
 * TRAP_PERF si_code and records one more delivery in sigtrap_count.
 */
static void handle_sigtrap(int signum __always_unused,
			   siginfo_t *info,
			   void *ucontext __always_unused)
{
	ASSERT_EQ(info->si_code, TRAP_PERF, "si_code");
	sigtrap_count += 1;
}
/*
* Breakpoint target: serial_test_perf_skip() places an execute breakpoint
* on this function's address and then calls it. Always returns 0.
* NOTE(review): noinline plus the empty asm volatile statement presumably
* keep a real, callable copy of the function in the binary -- confirm.
*/
static noinline int test_function(void)
{
asm volatile ("");
return 0;
}
/*
* Check that a BPF program attached to a perf event can suppress the
* signals a sample would otherwise generate.
*
* The test opens an execute breakpoint on test_function() with sigtrap set
* and O_ASYNC enabled, attaches the skeleton's "handler" program, and calls
* test_function() three times:
*   1. with skel->bss->ip set to test_function: no SIGIO/SIGTRAP expected;
*   2. with skel->bss->ip cleared: exactly one SIGIO and one SIGTRAP expected;
*   3. once more: counts must stay at one, because PERF_EVENT_IOC_REFRESH
*      was armed with a count of 1.
*/
void serial_test_perf_skip(void)
{
struct sigaction action = {};
struct sigaction previous_sigtrap;
/* SIG_ERR doubles as "SIGIO handler not replaced yet" for the cleanup path. */
sighandler_t previous_sigio = SIG_ERR;
struct test_perf_skip *skel = NULL;
struct perf_event_attr attr = {};
int perf_fd = -1;
int err;
struct f_owner_ex owner;
struct bpf_link *prog_link = NULL;
/* Install the signal handlers first so nothing is delivered unhandled. */
action.sa_flags = SA_SIGINFO | SA_NODEFER;
action.sa_sigaction = handle_sigtrap;
sigemptyset(&action.sa_mask);
if (!ASSERT_OK(sigaction(SIGTRAP, &action, &previous_sigtrap), "sigaction"))
return;
previous_sigio = signal(SIGIO, handle_sigio);
if (!ASSERT_NEQ(previous_sigio, SIG_ERR, "signal"))
goto cleanup;
skel = test_perf_skip__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel_load"))
goto cleanup;
/* Execute breakpoint on test_function(), sampling every hit. */
attr.type = PERF_TYPE_BREAKPOINT;
attr.size = sizeof(attr);
attr.bp_type = HW_BREAKPOINT_X;
attr.bp_addr = (uintptr_t)test_function;
attr.bp_len = sizeof(long);
attr.sample_period = 1;
attr.sample_type = PERF_SAMPLE_IP;
attr.pinned = 1;
attr.exclude_kernel = 1;
attr.exclude_hv = 1;
attr.precise_ip = 3;
attr.sigtrap = 1;
attr.remove_on_exec = 1;
perf_fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
/* ENOENT/EOPNOTSUPP are treated as "not supported here": skip, don't fail. */
if (perf_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) {
printf("SKIP:no PERF_TYPE_BREAKPOINT/HW_BREAKPOINT_X\n");
test__skip();
goto cleanup;
}
if (!ASSERT_OK(perf_fd < 0, "perf_event_open"))
goto cleanup;
/* Configure the perf event to signal on sample. */
err = fcntl(perf_fd, F_SETFL, O_ASYNC);
if (!ASSERT_OK(err, "fcntl(F_SETFL, O_ASYNC)"))
goto cleanup;
/* Direct the signal at this thread specifically. */
owner.type = F_OWNER_TID;
owner.pid = syscall(__NR_gettid);
err = fcntl(perf_fd, F_SETOWN_EX, &owner);
if (!ASSERT_OK(err, "fcntl(F_SETOWN_EX)"))
goto cleanup;
/* Allow at most one sample. A sample rejected by bpf should
* not count against this.
*/
err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 1);
if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
goto cleanup;
prog_link = bpf_program__attach_perf_event(skel->progs.handler, perf_fd);
if (!ASSERT_OK_PTR(prog_link, "bpf_program__attach_perf_event"))
goto cleanup;
/* Configure the bpf program to suppress the sample. */
skel->bss->ip = (uintptr_t)test_function;
test_function();
ASSERT_EQ(sigio_count, 0, "sigio_count");
ASSERT_EQ(sigtrap_count, 0, "sigtrap_count");
/* Configure the bpf program to allow the sample. */
skel->bss->ip = 0;
test_function();
ASSERT_EQ(sigio_count, 1, "sigio_count");
ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
/* Test that the sample above is the only one allowed (by perf, not
* by bpf)
*/
test_function();
ASSERT_EQ(sigio_count, 1, "sigio_count");
ASSERT_EQ(sigtrap_count, 1, "sigtrap_count");
cleanup:
/* Release in reverse order of acquisition; each step tolerates "never set up". */
bpf_link__destroy(prog_link);
if (perf_fd >= 0)
close(perf_fd);
test_perf_skip__destroy(skel);
if (previous_sigio != SIG_ERR)
signal(SIGIO, previous_sigio);
sigaction(SIGTRAP, &previous_sigtrap, NULL);
}
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
uintptr_t ip;
/*
 * perf_event program: returns 0 when the sampled instruction pointer
 * equals the global `ip`, and 1 for every other sample.
 */
SEC("perf_event")
int handler(struct bpf_perf_event_data *data)
{
	/* Skip events that have the correct ip. */
	if (PT_REGS_IP(&data->regs) == ip)
		return 0;

	return 1;
}
char _license[] SEC("license") = "GPL";
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
sigtrap_threads sigtrap_threads
remove_on_exec remove_on_exec
watermark_signal
...@@ -2,5 +2,5 @@ ...@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES) CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread LDFLAGS += -lpthread
TEST_GEN_PROGS := sigtrap_threads remove_on_exec TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
include ../lib.mk include ../lib.mk
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/perf_event.h>
#include <stddef.h>
#include <sched.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#define __maybe_unused __attribute__((__unused__))
static int sigio_count;
/* SIGIO handler (three-argument form): counts deliveries in sigio_count. */
static void handle_sigio(int signum __maybe_unused,
			 siginfo_t *oh __maybe_unused,
			 void *uc __maybe_unused)
{
	sigio_count += 1;
}
/*
 * Body of the forked child: stop itself so the parent can set up, then
 * call sleep(1) twenty times, stop itself again and finally exit(0).
 * Never returns to the caller.
 */
static void do_child(void)
{
	int remaining = 20;

	raise(SIGSTOP);

	while (remaining-- > 0)
		sleep(1);

	raise(SIGSTOP);
	exit(0);
}
TEST(watermark_signal)
{
struct perf_event_attr attr;
struct perf_event_mmap_page *p = NULL;
struct sigaction previous_sigio, sigio = { 0 };
pid_t child = -1;
int child_status;
int fd = -1;
long page_size = sysconf(_SC_PAGE_SIZE);
sigio.sa_sigaction = handle_sigio;
EXPECT_EQ(sigaction(SIGIO, &sigio, &previous_sigio), 0);
memset(&attr, 0, sizeof(attr));
attr.size = sizeof(attr);
attr.type = PERF_TYPE_SOFTWARE;
attr.config = PERF_COUNT_SW_DUMMY;
attr.sample_period = 1;
attr.disabled = 1;
attr.watermark = 1;
attr.context_switch = 1;
attr.wakeup_watermark = 1;
child = fork();
EXPECT_GE(child, 0);
if (child == 0)
do_child();
else if (child < 0) {
perror("fork()");
goto cleanup;
}
if (waitpid(child, &child_status, WSTOPPED) != child ||
!(WIFSTOPPED(child_status) && WSTOPSIG(child_status) == SIGSTOP)) {
fprintf(stderr,
"failed to sycnhronize with child errno=%d status=%x\n",
errno,
child_status);
goto cleanup;
}
fd = syscall(__NR_perf_event_open, &attr, child, -1, -1,
PERF_FLAG_FD_CLOEXEC);
if (fd < 0) {
fprintf(stderr, "failed opening event %llx\n", attr.config);
goto cleanup;
}
if (fcntl(fd, F_SETFL, FASYNC)) {
perror("F_SETFL FASYNC");
goto cleanup;
}
if (fcntl(fd, F_SETOWN, getpid())) {
perror("F_SETOWN getpid()");
goto cleanup;
}
if (fcntl(fd, F_SETSIG, SIGIO)) {
perror("F_SETSIG SIGIO");
goto cleanup;
}
p = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
if (p == NULL) {
perror("mmap");
goto cleanup;
}
if (ioctl(fd, PERF_EVENT_IOC_ENABLE, 0)) {
perror("PERF_EVENT_IOC_ENABLE");
goto cleanup;
}
if (kill(child, SIGCONT) < 0) {
perror("SIGCONT");
goto cleanup;
}
if (waitpid(child, &child_status, WSTOPPED) != -1 || errno != EINTR)
fprintf(stderr,
"expected SIGIO to terminate wait errno=%d status=%x\n%d",
errno,
child_status,
sigio_count);
EXPECT_GE(sigio_count, 1);
cleanup:
if (p != NULL)
munmap(p, 2 * page_size);
if (fd >= 0)
close(fd);
if (child > 0) {
kill(child, SIGKILL);
waitpid(child, NULL, 0);
}
sigaction(SIGIO, &previous_sigio, NULL);
}
TEST_HARNESS_MAIN
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment