Commit 63e6053a authored by Linus Torvalds

Merge tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events updates from Ingo Molnar:

 - Fix Intel Alder Lake PEBS memory access latency & data source
   profiling info bugs.

 - Use Intel large-PEBS hardware feature in more circumstances, to
   reduce PMI overhead & reduce sampling data.

 - Extend the lost-sample profiling output with the PERF_FORMAT_LOST ABI
   variant, which tells tooling the exact number of samples lost (a short
   usage sketch follows this list).

 - Add new IBS register bits definitions.

 - AMD uncore events: Add PerfMonV2 DF (Data Fabric) enhancements.
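
A minimal userspace sketch of the PERF_FORMAT_LOST usage mentioned above. It is
illustrative only and not part of this merge: the perf_event_open() wrapper, the
fallback #define, and the choice of event/period are assumptions. The three-word
read layout { value, id, lost } follows the updated comment in
include/uapi/linux/perf_event.h further down in this diff.

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef PERF_FORMAT_LOST
#define PERF_FORMAT_LOST (1U << 4)	/* new ABI bit, see enum perf_event_read_format below */
#endif

/* Thin wrapper; glibc does not provide perf_event_open(). */
static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
			    int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.size		= sizeof(attr),
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.sample_period	= 100000,
		.sample_type	= PERF_SAMPLE_IP,
		.read_format	= PERF_FORMAT_ID | PERF_FORMAT_LOST,
		.disabled	= 1,
		.exclude_kernel	= 1,
	};
	uint64_t vals[3];	/* { value, id, lost } for !PERF_FORMAT_GROUP */
	int fd;

	fd = perf_event_open(&attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	/* ... mmap a ring buffer, enable the event, run the workload ... */
	if (read(fd, vals, sizeof(vals)) == (ssize_t)sizeof(vals))
		printf("count=%llu id=%llu lost=%llu\n",
		       (unsigned long long)vals[0],
		       (unsigned long long)vals[1],
		       (unsigned long long)vals[2]);
	close(fd);
	return 0;
}

With PERF_FORMAT_GROUP set, one lost value is likewise appended after each group
member's id, per the updated layout comment; the counter only advances when the
ring buffer actually drops records (see the kernel/events/ring_buffer.c hunk at
the end of this diff).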

* tag 'perf-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf/x86/ibs: Add new IBS register bits into header
  perf/x86/intel: Fix PEBS data source encoding for ADL
  perf/x86/intel: Fix PEBS memory access info encoding for ADL
  perf/core: Add a new read format to get a number of lost samples
  perf/x86/amd/uncore: Add PerfMonV2 RDPMC assignments
  perf/x86/amd/uncore: Add PerfMonV2 DF event format
  perf/x86/amd/uncore: Detect available DF counters
  perf/x86/amd/uncore: Use attr_update for format attributes
  perf/x86/amd/uncore: Use dynamic events array
  x86/events/intel/ds: Enable large PEBS for PERF_SAMPLE_WEIGHT_TYPE
parents 22a39c3d 326ecc15
@@ -4141,6 +4141,8 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
{
struct event_constraint *c;
c = intel_get_event_constraints(cpuc, idx, event);
/*
* :ppp means to do reduced skid PEBS,
* which is available on PMC0 and fixed counter 0.
@@ -4153,8 +4155,6 @@ tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
return &counter0_constraint;
}
c = intel_get_event_constraints(cpuc, idx, event);
return c;
}
@@ -6241,7 +6241,8 @@ __init int intel_pmu_init(void)
x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
x86_pmu.flags |= PMU_FL_MEM_LOADS_AUX;
x86_pmu.lbr_pt_coexist = true;
intel_pmu_pebs_data_source_skl(false);
intel_pmu_pebs_data_source_adl();
x86_pmu.pebs_latency_data = adl_latency_data_small;
x86_pmu.num_topdown_events = 8;
x86_pmu.update_topdown_event = adl_update_topdown_event;
x86_pmu.set_topdown_event_period = adl_set_topdown_event_period;
......
@@ -94,15 +94,40 @@ void __init intel_pmu_pebs_data_source_nhm(void)
pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
}
void __init intel_pmu_pebs_data_source_skl(bool pmem)
static void __init __intel_pmu_pebs_data_source_skl(bool pmem, u64 *data_source)
{
u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4);
pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT);
data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT);
data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE);
data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD);
data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM);
}
void __init intel_pmu_pebs_data_source_skl(bool pmem)
{
__intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
}
static void __init intel_pmu_pebs_data_source_grt(u64 *data_source)
{
data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
}
void __init intel_pmu_pebs_data_source_adl(void)
{
u64 *data_source;
data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX].pebs_data_source;
memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
__intel_pmu_pebs_data_source_skl(false, data_source);
data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
intel_pmu_pebs_data_source_grt(data_source);
}
static u64 precise_store_data(u64 status)
@@ -171,7 +196,50 @@ static u64 precise_datala_hsw(struct perf_event *event, u64 status)
return dse.val;
}
static u64 load_latency_data(u64 status)
static inline void pebs_set_tlb_lock(u64 *val, bool tlb, bool lock)
{
/*
* TLB access
* 0 = did not miss 2nd level TLB
* 1 = missed 2nd level TLB
*/
if (tlb)
*val |= P(TLB, MISS) | P(TLB, L2);
else
*val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
/* locked prefix */
if (lock)
*val |= P(LOCK, LOCKED);
}
/* Retrieve the latency data for e-core of ADL */
u64 adl_latency_data_small(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
WARN_ON_ONCE(hybrid_pmu(event->pmu)->cpu_type == hybrid_big);
dse.val = status;
val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
/*
* For the atom core on ADL,
* bit 4: lock, bit 5: TLB access.
*/
pebs_set_tlb_lock(&val, dse.ld_locked, dse.ld_stlb_miss);
if (dse.ld_data_blk)
val |= P(BLK, DATA);
else
val |= P(BLK, NA);
return val;
}
static u64 load_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
@@ -181,7 +249,7 @@ static u64 load_latency_data(u64 status)
/*
* use the mapping table for bit 0-3
*/
val = pebs_data_source[dse.ld_dse];
val = hybrid_var(event->pmu, pebs_data_source)[dse.ld_dse];
/*
* Nehalem models do not support TLB, Lock infos
@@ -190,21 +258,8 @@ static u64 load_latency_data(u64 status)
val |= P(TLB, NA) | P(LOCK, NA);
return val;
}
/*
* bit 4: TLB access
* 0 = did not miss 2nd level TLB
* 1 = missed 2nd level TLB
*/
if (dse.ld_stlb_miss)
val |= P(TLB, MISS) | P(TLB, L2);
else
val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
/*
* bit 5: locked prefix
*/
if (dse.ld_locked)
val |= P(LOCK, LOCKED);
pebs_set_tlb_lock(&val, dse.ld_stlb_miss, dse.ld_locked);
/*
* Ice Lake and earlier models do not support block infos.
@@ -233,7 +288,7 @@ static u64 load_latency_data(u64 status)
return val;
}
static u64 store_latency_data(u64 status)
static u64 store_latency_data(struct perf_event *event, u64 status)
{
union intel_x86_pebs_dse dse;
u64 val;
@@ -243,23 +298,9 @@ static u64 store_latency_data(u64 status)
/*
* use the mapping table for bit 0-3
*/
val = pebs_data_source[dse.st_lat_dse];
val = hybrid_var(event->pmu, pebs_data_source)[dse.st_lat_dse];
/*
* bit 4: TLB access
* 0 = did not miss 2nd level TLB
* 1 = missed 2nd level TLB
*/
if (dse.st_lat_stlb_miss)
val |= P(TLB, MISS) | P(TLB, L2);
else
val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2);
/*
* bit 5: locked prefix
*/
if (dse.st_lat_locked)
val |= P(LOCK, LOCKED);
pebs_set_tlb_lock(&val, dse.st_lat_stlb_miss, dse.st_lat_locked);
val |= P(BLK, NA);
@@ -781,8 +822,8 @@ struct event_constraint intel_glm_pebs_event_constraints[] = {
struct event_constraint intel_grt_pebs_event_constraints[] = {
/* Allow all events as PEBS with no flags */
INTEL_PLD_CONSTRAINT(0x5d0, 0xf),
INTEL_PSD_CONSTRAINT(0x6d0, 0xf),
INTEL_HYBRID_LAT_CONSTRAINT(0x5d0, 0xf),
INTEL_HYBRID_LAT_CONSTRAINT(0x6d0, 0xf),
EVENT_CONSTRAINT_END
};
@@ -1443,9 +1484,11 @@ static u64 get_data_src(struct perf_event *event, u64 aux)
bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC);
if (fl & PERF_X86_EVENT_PEBS_LDLAT)
val = load_latency_data(aux);
val = load_latency_data(event, aux);
else if (fl & PERF_X86_EVENT_PEBS_STLAT)
val = store_latency_data(aux);
val = store_latency_data(event, aux);
else if (fl & PERF_X86_EVENT_PEBS_LAT_HYBRID)
val = x86_pmu.pebs_latency_data(event, aux);
else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC))
val = precise_datala_hsw(event, aux);
else if (fst)
......
@@ -84,6 +84,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
#define PERF_X86_EVENT_TOPDOWN 0x04000 /* Count Topdown slots/metrics events */
#define PERF_X86_EVENT_PEBS_STLAT 0x08000 /* st+stlat data address sampling */
#define PERF_X86_EVENT_AMD_BRS 0x10000 /* AMD Branch Sampling */
#define PERF_X86_EVENT_PEBS_LAT_HYBRID 0x20000 /* ld and st lat for hybrid */
static inline bool is_topdown_count(struct perf_event *event)
{
@@ -136,7 +137,8 @@ struct amd_nb {
PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \
PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE)
PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE | \
PERF_SAMPLE_WEIGHT_TYPE)
#define PEBS_GP_REGS \
((1ULL << PERF_REG_X86_AX) | \
@@ -460,6 +462,10 @@ struct cpu_hw_events {
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST)
#define INTEL_HYBRID_LAT_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LAT_HYBRID)
/* Event constraint, but match on all event flags too. */
#define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS)
@@ -638,6 +644,8 @@ enum {
x86_lbr_exclusive_max,
};
#define PERF_PEBS_DATA_SOURCE_MAX 0x10
struct x86_hybrid_pmu {
struct pmu pmu;
const char *name;
@@ -665,6 +673,8 @@ struct x86_hybrid_pmu {
unsigned int late_ack :1,
mid_ack :1,
enabled_ack :1;
u64 pebs_data_source[PERF_PEBS_DATA_SOURCE_MAX];
};
static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
@@ -825,6 +835,7 @@ struct x86_pmu {
void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data);
struct event_constraint *pebs_constraints;
void (*pebs_aliases)(struct perf_event *event);
u64 (*pebs_latency_data)(struct perf_event *event, u64 status);
unsigned long large_pebs_flags;
u64 rtm_abort_event;
@@ -1392,6 +1403,8 @@ void intel_pmu_disable_bts(void);
int intel_pmu_drain_bts_buffer(void);
u64 adl_latency_data_small(struct perf_event *event, u64 status);
extern struct event_constraint intel_core2_pebs_event_constraints[];
extern struct event_constraint intel_atom_pebs_event_constraints[];
@@ -1499,6 +1512,8 @@ void intel_pmu_pebs_data_source_nhm(void);
void intel_pmu_pebs_data_source_skl(bool pmem);
void intel_pmu_pebs_data_source_adl(void);
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
......
@@ -29,7 +29,10 @@ union ibs_fetch_ctl {
rand_en:1, /* 57: random tagging enable */
fetch_l2_miss:1,/* 58: L2 miss for sampled fetch
* (needs IbsFetchComp) */
reserved:5; /* 59-63: reserved */
l3_miss_only:1, /* 59: Collect L3 miss samples only */
fetch_oc_miss:1,/* 60: Op cache miss for the sampled fetch */
fetch_l3_miss:1,/* 61: L3 cache miss for the sampled fetch */
reserved:2; /* 62-63: reserved */
};
};
@@ -38,14 +41,14 @@ union ibs_op_ctl {
__u64 val;
struct {
__u64 opmaxcnt:16, /* 0-15: periodic op max. count */
reserved0:1, /* 16: reserved */
l3_miss_only:1, /* 16: Collect L3 miss samples only */
op_en:1, /* 17: op sampling enable */
op_val:1, /* 18: op sample valid */
cnt_ctl:1, /* 19: periodic op counter control */
opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */
reserved1:5, /* 27-31: reserved */
reserved0:5, /* 27-31: reserved */
opcurcnt:27, /* 32-58: periodic op counter current count */
reserved2:5; /* 59-63: reserved */
reserved1:5; /* 59-63: reserved */
};
};
@@ -71,11 +74,12 @@ union ibs_op_data {
union ibs_op_data2 {
__u64 val;
struct {
__u64 data_src:3, /* 0-2: data source */
__u64 data_src_lo:3, /* 0-2: data source low */
reserved0:1, /* 3: reserved */
rmt_node:1, /* 4: destination node */
cache_hit_st:1, /* 5: cache hit state */
reserved1:57; /* 5-63: reserved */
data_src_hi:2, /* 6-7: data source high */
reserved1:56; /* 8-63: reserved */
};
};
......
@@ -89,6 +89,19 @@
#define AMD64_RAW_EVENT_MASK_NB \
(AMD64_EVENTSEL_EVENT | \
ARCH_PERFMON_EVENTSEL_UMASK)
#define AMD64_PERFMON_V2_EVENTSEL_EVENT_NB \
(AMD64_EVENTSEL_EVENT | \
GENMASK_ULL(37, 36))
#define AMD64_PERFMON_V2_EVENTSEL_UMASK_NB \
(ARCH_PERFMON_EVENTSEL_UMASK | \
GENMASK_ULL(27, 24))
#define AMD64_PERFMON_V2_RAW_EVENT_MASK_NB \
(AMD64_PERFMON_V2_EVENTSEL_EVENT_NB | \
AMD64_PERFMON_V2_EVENTSEL_UMASK_NB)
#define AMD64_NUM_COUNTERS 4
#define AMD64_NUM_COUNTERS_CORE 6
#define AMD64_NUM_COUNTERS_NB 4
@@ -194,6 +207,9 @@ union cpuid_0x80000022_ebx {
struct {
/* Number of Core Performance Counters */
unsigned int num_core_pmc:4;
unsigned int reserved:6;
/* Number of Data Fabric Counters */
unsigned int num_df_pmc:6;
} split;
unsigned int full;
};
......
@@ -759,6 +759,8 @@ struct perf_event {
struct pid_namespace *ns;
u64 id;
atomic64_t lost_samples;
u64 (*clock)(void);
perf_overflow_handler_t overflow_handler;
void *overflow_handler_context;
......
@@ -301,6 +301,7 @@ enum {
* { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 id; } && PERF_FORMAT_ID
* { u64 lost; } && PERF_FORMAT_LOST
* } && !PERF_FORMAT_GROUP
*
* { u64 nr;
@@ -308,6 +309,7 @@ enum {
* { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING
* { u64 value;
* { u64 id; } && PERF_FORMAT_ID
* { u64 lost; } && PERF_FORMAT_LOST
* } cntr[nr];
* } && PERF_FORMAT_GROUP
* };
@@ -317,8 +319,9 @@ enum perf_event_read_format {
PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1,
PERF_FORMAT_ID = 1U << 2,
PERF_FORMAT_GROUP = 1U << 3,
PERF_FORMAT_LOST = 1U << 4,
PERF_FORMAT_MAX = 1U << 4, /* non-ABI */
PERF_FORMAT_MAX = 1U << 5, /* non-ABI */
};
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */
......
@@ -1819,6 +1819,9 @@ static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
if (event->attr.read_format & PERF_FORMAT_ID)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_LOST)
entry += sizeof(u64);
if (event->attr.read_format & PERF_FORMAT_GROUP) {
nr += nr_siblings;
size += sizeof(u64);
@@ -5260,11 +5263,15 @@ static int __perf_read_group_add(struct perf_event *leader,
values[n++] += perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
for_each_sibling_event(sub, leader) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&sub->lost_samples);
}
raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -5321,7 +5328,7 @@ static int perf_read_one(struct perf_event *event,
u64 read_format, char __user *buf)
{
u64 enabled, running;
u64 values[4];
u64 values[5];
int n = 0;
values[n++] = __perf_event_read_value(event, &enabled, &running);
@@ -5331,6 +5338,8 @@ static int perf_read_one(struct perf_event *event,
values[n++] = running;
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&event->lost_samples);
if (copy_to_user(buf, values, n * sizeof(u64)))
return -EFAULT;
@@ -6858,7 +6867,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
u64 values[4];
u64 values[5];
int n = 0;
values[n++] = perf_event_count(event);
@@ -6872,6 +6881,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
}
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(event);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&event->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
@@ -6882,7 +6893,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
u64 values[5];
u64 values[6];
int n = 0;
values[n++] = 1 + leader->nr_siblings;
@@ -6900,6 +6911,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(leader);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&leader->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
@@ -6913,6 +6926,8 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
if (read_format & PERF_FORMAT_LOST)
values[n++] = atomic64_read(&sub->lost_samples);
__output_copy(handle, values, n * sizeof(u64));
}
......
@@ -172,8 +172,10 @@ __perf_output_begin(struct perf_output_handle *handle,
goto out;
if (unlikely(rb->paused)) {
if (rb->nr_pages)
if (rb->nr_pages) {
local_inc(&rb->lost);
atomic64_inc(&event->lost_samples);
}
goto out;
}
@@ -254,6 +256,7 @@ fail:
fail:
local_inc(&rb->lost);
atomic64_inc(&event->lost_samples);
perf_output_put_handle(handle);
out:
rcu_read_unlock();
......