Commit 3856b0f7 authored by Aravind Iddamsetty, committed by Rodrigo Vivi

drm/xe/pmu: Enable PMU interface

There is a set of engine group busyness counters provided by HW which are a
perfect fit to be exposed via PMU perf events.

BSPEC: 46559, 46560, 46722, 46729, 52071, 71028

Events can be listed using:
perf list
  xe_0000_03_00.0/any-engine-group-busy-gt0/         [Kernel PMU event]
  xe_0000_03_00.0/copy-group-busy-gt0/               [Kernel PMU event]
  xe_0000_03_00.0/interrupts/                        [Kernel PMU event]
  xe_0000_03_00.0/media-group-busy-gt0/              [Kernel PMU event]
  xe_0000_03_00.0/render-group-busy-gt0/             [Kernel PMU event]

and can be read using:

perf stat -e "xe_0000_8c_00.0/render-group-busy-gt0/" -I 1000
           time             counts unit events
     1.001139062                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     2.003294678                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     3.005199582                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     4.007076497                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     5.008553068                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     6.010531563              43520 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     7.012468029              44800 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     8.013463515                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
     9.015300183                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
    10.017233010                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/
    10.971934120                  0 ns  xe_0000_8c_00.0/render-group-busy-gt0/

The pmu base implementation is taken from i915.

v2:
Store the last known value while the device is awake, return that value while
the GT is suspended, and update the driver copy when read while awake.

v3:
1. drop init_samples, as storing counters before going to suspend should
be sufficient.
2. ported the "drm/i915/pmu: Make PMU sample array two-dimensional" and
dropped helpers to store and read samples.
3. use xe_device_mem_access_get_if_ongoing to check if device is active
before reading the OA registers.
4. dropped format attr as no longer needed
5. introduce xe_pmu_suspend to call engine_group_busyness_store
6. few other nits.

v4: minor nits.

v5: take forcewake when accessing the OAG registers

v6:
1. drop engine_busyness_sample_type
2. update UAPI documentation

v7:
1. update UAPI documentation
2. drop MEDIA_GT specific change for media busyness counter.
Co-developed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Co-developed-by: Bommu Krishnaiah <krishnaiah.bommu@intel.com>
Signed-off-by: Aravind Iddamsetty <aravind.iddamsetty@linux.intel.com>
Reviewed-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
parent cd853419
......@@ -124,6 +124,8 @@ xe-y += xe_bb.o \
obj-$(CONFIG_DRM_XE) += xe.o
obj-$(CONFIG_DRM_XE_KUNIT_TEST) += tests/
xe-$(CONFIG_PERF_EVENTS) += xe_pmu.o
# header test
hdrtest_find_args := -not -path xe_rtp_helpers.h
......
......@@ -294,6 +294,11 @@
#define INVALIDATION_BROADCAST_MODE_DIS REG_BIT(12)
#define GLOBAL_INVALIDATION_MODE REG_BIT(2)
/*
 * OAG "busy free" counter registers, one per engine group.
 * NOTE(review): presumably the hardware sources for the PMU
 * any-engine/media/copy/render group-busy events - confirm against xe_pmu.c.
 */
#define XE_OAG_RC0_ANY_ENGINE_BUSY_FREE XE_REG(0xdb80)
#define XE_OAG_ANY_MEDIA_FF_BUSY_FREE XE_REG(0xdba0)
#define XE_OAG_BLT_BUSY_FREE XE_REG(0xdbbc)
#define XE_OAG_RENDER_BUSY_FREE XE_REG(0xdbdc)
#define SAMPLER_MODE XE_REG_MCR(0xe18c, XE_REG_OPTION_MASKED)
#define ENABLE_SMALLPL REG_BIT(15)
#define SC_DISABLE_POWER_OPTIMIZATION_EBB REG_BIT(9)
......
......@@ -304,6 +304,8 @@ int xe_device_probe(struct xe_device *xe)
xe_debugfs_register(xe);
xe_pmu_register(&xe->pmu);
err = drmm_add_action_or_reset(&xe->drm, xe_device_sanitize, xe);
if (err)
return err;
......
......@@ -15,6 +15,7 @@
#include "xe_devcoredump_types.h"
#include "xe_gt_types.h"
#include "xe_platform_types.h"
#include "xe_pmu.h"
#include "xe_step_types.h"
struct xe_ggtt;
......@@ -342,6 +343,9 @@ struct xe_device {
*/
struct task_struct *pm_callback_task;
/** @pmu: performance monitoring unit */
struct xe_pmu pmu;
/* For pcode */
struct mutex sb_lock;
......
......@@ -652,6 +652,8 @@ int xe_gt_suspend(struct xe_gt *gt)
if (err)
goto err_msg;
xe_pmu_suspend(gt);
err = xe_uc_suspend(&gt->uc);
if (err)
goto err_force_wake;
......
......@@ -26,6 +26,20 @@
#define IIR(offset) XE_REG(offset + 0x8)
#define IER(offset) XE_REG(offset + 0xc)
/*
 * Bump the PMU interrupt statistic for @xe.
 *
 * The counter is meant to account only for interrupts that originated from
 * the GPU, so that interrupts raised by another device sharing the same
 * interrupt line are not counted.
 */
static __always_inline void xe_pmu_irq_stats(struct xe_device *xe)
{
	unsigned long *irq_count = &xe->pmu.irq_count;

	/*
	 * WRITE_ONCE() guards against store tearing; a good compiler will
	 * still collapse the read-add-store sequence into a single INC.
	 */
	WRITE_ONCE(*irq_count, *irq_count + 1);
}
static void assert_iir_is_zero(struct xe_gt *mmio, struct xe_reg reg)
{
u32 val = xe_mmio_read32(mmio, reg);
......@@ -332,6 +346,8 @@ static irqreturn_t xelp_irq_handler(int irq, void *arg)
xelp_intr_enable(xe, false);
xe_pmu_irq_stats(xe);
return IRQ_HANDLED;
}
......@@ -425,6 +441,8 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
dg1_intr_enable(xe, false);
xe_pmu_irq_stats(xe);
return IRQ_HANDLED;
}
......
......@@ -12,6 +12,7 @@
#include "xe_hw_fence.h"
#include "xe_module.h"
#include "xe_pci.h"
#include "xe_pmu.h"
#include "xe_sched_job.h"
bool force_execlist = false;
......@@ -45,6 +46,10 @@ static const struct init_funcs init_funcs[] = {
.init = xe_sched_job_module_init,
.exit = xe_sched_job_module_exit,
},
{
.init = xe_pmu_init,
.exit = xe_pmu_exit,
},
{
.init = xe_register_pci_driver,
.exit = xe_unregister_pci_driver,
......
This diff is collapsed.
/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2023 Intel Corporation
*/
#ifndef _XE_PMU_H_
#define _XE_PMU_H_
#include "xe_gt_types.h"
#include "xe_pmu_types.h"
/*
 * PMU entry points. xe_pmu.o is only built when CONFIG_PERF_EVENTS is set, so
 * the real prototypes are exposed in that case and inline no-op stubs are
 * provided otherwise, letting callers invoke these unconditionally.
 */
#if IS_ENABLED(CONFIG_PERF_EVENTS)
/* Module-level init/exit pair; wired into the init_funcs[] table of the module. */
int xe_pmu_init(void);
void xe_pmu_exit(void);
/* Per-device registration; called from xe_device_probe() with &xe->pmu. */
void xe_pmu_register(struct xe_pmu *pmu);
/* Per-GT suspend hook; called from xe_gt_suspend() before xe_uc_suspend(). */
void xe_pmu_suspend(struct xe_gt *gt);
#else
/* CONFIG_PERF_EVENTS disabled: every hook is a no-op and init reports success (0). */
static inline int xe_pmu_init(void) { return 0; }
static inline void xe_pmu_exit(void) {}
static inline void xe_pmu_register(struct xe_pmu *pmu) {}
static inline void xe_pmu_suspend(struct xe_gt *gt) {}
#endif
#endif
/* SPDX-License-Identifier: MIT */
/*
* Copyright © 2023 Intel Corporation
*/
#ifndef _XE_PMU_TYPES_H_
#define _XE_PMU_TYPES_H_
#include <linux/perf_event.h>
#include <linux/spinlock_types.h>
#include <uapi/drm/xe_drm.h>
/*
 * Indices into the per-GT sample array (second dimension of
 * xe_pmu.sample[][]). __XE_NUM_PMU_SAMPLERS is the number of samplers and must
 * stay last.
 *
 * The order mirrors the XE_PMU_*_GROUP_BUSY config ids in the uAPI header
 * (render, copy, media, any-engine).
 * NOTE(review): whether the implementation relies on that ordering is not
 * visible here - confirm in xe_pmu.c before reordering.
 */
enum {
__XE_SAMPLE_RENDER_GROUP_BUSY,
__XE_SAMPLE_COPY_GROUP_BUSY,
__XE_SAMPLE_MEDIA_GROUP_BUSY,
__XE_SAMPLE_ANY_ENGINE_GROUP_BUSY,
__XE_NUM_PMU_SAMPLERS
};
/* Number of GT slots tracked by the PMU; first dimension of xe_pmu.sample[][]. */
#define XE_PMU_MAX_GT 2
/**
 * struct xe_pmu - Per-device PMU (perf events) state
 *
 * Embedded in struct xe_device as the @pmu member.
 */
struct xe_pmu {
/**
 * @cpuhp: Struct used for CPU hotplug handling.
 */
struct {
struct hlist_node node;
unsigned int cpu;
} cpuhp;
/**
 * @base: PMU base.
 */
struct pmu base;
/**
 * @closed: xe is unregistering.
 */
bool closed;
/**
 * @name: Name as registered with perf core.
 */
const char *name;
/**
 * @lock: Lock protecting enable mask and ref count handling.
 *
 * NOTE(review): no enable mask or ref count member is visible in this
 * struct; the description looks inherited from i915 - confirm what this
 * lock actually guards.
 */
spinlock_t lock;
/**
 * @sample: Raw counter values, indexed as [gt][sampler].
 *
 * The first index is bounded by XE_PMU_MAX_GT, the second by
 * __XE_NUM_PMU_SAMPLERS (one slot per __XE_SAMPLE_* value).
 *
 * These counters are updated when the device is awake.
 *
 */
u64 sample[XE_PMU_MAX_GT][__XE_NUM_PMU_SAMPLERS];
/**
 * @irq_count: Number of interrupts
 *
 * Incremented with WRITE_ONCE() by xe_pmu_irq_stats() from the IRQ
 * handlers.
 *
 * Intentionally unsigned long to avoid atomics or heuristics on 32bit.
 * 4e9 interrupts are a lot and postprocessing can really deal with an
 * occasional wraparound easily. It's 32bit after all.
 */
unsigned long irq_count;
/**
 * @events_attr_group: Device events attribute group.
 */
struct attribute_group events_attr_group;
/**
 * @xe_attr: Memory block holding device attributes.
 *
 * NOTE(review): same description as @pmu_attr; how the two blocks differ
 * is not visible here - confirm in xe_pmu.c.
 */
void *xe_attr;
/**
 * @pmu_attr: Memory block holding device attributes.
 */
void *pmu_attr;
};
#endif
......@@ -1053,6 +1053,46 @@ struct drm_xe_vm_madvise {
__u64 reserved[2];
};
/**
* DOC: XE PMU event config IDs
*
* Check 'man perf_event_open' to use the IDs XE_PMU_XXXX listed in xe_drm.h
* in 'struct perf_event_attr' as part of perf_event_open syscall to read a
* particular event.
*
* For example to open the XE_PMU_INTERRUPTS(0):
*
* .. code-block:: C
*
* struct perf_event_attr attr;
* long long count;
* int cpu = 0;
* int fd;
*
* memset(&attr, 0, sizeof(struct perf_event_attr));
* attr.type = type; // eg: /sys/bus/event_source/devices/xe_0000_56_00.0/type
* attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED;
* attr.use_clockid = 1;
* attr.clockid = CLOCK_MONOTONIC;
* attr.config = XE_PMU_INTERRUPTS(0);
*
* fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
*/
/*
 * Top bits of every counter are GT id.
 *
 * A config value is a 64-bit number: the GT id is placed at bit
 * __XE_PMU_GT_SHIFT (56) and above, and the event id is OR'ed into the low
 * bits. Use the XE_PMU_*(gt) helpers below rather than building the value by
 * hand.
 */
#define __XE_PMU_GT_SHIFT (56)
/* Combine event id @x with GT id @gt into a perf_event_attr.config value. */
#define ___XE_PMU_OTHER(gt, x) \
(((__u64)(x)) | ((__u64)(gt) << __XE_PMU_GT_SHIFT))
/* Event ids 0..4; the gt argument selects which GT the counter refers to. */
#define XE_PMU_INTERRUPTS(gt) ___XE_PMU_OTHER(gt, 0)
#define XE_PMU_RENDER_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 1)
#define XE_PMU_COPY_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 2)
#define XE_PMU_MEDIA_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 3)
#define XE_PMU_ANY_ENGINE_GROUP_BUSY(gt) ___XE_PMU_OTHER(gt, 4)
#if defined(__cplusplus)
}
#endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment