Commit cc009e61 authored by Mukul Joshi's avatar Mukul Joshi Committed by Alex Deucher

drm/amdkfd: Add KFD support for soc21 v3

Add initial support for soc21 in KFD compute
driver (Mukul)
- Add new definition for soc21 device.
- Add new file for amdgpu-kfd interface for GFX11 family.
- Add new file for queue management, interrupt handling,
  mqd management for GFX11 family in KFD driver.
- Related changes/updates for soc21 device in
  KFD driver.
- Repurpose last 2 entries of SDMA MQD for driver use.

v2: Add an optional argument into update queue operation (Mukul)

v3: Switch to ip version check, replace kgd_dev with
    amdgpu_device (Hawking)
Signed-off-by: default avatarMukul Joshi <mukul.joshi@amd.com>
Signed-off-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: default avatarOak Zeng <Oak.Zeng@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 3b9186fa
......@@ -215,7 +215,8 @@ amdgpu-y += \
amdgpu_amdkfd_arcturus.o \
amdgpu_amdkfd_aldebaran.o \
amdgpu_amdkfd_gfx_v10.o \
amdgpu_amdkfd_gfx_v10_3.o
amdgpu_amdkfd_gfx_v10_3.o \
amdgpu_amdkfd_gfx_v11.o
ifneq ($(CONFIG_DRM_AMDGPU_CIK),)
amdgpu-y += amdgpu_amdkfd_gfx_v7.o
......
......@@ -100,7 +100,18 @@ static void amdgpu_doorbell_get_kfd_info(struct amdgpu_device *adev,
* The first num_doorbells are used by amdgpu.
* amdkfd takes whatever's left in the aperture.
*/
if (adev->doorbell.size > adev->doorbell.num_doorbells * sizeof(u32)) {
if (adev->enable_mes) {
/*
* With MES enabled, we only need to initialize
* the base address. The size and offset are
* not initialized as AMDGPU manages the whole
* doorbell space.
*/
*aperture_base = adev->doorbell.base;
*aperture_size = 0;
*start_offset = 0;
} else if (adev->doorbell.size > adev->doorbell.num_doorbells *
sizeof(u32)) {
*aperture_base = adev->doorbell.base;
*aperture_size = adev->doorbell.size;
*start_offset = adev->doorbell.num_doorbells * sizeof(u32);
......@@ -128,7 +139,7 @@ void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
AMDGPU_GMC_HOLE_START),
.drm_render_minor = adev_to_drm(adev)->render->index,
.sdma_doorbell_idx = adev->doorbell_index.sdma_engine,
.enable_mes = adev->enable_mes,
};
/* this is going to have a few of the MSBs set that we need to
......
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <linux/mmu_context.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
#include "gc/gc_11_0_0_offset.h"
#include "gc/gc_11_0_0_sh_mask.h"
#include "oss/osssys_6_0_0_offset.h"
#include "oss/osssys_6_0_0_sh_mask.h"
#include "soc15_common.h"
#include "soc15d.h"
#include "v11_structs.h"
#include "soc21.h"
enum hqd_dequeue_request_type {
NO_ACTION = 0,
DRAIN_PIPE,
RESET_WAVES,
SAVE_WAVES
};
static void lock_srbm(struct amdgpu_device *adev, uint32_t mec, uint32_t pipe,
uint32_t queue, uint32_t vmid)
{
mutex_lock(&adev->srbm_mutex);
soc21_grbm_select(adev, mec, pipe, queue, vmid);
}
static void unlock_srbm(struct amdgpu_device *adev)
{
soc21_grbm_select(adev, 0, 0, 0, 0);
mutex_unlock(&adev->srbm_mutex);
}
static void acquire_queue(struct amdgpu_device *adev, uint32_t pipe_id,
uint32_t queue_id)
{
uint32_t mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
uint32_t pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
lock_srbm(adev, mec, pipe, queue_id, 0);
}
static uint64_t get_queue_mask(struct amdgpu_device *adev,
uint32_t pipe_id, uint32_t queue_id)
{
unsigned int bit = pipe_id * adev->gfx.mec.num_queue_per_pipe +
queue_id;
return 1ull << bit;
}
static void release_queue(struct amdgpu_device *adev)
{
unlock_srbm(adev);
}
static void program_sh_mem_settings_v11(struct amdgpu_device *adev, uint32_t vmid,
uint32_t sh_mem_config,
uint32_t sh_mem_ape1_base,
uint32_t sh_mem_ape1_limit,
uint32_t sh_mem_bases)
{
lock_srbm(adev, 0, 0, 0, vmid);
WREG32(SOC15_REG_OFFSET(GC, 0, regSH_MEM_CONFIG), sh_mem_config);
WREG32(SOC15_REG_OFFSET(GC, 0, regSH_MEM_BASES), sh_mem_bases);
unlock_srbm(adev);
}
static int set_pasid_vmid_mapping_v11(struct amdgpu_device *adev, unsigned int pasid,
unsigned int vmid)
{
uint32_t value = pasid << IH_VMID_0_LUT__PASID__SHIFT;
/* Mapping vmid to pasid also for IH block */
pr_debug("mapping vmid %d -> pasid %d in IH block for GFX client\n",
vmid, pasid);
WREG32(SOC15_REG_OFFSET(OSSSYS, 0, regIH_VMID_0_LUT) + vmid, value);
return 0;
}
static int init_interrupts_v11(struct amdgpu_device *adev, uint32_t pipe_id)
{
uint32_t mec;
uint32_t pipe;
mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
lock_srbm(adev, mec, pipe, 0, 0);
WREG32(SOC15_REG_OFFSET(GC, 0, regCPC_INT_CNTL),
CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK |
CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK);
unlock_srbm(adev);
return 0;
}
static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
unsigned int engine_id,
unsigned int queue_id)
{
uint32_t sdma_engine_reg_base = 0;
uint32_t sdma_rlc_reg_offset;
switch (engine_id) {
case 0:
sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA0, 0,
regSDMA0_QUEUE0_RB_CNTL) - regSDMA0_QUEUE0_RB_CNTL;
break;
case 1:
sdma_engine_reg_base = SOC15_REG_OFFSET(SDMA1, 0,
regSDMA1_QUEUE0_RB_CNTL) - regSDMA0_QUEUE0_RB_CNTL;
break;
default:
BUG();
}
sdma_rlc_reg_offset = sdma_engine_reg_base
+ queue_id * (regSDMA0_QUEUE1_RB_CNTL - regSDMA0_QUEUE0_RB_CNTL);
pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
queue_id, sdma_rlc_reg_offset);
return sdma_rlc_reg_offset;
}
static inline struct v11_compute_mqd *get_mqd(void *mqd)
{
return (struct v11_compute_mqd *)mqd;
}
static inline struct v11_sdma_mqd *get_sdma_mqd(void *mqd)
{
return (struct v11_sdma_mqd *)mqd;
}
static int hqd_load_v11(struct amdgpu_device *adev, void *mqd, uint32_t pipe_id,
uint32_t queue_id, uint32_t __user *wptr,
uint32_t wptr_shift, uint32_t wptr_mask,
struct mm_struct *mm)
{
struct v11_compute_mqd *m;
uint32_t *mqd_hqd;
uint32_t reg, hqd_base, data;
m = get_mqd(mqd);
pr_debug("Load hqd of pipe %d queue %d\n", pipe_id, queue_id);
acquire_queue(adev, pipe_id, queue_id);
/* HIQ is set during driver init period with vmid set to 0*/
if (m->cp_hqd_vmid == 0) {
uint32_t value, mec, pipe;
mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
mec, pipe, queue_id);
value = RREG32(SOC15_REG_OFFSET(GC, 0, regRLC_CP_SCHEDULERS));
value = REG_SET_FIELD(value, RLC_CP_SCHEDULERS, scheduler1,
((mec << 5) | (pipe << 3) | queue_id | 0x80));
WREG32(SOC15_REG_OFFSET(GC, 0, regRLC_CP_SCHEDULERS), value);
}
/* HQD registers extend from CP_MQD_BASE_ADDR to CP_HQD_EOP_WPTR_MEM. */
mqd_hqd = &m->cp_mqd_base_addr_lo;
hqd_base = SOC15_REG_OFFSET(GC, 0, regCP_MQD_BASE_ADDR);
for (reg = hqd_base;
reg <= SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI); reg++)
WREG32(reg, mqd_hqd[reg - hqd_base]);
/* Activate doorbell logic before triggering WPTR poll. */
data = REG_SET_FIELD(m->cp_hqd_pq_doorbell_control,
CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL), data);
if (wptr) {
/* Don't read wptr with get_user because the user
* context may not be accessible (if this function
* runs in a work queue). Instead trigger a one-shot
* polling read from memory in the CP. This assumes
* that wptr is GPU-accessible in the queue's VMID via
* ATC or SVM. WPTR==RPTR before starting the poll so
* the CP starts fetching new commands from the right
* place.
*
* Guessing a 64-bit WPTR from a 32-bit RPTR is a bit
* tricky. Assume that the queue didn't overflow. The
* number of valid bits in the 32-bit RPTR depends on
* the queue size. The remaining bits are taken from
* the saved 64-bit WPTR. If the WPTR wrapped, add the
* queue size.
*/
uint32_t queue_size =
2 << REG_GET_FIELD(m->cp_hqd_pq_control,
CP_HQD_PQ_CONTROL, QUEUE_SIZE);
uint64_t guessed_wptr = m->cp_hqd_pq_rptr & (queue_size - 1);
if ((m->cp_hqd_pq_wptr_lo & (queue_size - 1)) < guessed_wptr)
guessed_wptr += queue_size;
guessed_wptr += m->cp_hqd_pq_wptr_lo & ~(queue_size - 1);
guessed_wptr += (uint64_t)m->cp_hqd_pq_wptr_hi << 32;
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_LO),
lower_32_bits(guessed_wptr));
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI),
upper_32_bits(guessed_wptr));
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_POLL_ADDR),
lower_32_bits((uint64_t)wptr));
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_POLL_ADDR_HI),
upper_32_bits((uint64_t)wptr));
pr_debug("%s setting CP_PQ_WPTR_POLL_CNTL1 to %x\n", __func__,
(uint32_t)get_queue_mask(adev, pipe_id, queue_id));
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_PQ_WPTR_POLL_CNTL1),
(uint32_t)get_queue_mask(adev, pipe_id, queue_id));
}
/* Start the EOP fetcher */
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_EOP_RPTR),
REG_SET_FIELD(m->cp_hqd_eop_rptr,
CP_HQD_EOP_RPTR, INIT_FETCHER, 1));
data = REG_SET_FIELD(m->cp_hqd_active, CP_HQD_ACTIVE, ACTIVE, 1);
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE), data);
release_queue(adev);
return 0;
}
static int hiq_mqd_load_v11(struct amdgpu_device *adev, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
uint32_t doorbell_off)
{
struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
struct v11_compute_mqd *m;
uint32_t mec, pipe;
int r;
m = get_mqd(mqd);
acquire_queue(adev, pipe_id, queue_id);
mec = (pipe_id / adev->gfx.mec.num_pipe_per_mec) + 1;
pipe = (pipe_id % adev->gfx.mec.num_pipe_per_mec);
pr_debug("kfd: set HIQ, mec:%d, pipe:%d, queue:%d.\n",
mec, pipe, queue_id);
spin_lock(&adev->gfx.kiq.ring_lock);
r = amdgpu_ring_alloc(kiq_ring, 7);
if (r) {
pr_err("Failed to alloc KIQ (%d).\n", r);
goto out_unlock;
}
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
amdgpu_ring_write(kiq_ring,
PACKET3_MAP_QUEUES_QUEUE_SEL(0) | /* Queue_Sel */
PACKET3_MAP_QUEUES_VMID(m->cp_hqd_vmid) | /* VMID */
PACKET3_MAP_QUEUES_QUEUE(queue_id) |
PACKET3_MAP_QUEUES_PIPE(pipe) |
PACKET3_MAP_QUEUES_ME((mec - 1)) |
PACKET3_MAP_QUEUES_QUEUE_TYPE(0) | /*queue_type: normal compute queue */
PACKET3_MAP_QUEUES_ALLOC_FORMAT(0) | /* alloc format: all_on_one_pipe */
PACKET3_MAP_QUEUES_ENGINE_SEL(1) | /* engine_sel: hiq */
PACKET3_MAP_QUEUES_NUM_QUEUES(1)); /* num_queues: must be 1 */
amdgpu_ring_write(kiq_ring,
PACKET3_MAP_QUEUES_DOORBELL_OFFSET(doorbell_off));
amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_lo);
amdgpu_ring_write(kiq_ring, m->cp_mqd_base_addr_hi);
amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_lo);
amdgpu_ring_write(kiq_ring, m->cp_hqd_pq_wptr_poll_addr_hi);
amdgpu_ring_commit(kiq_ring);
out_unlock:
spin_unlock(&adev->gfx.kiq.ring_lock);
release_queue(adev);
return r;
}
static int hqd_dump_v11(struct amdgpu_device *adev,
uint32_t pipe_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs)
{
uint32_t i = 0, reg;
#define HQD_N_REGS 56
#define DUMP_REG(addr) do { \
if (WARN_ON_ONCE(i >= HQD_N_REGS)) \
break; \
(*dump)[i][0] = (addr) << 2; \
(*dump)[i++][1] = RREG32(addr); \
} while (0)
*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;
acquire_queue(adev, pipe_id, queue_id);
for (reg = SOC15_REG_OFFSET(GC, 0, regCP_MQD_BASE_ADDR);
reg <= SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_WPTR_HI); reg++)
DUMP_REG(reg);
release_queue(adev);
WARN_ON_ONCE(i != HQD_N_REGS);
*n_regs = i;
return 0;
}
static int hqd_sdma_load_v11(struct amdgpu_device *adev, void *mqd,
uint32_t __user *wptr, struct mm_struct *mm)
{
struct v11_sdma_mqd *m;
uint32_t sdma_rlc_reg_offset;
unsigned long end_jiffies;
uint32_t data;
uint64_t data64;
uint64_t __user *wptr64 = (uint64_t __user *)wptr;
m = get_sdma_mqd(mqd);
sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
m->sdma_queue_id);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL,
m->sdmax_rlcx_rb_cntl & (~SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK));
end_jiffies = msecs_to_jiffies(2000) + jiffies;
while (true) {
data = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_CONTEXT_STATUS);
if (data & SDMA0_QUEUE0_CONTEXT_STATUS__IDLE_MASK)
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("SDMA RLC not idle in %s\n", __func__);
return -ETIME;
}
usleep_range(500, 1000);
}
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL_OFFSET,
m->sdmax_rlcx_doorbell_offset);
data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA0_QUEUE0_DOORBELL,
ENABLE, 1);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL, data);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR,
m->sdmax_rlcx_rb_rptr);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_HI,
m->sdmax_rlcx_rb_rptr_hi);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_MINOR_PTR_UPDATE, 1);
if (read_user_wptr(mm, wptr64, data64)) {
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR,
lower_32_bits(data64));
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR_HI,
upper_32_bits(data64));
} else {
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR,
m->sdmax_rlcx_rb_rptr);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_WPTR_HI,
m->sdmax_rlcx_rb_rptr_hi);
}
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_MINOR_PTR_UPDATE, 0);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_BASE, m->sdmax_rlcx_rb_base);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_BASE_HI,
m->sdmax_rlcx_rb_base_hi);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_ADDR_LO,
m->sdmax_rlcx_rb_rptr_addr_lo);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_ADDR_HI,
m->sdmax_rlcx_rb_rptr_addr_hi);
data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA0_QUEUE0_RB_CNTL,
RB_ENABLE, 1);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL, data);
return 0;
}
static int hqd_sdma_dump_v11(struct amdgpu_device *adev,
uint32_t engine_id, uint32_t queue_id,
uint32_t (**dump)[2], uint32_t *n_regs)
{
uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
engine_id, queue_id);
uint32_t i = 0, reg;
#undef HQD_N_REGS
#define HQD_N_REGS (7+11+1+12+12)
*dump = kmalloc(HQD_N_REGS*2*sizeof(uint32_t), GFP_KERNEL);
if (*dump == NULL)
return -ENOMEM;
for (reg = regSDMA0_QUEUE0_RB_CNTL;
reg <= regSDMA0_QUEUE0_RB_WPTR_HI; reg++)
DUMP_REG(sdma_rlc_reg_offset + reg);
for (reg = regSDMA0_QUEUE0_RB_RPTR_ADDR_HI;
reg <= regSDMA0_QUEUE0_DOORBELL; reg++)
DUMP_REG(sdma_rlc_reg_offset + reg);
for (reg = regSDMA0_QUEUE0_DOORBELL_LOG;
reg <= regSDMA0_QUEUE0_DOORBELL_LOG; reg++)
DUMP_REG(sdma_rlc_reg_offset + reg);
for (reg = regSDMA0_QUEUE0_DOORBELL_OFFSET;
reg <= regSDMA0_QUEUE0_RB_PREEMPT; reg++)
DUMP_REG(sdma_rlc_reg_offset + reg);
for (reg = regSDMA0_QUEUE0_MIDCMD_DATA0;
reg <= regSDMA0_QUEUE0_MIDCMD_CNTL; reg++)
DUMP_REG(sdma_rlc_reg_offset + reg);
WARN_ON_ONCE(i != HQD_N_REGS);
*n_regs = i;
return 0;
}
static bool hqd_is_occupied_v11(struct amdgpu_device *adev, uint64_t queue_address,
uint32_t pipe_id, uint32_t queue_id)
{
uint32_t act;
bool retval = false;
uint32_t low, high;
acquire_queue(adev, pipe_id, queue_id);
act = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE));
if (act) {
low = lower_32_bits(queue_address >> 8);
high = upper_32_bits(queue_address >> 8);
if (low == RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_BASE)) &&
high == RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_PQ_BASE_HI)))
retval = true;
}
release_queue(adev);
return retval;
}
static bool hqd_sdma_is_occupied_v11(struct amdgpu_device *adev, void *mqd)
{
struct v11_sdma_mqd *m;
uint32_t sdma_rlc_reg_offset;
uint32_t sdma_rlc_rb_cntl;
m = get_sdma_mqd(mqd);
sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
m->sdma_queue_id);
sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL);
if (sdma_rlc_rb_cntl & SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK)
return true;
return false;
}
static int hqd_destroy_v11(struct amdgpu_device *adev, void *mqd,
enum kfd_preempt_type reset_type,
unsigned int utimeout, uint32_t pipe_id,
uint32_t queue_id)
{
enum hqd_dequeue_request_type type;
unsigned long end_jiffies;
uint32_t temp;
struct v11_compute_mqd *m = get_mqd(mqd);
acquire_queue(adev, pipe_id, queue_id);
if (m->cp_hqd_vmid == 0)
WREG32_FIELD15_PREREG(GC, 0, RLC_CP_SCHEDULERS, scheduler1, 0);
switch (reset_type) {
case KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN:
type = DRAIN_PIPE;
break;
case KFD_PREEMPT_TYPE_WAVEFRONT_RESET:
type = RESET_WAVES;
break;
default:
type = DRAIN_PIPE;
break;
}
WREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_DEQUEUE_REQUEST), type);
end_jiffies = (utimeout * HZ / 1000) + jiffies;
while (true) {
temp = RREG32(SOC15_REG_OFFSET(GC, 0, regCP_HQD_ACTIVE));
if (!(temp & CP_HQD_ACTIVE__ACTIVE_MASK))
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue pipe %d queue %d preemption failed\n",
pipe_id, queue_id);
release_queue(adev);
return -ETIME;
}
usleep_range(500, 1000);
}
release_queue(adev);
return 0;
}
static int hqd_sdma_destroy_v11(struct amdgpu_device *adev, void *mqd,
unsigned int utimeout)
{
struct v11_sdma_mqd *m;
uint32_t sdma_rlc_reg_offset;
uint32_t temp;
unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
m = get_sdma_mqd(mqd);
sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
m->sdma_queue_id);
temp = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL);
temp = temp & ~SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK;
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL, temp);
while (true) {
temp = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_CONTEXT_STATUS);
if (temp & SDMA0_QUEUE0_CONTEXT_STATUS__IDLE_MASK)
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("SDMA RLC not idle in %s\n", __func__);
return -ETIME;
}
usleep_range(500, 1000);
}
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_DOORBELL, 0);
WREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL,
RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_CNTL) |
SDMA0_QUEUE0_RB_CNTL__RB_ENABLE_MASK);
m->sdmax_rlcx_rb_rptr = RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR);
m->sdmax_rlcx_rb_rptr_hi =
RREG32(sdma_rlc_reg_offset + regSDMA0_QUEUE0_RB_RPTR_HI);
return 0;
}
static int wave_control_execute_v11(struct amdgpu_device *adev,
uint32_t gfx_index_val,
uint32_t sq_cmd)
{
uint32_t data = 0;
mutex_lock(&adev->grbm_idx_mutex);
WREG32(SOC15_REG_OFFSET(GC, 0, regGRBM_GFX_INDEX), gfx_index_val);
WREG32(SOC15_REG_OFFSET(GC, 0, regSQ_CMD), sq_cmd);
data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
INSTANCE_BROADCAST_WRITES, 1);
data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
SA_BROADCAST_WRITES, 1);
data = REG_SET_FIELD(data, GRBM_GFX_INDEX,
SE_BROADCAST_WRITES, 1);
WREG32(SOC15_REG_OFFSET(GC, 0, regGRBM_GFX_INDEX), data);
mutex_unlock(&adev->grbm_idx_mutex);
return 0;
}
static void set_vm_context_page_table_base_v11(struct amdgpu_device *adev,
uint32_t vmid, uint64_t page_table_base)
{
if (!amdgpu_amdkfd_is_kfd_vmid(adev, vmid)) {
pr_err("trying to set page table base for wrong VMID %u\n",
vmid);
return;
}
/* SDMA is on gfxhub as well for gfx11 adapters */
adev->gfxhub.funcs->setup_vm_pt_regs(adev, vmid, page_table_base);
}
const struct kfd2kgd_calls gfx_v11_kfd2kgd = {
.program_sh_mem_settings = program_sh_mem_settings_v11,
.set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11,
.init_interrupts = init_interrupts_v11,
.hqd_load = hqd_load_v11,
.hiq_mqd_load = hiq_mqd_load_v11,
.hqd_sdma_load = hqd_sdma_load_v11,
.hqd_dump = hqd_dump_v11,
.hqd_sdma_dump = hqd_sdma_dump_v11,
.hqd_is_occupied = hqd_is_occupied_v11,
.hqd_sdma_is_occupied = hqd_sdma_is_occupied_v11,
.hqd_destroy = hqd_destroy_v11,
.hqd_sdma_destroy = hqd_sdma_destroy_v11,
.wave_control_execute = wave_control_execute_v11,
.get_atc_vmid_pasid_mapping_info = NULL,
.set_vm_context_page_table_base = set_vm_context_page_table_base_v11,
};
......@@ -37,6 +37,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_mqd_manager_vi.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v9.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v10.o \
$(AMDKFD_PATH)/kfd_mqd_manager_v11.o \
$(AMDKFD_PATH)/kfd_kernel_queue.o \
$(AMDKFD_PATH)/kfd_packet_manager.o \
$(AMDKFD_PATH)/kfd_packet_manager_vi.o \
......@@ -47,10 +48,12 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_vi.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v9.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v10.o \
$(AMDKFD_PATH)/kfd_device_queue_manager_v11.o \
$(AMDKFD_PATH)/kfd_interrupt.o \
$(AMDKFD_PATH)/kfd_events.o \
$(AMDKFD_PATH)/cik_event_interrupt.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_int_process_v11.o \
$(AMDKFD_PATH)/kfd_smi_events.o \
$(AMDKFD_PATH)/kfd_crat.o
......
......@@ -1315,6 +1315,8 @@ static int fill_in_l2_l3_pcache(struct crat_subtype_cache *pcache,
return 1;
}
#define KFD_MAX_CACHE_TYPES 6
static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev,
struct kfd_gpu_cache_info *pcache_info)
{
......@@ -1408,6 +1410,7 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
int *num_of_entries)
{
struct kfd_gpu_cache_info *pcache_info;
struct kfd_gpu_cache_info cache_info[KFD_MAX_CACHE_TYPES];
int num_of_cache_types = 0;
int i, j, k;
int ct = 0;
......@@ -1516,6 +1519,11 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev,
pcache_info = yellow_carp_cache_info;
num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info);
break;
case IP_VERSION(11, 0, 0):
pcache_info = cache_info;
num_of_cache_types =
kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info);
break;
default:
return -EINVAL;
}
......
......@@ -53,6 +53,7 @@ extern const struct kfd2kgd_calls arcturus_kfd2kgd;
extern const struct kfd2kgd_calls aldebaran_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v10_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd;
extern const struct kfd2kgd_calls gfx_v11_kfd2kgd;
static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size,
unsigned int chunk_size);
......@@ -60,7 +61,7 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd);
static int kfd_resume(struct kfd_dev *kfd);
static void kfd_device_info_set_sdma_queue_num(struct kfd_dev *kfd)
static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd)
{
uint32_t sdma_version = kfd->adev->ip_versions[SDMA0_HWIP][0];
......@@ -85,6 +86,7 @@ static void kfd_device_info_set_sdma_queue_num(struct kfd_dev *kfd)
case IP_VERSION(5, 2, 2):/* NAVY_FLOUNDER */
case IP_VERSION(5, 2, 4):/* DIMGREY_CAVEFISH */
case IP_VERSION(5, 2, 5):/* BEIGE_GOBY */
case IP_VERSION(6, 0, 0):
kfd->device_info.num_sdma_queues_per_engine = 8;
break;
default:
......@@ -93,6 +95,17 @@ static void kfd_device_info_set_sdma_queue_num(struct kfd_dev *kfd)
sdma_version);
kfd->device_info.num_sdma_queues_per_engine = 8;
}
switch (sdma_version) {
case IP_VERSION(6, 0, 0):
/* Reserve 1 for paging and 1 for gfx */
kfd->device_info.num_reserved_sdma_queues_per_engine = 2;
/* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */
kfd->device_info.reserved_sdma_queues_bitmap = 0xFULL;
break;
default:
break;
}
}
static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
......@@ -121,6 +134,9 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd)
case IP_VERSION(10, 3, 5): /* BEIGE_GOBY */
kfd->device_info.event_interrupt_class = &event_interrupt_class_v9;
break;
case IP_VERSION(11, 0, 0):
kfd->device_info.event_interrupt_class = &event_interrupt_class_v11;
break;
default:
dev_warn(kfd_device, "v9 event interrupt handler is set due to "
"mismatch of gc ip block(GC_HWIP:0x%x).\n", gc_version);
......@@ -145,7 +161,7 @@ static void kfd_device_info_init(struct kfd_dev *kfd,
kfd->device_info.ih_ring_entry_size = 8 * sizeof(uint32_t);
kfd->device_info.supports_cwsr = true;
kfd_device_info_set_sdma_queue_num(kfd);
kfd_device_info_set_sdma_info(kfd);
kfd_device_info_set_event_interrupt_class(kfd);
......@@ -346,6 +362,10 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf)
if (!vf)
f2g = &gfx_v10_3_kfd2kgd;
break;
case IP_VERSION(11, 0, 0):
gfx_target_version = 110000;
f2g = &gfx_v11_kfd2kgd;
break;
default:
break;
}
......
......@@ -35,6 +35,7 @@
#include "cik_regs.h"
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
#include "mes_api_def.h"
/* Size of the per-pipe EOP queue */
#define CIK_HPD_EOP_BYTES_LOG2 11
......@@ -118,6 +119,11 @@ unsigned int get_num_xgmi_sdma_queues(struct device_queue_manager *dqm)
dqm->dev->device_info.num_sdma_queues_per_engine;
}
static inline uint64_t get_reserved_sdma_queues_bitmap(struct device_queue_manager *dqm)
{
return dqm->dev->device_info.reserved_sdma_queues_bitmap;
}
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
......@@ -129,6 +135,151 @@ void program_sh_mem_settings(struct device_queue_manager *dqm,
qpd->sh_mem_bases);
}
static void kfd_hws_hang(struct device_queue_manager *dqm)
{
/*
* Issue a GPU reset if HWS is unresponsive
*/
dqm->is_hws_hang = true;
/* It's possible we're detecting a HWS hang in the
* middle of a GPU reset. No need to schedule another
* reset in this case.
*/
if (!dqm->is_resetting)
schedule_work(&dqm->hw_exception_work);
}
static int convert_to_mes_queue_type(int queue_type)
{
int mes_queue_type;
switch (queue_type) {
case KFD_QUEUE_TYPE_COMPUTE:
mes_queue_type = MES_QUEUE_TYPE_COMPUTE;
break;
case KFD_QUEUE_TYPE_SDMA:
mes_queue_type = MES_QUEUE_TYPE_SDMA;
break;
default:
WARN(1, "Invalid queue type %d", queue_type);
mes_queue_type = -EINVAL;
break;
}
return mes_queue_type;
}
static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
struct kfd_process_device *pdd = qpd_to_pdd(qpd);
struct mes_add_queue_input queue_input;
int r;
if (dqm->is_hws_hang)
return -EIO;
memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
queue_input.process_id = qpd->pqm->process->pasid;
queue_input.page_table_base_addr = qpd->page_table_base;
queue_input.process_va_start = 0;
queue_input.process_va_end = adev->vm_manager.max_pfn - 1;
/* MES unit for quantum is 100ns */
queue_input.process_quantum = KFD_MES_PROCESS_QUANTUM; /* Equivalent to 10ms. */
queue_input.process_context_addr = pdd->proc_ctx_gpu_addr;
queue_input.gang_quantum = KFD_MES_GANG_QUANTUM; /* Equivalent to 1ms */
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
queue_input.inprocess_gang_priority = q->properties.priority;
queue_input.gang_global_priority_level =
AMDGPU_MES_PRIORITY_LEVEL_NORMAL;
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.mqd_addr = q->gart_mqd_addr;
queue_input.wptr_addr = (uint64_t)q->properties.write_ptr;
queue_input.paging = false;
queue_input.tba_addr = qpd->tba_addr;
queue_input.tma_addr = qpd->tma_addr;
queue_input.queue_type = convert_to_mes_queue_type(q->properties.type);
if (queue_input.queue_type < 0) {
pr_err("Queue type not supported with MES, queue:%d\n",
q->properties.type);
return -EINVAL;
}
if (q->gws) {
queue_input.gws_base = 0;
queue_input.gws_size = qpd->num_gws;
}
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
if (r) {
pr_err("failed to add hardware queue to MES, doorbell=0x%x\n",
q->properties.doorbell_off);
pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
return r;
}
static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
int r;
struct mes_remove_queue_input queue_input;
if (dqm->is_hws_hang)
return -EIO;
memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
queue_input.doorbell_offset = q->properties.doorbell_off;
queue_input.gang_context_addr = q->gang_ctx_gpu_addr;
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
if (r) {
pr_err("failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
return r;
}
static int remove_all_queues_mes(struct device_queue_manager *dqm)
{
struct device_process_node *cur;
struct qcm_process_device *qpd;
struct queue *q;
int retval = 0;
list_for_each_entry(cur, &dqm->queues, list) {
qpd = cur->qpd;
list_for_each_entry(q, &qpd->queues_list, list) {
if (q->properties.is_active) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval) {
pr_err("%s: Failed to remove queue %d for dev %d",
__func__,
q->properties.queue_id,
dqm->dev->id);
return retval;
}
}
}
}
return retval;
}
static void increment_queue_count(struct device_queue_manager *dqm,
struct qcm_process_device *qpd,
struct queue *q)
......@@ -659,6 +810,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
bool prev_active = false;
bool add_queue = false;
dqm_lock(dqm);
pdd = kfd_get_process_device_data(q->device, q->process);
......@@ -674,8 +826,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
/* Make sure the queue is unmapped before updating the MQD */
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
retval = unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
if (!dqm->dev->shared_resources.enable_mes)
retval = unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, false);
else if (prev_active)
retval = remove_queue_mes(dqm, q, &pdd->qpd);
if (retval) {
pr_err("unmap queue failed\n");
goto out_unlock;
......@@ -727,9 +883,12 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q,
q->properties.is_gws = false;
}
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS)
retval = map_queues_cpsch(dqm);
else if (q->properties.is_active &&
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) {
if (!dqm->dev->shared_resources.enable_mes)
retval = map_queues_cpsch(dqm);
else if (add_queue)
retval = add_queue_mes(dqm, q, &pdd->qpd);
} else if (q->properties.is_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) {
......@@ -822,12 +981,22 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
q->properties.is_active = false;
decrement_queue_count(dqm, qpd, q);
if (dqm->dev->shared_resources.enable_mes) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval) {
pr_err("Failed to evict queue %d\n",
q->properties.queue_id);
goto out;
}
}
}
pdd->last_evict_timestamp = get_jiffies_64();
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (!dqm->dev->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
qpd->is_debug ?
KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES :
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
out:
dqm_unlock(dqm);
......@@ -951,9 +1120,19 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
q->properties.is_active = true;
increment_queue_count(dqm, &pdd->qpd, q);
if (dqm->dev->shared_resources.enable_mes) {
retval = add_queue_mes(dqm, q, qpd);
if (retval) {
pr_err("Failed to restore queue %d\n",
q->properties.queue_id);
goto out;
}
}
}
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (!dqm->dev->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
qpd->evicted = 0;
eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
......@@ -1081,6 +1260,9 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
memset(dqm->vmid_pasid, 0, sizeof(dqm->vmid_pasid));
dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm));
dqm->sdma_bitmap &= ~(get_reserved_sdma_queues_bitmap(dqm));
pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap);
dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm));
return 0;
......@@ -1277,6 +1459,9 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
else
dqm->sdma_bitmap = (BIT_ULL(num_sdma_queues) - 1);
dqm->sdma_bitmap &= ~(get_reserved_sdma_queues_bitmap(dqm));
pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap);
num_xgmi_sdma_queues = get_num_xgmi_sdma_queues(dqm);
if (num_xgmi_sdma_queues >= BITS_PER_TYPE(dqm->xgmi_sdma_bitmap))
dqm->xgmi_sdma_bitmap = ULLONG_MAX;
......@@ -1295,14 +1480,16 @@ static int start_cpsch(struct device_queue_manager *dqm)
retval = 0;
dqm_lock(dqm);
retval = pm_init(&dqm->packet_mgr, dqm);
if (retval)
goto fail_packet_manager_init;
retval = set_sched_resources(dqm);
if (retval)
goto fail_set_sched_resources;
if (!dqm->dev->shared_resources.enable_mes) {
retval = pm_init(&dqm->packet_mgr, dqm);
if (retval)
goto fail_packet_manager_init;
retval = set_sched_resources(dqm);
if (retval)
goto fail_set_sched_resources;
}
pr_debug("Allocating fence memory\n");
/* allocate fence memory on the gart */
......@@ -1321,13 +1508,15 @@ static int start_cpsch(struct device_queue_manager *dqm)
dqm->is_hws_hang = false;
dqm->is_resetting = false;
dqm->sched_running = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (!dqm->dev->shared_resources.enable_mes)
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
dqm_unlock(dqm);
return 0;
fail_allocate_vidmem:
fail_set_sched_resources:
pm_uninit(&dqm->packet_mgr, false);
if (!dqm->dev->shared_resources.enable_mes)
pm_uninit(&dqm->packet_mgr, false);
fail_packet_manager_init:
dqm_unlock(dqm);
return retval;
......@@ -1343,15 +1532,22 @@ static int stop_cpsch(struct device_queue_manager *dqm)
return 0;
}
if (!dqm->is_hws_hang)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
if (!dqm->is_hws_hang) {
if (!dqm->dev->shared_resources.enable_mes)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, false);
else
remove_all_queues_mes(dqm);
}
hanging = dqm->is_hws_hang || dqm->is_resetting;
dqm->sched_running = false;
pm_release_ib(&dqm->packet_mgr);
if (!dqm->dev->shared_resources.enable_mes)
pm_release_ib(&dqm->packet_mgr);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
pm_uninit(&dqm->packet_mgr, hanging);
if (!dqm->dev->shared_resources.enable_mes)
pm_uninit(&dqm->packet_mgr, hanging);
dqm_unlock(dqm);
return 0;
......@@ -1469,8 +1665,14 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
if (q->properties.is_active) {
increment_queue_count(dqm, qpd, q);
execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (!dqm->dev->shared_resources.enable_mes) {
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
} else {
retval = add_queue_mes(dqm, q, qpd);
if (retval)
goto cleanup_queue;
}
}
/*
......@@ -1485,6 +1687,13 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
dqm_unlock(dqm);
return retval;
cleanup_queue:
qpd->queue_count--;
list_del(&q->list);
if (q->properties.is_active)
decrement_queue_count(dqm, qpd, q);
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
dqm_unlock(dqm);
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
......@@ -1572,13 +1781,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
queue_preemption_timeout_ms);
if (retval) {
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
dqm->is_hws_hang = true;
/* It's possible we're detecting a HWS hang in the
* middle of a GPU reset. No need to schedule another
* reset in this case.
*/
if (!dqm->is_resetting)
schedule_work(&dqm->hw_exception_work);
kfd_hws_hang(dqm);
return retval;
}
......@@ -1683,11 +1886,15 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
list_del(&q->list);
qpd->queue_count--;
if (q->properties.is_active) {
decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
if (!dqm->dev->shared_resources.enable_mes) {
decrement_queue_count(dqm, qpd, q);
retval = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
} else {
retval = remove_queue_mes(dqm, q, qpd);
}
}
/*
......@@ -1941,9 +2148,17 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
deallocate_sdma_queue(dqm, q);
if (q->properties.is_active)
if (q->properties.is_active) {
decrement_queue_count(dqm, qpd, q);
if (dqm->dev->shared_resources.enable_mes) {
retval = remove_queue_mes(dqm, q, qpd);
if (retval)
pr_err("Failed to remove queue %d\n",
q->properties.queue_id);
}
}
dqm->total_queue_count--;
}
......@@ -1958,7 +2173,9 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
}
}
retval = execute_queues_cpsch(dqm, filter, 0);
if (!dqm->dev->shared_resources.enable_mes)
retval = execute_queues_cpsch(dqm, filter, 0);
if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
......@@ -2133,7 +2350,9 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev)
break;
default:
if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1))
if (KFD_GC_VERSION(dev) >= IP_VERSION(11, 0, 0))
device_queue_manager_init_v11(&dqm->asic_ops);
else if (KFD_GC_VERSION(dev) >= IP_VERSION(10, 1, 1))
device_queue_manager_init_v10_navi10(&dqm->asic_ops);
else if (KFD_GC_VERSION(dev) >= IP_VERSION(9, 0, 1))
device_queue_manager_init_v9(&dqm->asic_ops);
......
......@@ -35,6 +35,9 @@
#define VMID_NUM 16
#define KFD_MES_PROCESS_QUANTUM 100000
#define KFD_MES_GANG_QUANTUM 10000
struct device_process_node {
struct qcm_process_device *qpd;
struct list_head list;
......@@ -267,6 +270,8 @@ void device_queue_manager_init_v9(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v10_navi10(
struct device_queue_manager_asic_ops *asic_ops);
void device_queue_manager_init_v11(
struct device_queue_manager_asic_ops *asic_ops);
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
unsigned int get_cp_queues_num(struct device_queue_manager *dqm);
......
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include "kfd_device_queue_manager.h"
#include "gc/gc_11_0_0_offset.h"
#include "gc/gc_11_0_0_sh_mask.h"
#include "soc21_enum.h"
static int update_qpd_v11(struct device_queue_manager *dqm,
struct qcm_process_device *qpd);
static void init_sdma_vm_v11(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd);
void device_queue_manager_init_v11(
struct device_queue_manager_asic_ops *asic_ops)
{
asic_ops->update_qpd = update_qpd_v11;
asic_ops->init_sdma_vm = init_sdma_vm_v11;
asic_ops->mqd_manager_init = mqd_manager_init_v11;
}
static uint32_t compute_sh_mem_bases_64bit(struct kfd_process_device *pdd)
{
uint32_t shared_base = pdd->lds_base >> 48;
uint32_t private_base = pdd->scratch_base >> 48;
return (shared_base << SH_MEM_BASES__SHARED_BASE__SHIFT) |
private_base;
}
static int update_qpd_v11(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct kfd_process_device *pdd;
pdd = qpd_to_pdd(qpd);
/* check if sh_mem_config register already configured */
if (qpd->sh_mem_config == 0) {
qpd->sh_mem_config =
(SH_MEM_ALIGNMENT_MODE_UNALIGNED <<
SH_MEM_CONFIG__ALIGNMENT_MODE__SHIFT) |
(3 << SH_MEM_CONFIG__INITIAL_INST_PREFETCH__SHIFT);
qpd->sh_mem_ape1_limit = 0;
qpd->sh_mem_ape1_base = 0;
}
qpd->sh_mem_bases = compute_sh_mem_bases_64bit(pdd);
pr_debug("sh_mem_bases 0x%X\n", qpd->sh_mem_bases);
return 0;
}
static void init_sdma_vm_v11(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
/* Not needed on SDMAv4 onwards any more */
q->properties.sdma_vm_addr = 0;
}
......@@ -49,9 +49,13 @@
/* # of doorbell bytes allocated for each process. */
size_t kfd_doorbell_process_slice(struct kfd_dev *kfd)
{
return roundup(kfd->device_info.doorbell_size *
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
PAGE_SIZE);
if (!kfd->shared_resources.enable_mes)
return roundup(kfd->device_info.doorbell_size *
KFD_MAX_NUM_OF_QUEUES_PER_PROCESS,
PAGE_SIZE);
else
return amdgpu_mes_doorbell_process_slice(
(struct amdgpu_device *)kfd->adev);
}
/* Doorbell calculations for device init. */
......@@ -61,6 +65,16 @@ int kfd_doorbell_init(struct kfd_dev *kfd)
size_t doorbell_aperture_size;
size_t doorbell_process_limit;
/*
* With MES enabled, just set the doorbell base as it is needed
* to calculate doorbell physical address.
*/
if (kfd->shared_resources.enable_mes) {
kfd->doorbell_base =
kfd->shared_resources.doorbell_physical_address;
return 0;
}
/*
* We start with calculations in bytes because the input data might
* only be byte-aligned.
......@@ -237,10 +251,16 @@ unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd,
* the process's doorbells. The offset returned is in dword
* units regardless of the ASIC-dependent doorbell size.
*/
return kfd->doorbell_base_dw_offset +
pdd->doorbell_index
* kfd_doorbell_process_slice(kfd) / sizeof(u32) +
doorbell_id * kfd->device_info.doorbell_size / sizeof(u32);
if (!kfd->shared_resources.enable_mes)
return kfd->doorbell_base_dw_offset +
pdd->doorbell_index
* kfd_doorbell_process_slice(kfd) / sizeof(u32) +
doorbell_id *
kfd->device_info.doorbell_size / sizeof(u32);
else
return amdgpu_mes_get_doorbell_dw_offset_in_bar(
(struct amdgpu_device *)kfd->adev,
pdd->doorbell_index, doorbell_id);
}
uint64_t kfd_get_number_elems(struct kfd_dev *kfd)
......@@ -261,8 +281,16 @@ phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd)
int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_index)
{
int r = ida_simple_get(&kfd->doorbell_ida, 1, kfd->max_doorbell_slices,
GFP_KERNEL);
int r = 0;
if (!kfd->shared_resources.enable_mes)
r = ida_simple_get(&kfd->doorbell_ida, 1,
kfd->max_doorbell_slices, GFP_KERNEL);
else
r = amdgpu_mes_alloc_process_doorbells(
(struct amdgpu_device *)kfd->adev,
doorbell_index);
if (r > 0)
*doorbell_index = r;
......@@ -271,6 +299,12 @@ int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_inde
void kfd_free_process_doorbells(struct kfd_dev *kfd, unsigned int doorbell_index)
{
if (doorbell_index)
ida_simple_remove(&kfd->doorbell_ida, doorbell_index);
if (doorbell_index) {
if (!kfd->shared_resources.enable_mes)
ida_simple_remove(&kfd->doorbell_ida, doorbell_index);
else
amdgpu_mes_free_process_doorbells(
(struct amdgpu_device *)kfd->adev,
doorbell_index);
}
}
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "kfd_priv.h"
#include "kfd_events.h"
#include "soc15_int.h"
#include "kfd_device_queue_manager.h"
#include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
#include "kfd_smi_events.h"
/*
* GFX11 SQ Interrupts
*
* There are 3 encoding types of interrupts sourced from SQ sent as a 44-bit
* packet to the Interrupt Handler:
* Auto - Generated by the SQG (various cmd overflows, timestamps etc)
* Wave - Generated by S_SENDMSG through a shader program
* Error - HW generated errors (Illegal instructions, Memviols, EDC etc)
*
* The 44-bit packet is mapped as {context_id1[7:0],context_id0[31:0]} plus
* 4-bits for VMID (SOC15_VMID_FROM_IH_ENTRY) as such:
*
* - context_id1[7:6]
* Encoding type (0 = Auto, 1 = Wave, 2 = Error)
*
* - context_id0[26]
* PRIV bit indicates that Wave S_SEND or error occurred within trap
*
* - context_id0[24:0]
* 25-bit data with the following layout per encoding type:
* Auto - only context_id0[8:0] is used, which reports various interrupts
* generated by SQG. The rest is 0.
* Wave - user data sent from m0 via S_SENDMSG (context_id0[23:0])
* Error - Error Type (context_id0[24:21]), Error Details (context_id0[20:0])
*
* The other context_id bits show coordinates (SE/SH/CU/SIMD/WGP) for wave
* S_SENDMSG and Errors. These are 0 for Auto.
*/
enum SQ_INTERRUPT_WORD_ENCODING {
SQ_INTERRUPT_WORD_ENCODING_AUTO = 0x0,
SQ_INTERRUPT_WORD_ENCODING_INST,
SQ_INTERRUPT_WORD_ENCODING_ERROR,
};
enum SQ_INTERRUPT_ERROR_TYPE {
SQ_INTERRUPT_ERROR_TYPE_EDC_FUE = 0x0,
SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST,
SQ_INTERRUPT_ERROR_TYPE_MEMVIOL,
SQ_INTERRUPT_ERROR_TYPE_EDC_FED,
};
/* SQ_INTERRUPT_WORD_AUTO_CTXID */
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE__SHIFT 0
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT__SHIFT 1
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF_FULL__SHIFT 2
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__REG_TIMESTAMP__SHIFT 3
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__CMD_TIMESTAMP__SHIFT 4
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_CMD_OVERFLOW__SHIFT 5
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_REG_OVERFLOW__SHIFT 6
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__IMMED_OVERFLOW__SHIFT 7
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR__SHIFT 8
#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING__SHIFT 6
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_MASK 0x00000001
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__WLT_MASK 0x00000002
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_BUF_FULL_MASK 0x00000004
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__REG_TIMESTAMP_MASK 0x00000008
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__CMD_TIMESTAMP_MASK 0x00000010
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_CMD_OVERFLOW_MASK 0x00000020
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__HOST_REG_OVERFLOW_MASK 0x00000040
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__IMMED_OVERFLOW_MASK 0x00000080
#define SQ_INTERRUPT_WORD_AUTO_CTXID0__THREAD_TRACE_UTC_ERROR_MASK 0x00000100
#define SQ_INTERRUPT_WORD_AUTO_CTXID1__ENCODING_MASK 0x000000c0
/* SQ_INTERRUPT_WORD_WAVE_CTXID */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA__SHIFT 0
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SH_ID__SHIFT 25
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV__SHIFT 26
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID__SHIFT 27
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID__SHIFT 0
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID__SHIFT 2
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING__SHIFT 6
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__DATA_MASK 0x00ffffff /* [23:0] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__SH_ID_MASK 0x02000000 /* [25] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__PRIV_MASK 0x04000000 /* [26] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */
#define SQ_INTERRUPT_WORD_WAVE_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */
/* SQ_INTERRUPT_WORD_ERROR_CTXID */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL__SHIFT 0
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE__SHIFT 21
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SH_ID__SHIFT 25
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV__SHIFT 26
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID__SHIFT 27
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID__SHIFT 0
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID__SHIFT 2
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING__SHIFT 6
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__DETAIL_MASK 0x001fffff /* [20:0] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__TYPE_MASK 0x01e00000 /* [24:21] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__SH_ID_MASK 0x02000000 /* [25] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__PRIV_MASK 0x04000000 /* [26] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID0__WAVE_ID_MASK 0xf8000000 /* [31:27] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__SIMD_ID_MASK 0x00000003 /* [33:32] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__WGP_ID_MASK 0x0000003c /* [37:34] */
#define SQ_INTERRUPT_WORD_ERROR_CTXID1__ENCODING_MASK 0x000000c0 /* [39:38] */
/*
* The debugger will send user data(m0) with PRIV=1 to indicate it requires
* notification from the KFD with the following queue id (DOORBELL_ID) and
* trap code (TRAP_CODE).
*/
#define KFD_CTXID0_TRAP_CODE_SHIFT 10
#define KFD_CTXID0_TRAP_CODE_MASK 0xfffc00
#define KFD_CTXID0_CP_BAD_OP_ECODE_MASK 0x3ffffff
#define KFD_CTXID0_DOORBELL_ID_MASK 0x0003ff
#define KFD_CTXID0_TRAP_CODE(ctxid0) (((ctxid0) & \
KFD_CTXID0_TRAP_CODE_MASK) >> \
KFD_CTXID0_TRAP_CODE_SHIFT)
#define KFD_CTXID0_CP_BAD_OP_ECODE(ctxid0) (((ctxid0) & \
KFD_CTXID0_CP_BAD_OP_ECODE_MASK) >> \
KFD_CTXID0_TRAP_CODE_SHIFT)
#define KFD_CTXID0_DOORBELL_ID(ctxid0) ((ctxid0) & \
KFD_CTXID0_DOORBELL_ID_MASK)
static void print_sq_intr_info_auto(uint32_t context_id0, uint32_t context_id1)
{
pr_debug(
"sq_intr: auto, ttrace %d, wlt %d, ttrace_buf_full %d, reg_tms %d, cmd_tms %d, host_cmd_ovf %d, host_reg_ovf %d, immed_ovf %d, ttrace_utc_err %d\n",
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, WLT),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_BUF_FULL),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, REG_TIMESTAMP),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, CMD_TIMESTAMP),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_CMD_OVERFLOW),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, HOST_REG_OVERFLOW),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, IMMED_OVERFLOW),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_AUTO_CTXID0, THREAD_TRACE_UTC_ERROR));
}
static void print_sq_intr_info_inst(uint32_t context_id0, uint32_t context_id1)
{
pr_debug(
"sq_intr: inst, data 0x%08x, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, DATA),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, SH_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, WAVE_ID),
REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, SIMD_ID),
REG_GET_FIELD(context_id1, SQ_INTERRUPT_WORD_WAVE_CTXID1, WGP_ID));
}
static void print_sq_intr_info_error(uint32_t context_id0, uint32_t context_id1)
{
pr_warn(
"sq_intr: error, detail 0x%08x, type %d, sh %d, priv %d, wave_id %d, simd_id %d, wgp_id %d\n",
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, DETAIL),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, SH_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, PRIV),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID0, WAVE_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, SIMD_ID),
REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_ERROR_CTXID1, WGP_ID));
}
static void event_interrupt_poison_consumption_v11(struct kfd_dev *dev,
uint16_t pasid, uint16_t source_id)
{
int ret = -EINVAL;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
return;
/* all queues of a process will be unmapped in one time */
if (atomic_read(&p->poison)) {
kfd_unref_process(p);
return;
}
atomic_set(&p->poison, 1);
kfd_unref_process(p);
switch (source_id) {
case SOC15_INTSRC_SQ_INTERRUPT_MSG:
if (dev->dqm->ops.reset_queues)
ret = dev->dqm->ops.reset_queues(dev->dqm, pasid);
break;
case SOC21_INTSRC_SDMA_ECC:
default:
break;
}
kfd_signal_poison_consumed_event(dev, pasid);
/* resetting queue passes, do page retirement without gpu reset
resetting queue fails, fallback to gpu reset solution */
if (!ret)
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
else
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
}
static bool event_interrupt_isr_v11(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
bool *patched_flag)
{
uint16_t source_id, client_id, pasid, vmid;
const uint32_t *data = ih_ring_entry;
uint32_t context_id0;
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
/* Only handle interrupts from KFD VMIDs */
vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
if (/*!KFD_IRQ_IS_FENCE(client_id, source_id) &&*/
(vmid < dev->vm_info.first_vmid_kfd ||
vmid > dev->vm_info.last_vmid_kfd))
return 0;
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
if ((source_id == SOC15_INTSRC_CP_END_OF_PIPE) &&
(context_id0 & AMDGPU_FENCE_MES_QUEUE_FLAG))
return 0;
pr_debug("client id 0x%x, source id %d, vmid %d, pasid 0x%x. raw data:\n",
client_id, source_id, vmid, pasid);
pr_debug("%8X, %8X, %8X, %8X, %8X, %8X, %8X, %8X.\n",
data[0], data[1], data[2], data[3],
data[4], data[5], data[6], data[7]);
/* If there is no valid PASID, it's likely a bug */
if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
return 0;
/* Interrupt types we care about: various signals and faults.
* They will be forwarded to a work queue (see below).
*/
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
source_id == SOC21_INTSRC_SDMA_TRAP ||
client_id == SOC21_IH_CLIENTID_VMC ||
((client_id == SOC21_IH_CLIENTID_GFX) &&
(source_id == UTCL2_1_0__SRCID__FAULT)) /*||
KFD_IRQ_IS_FENCE(client_id, source_id)*/;
}
static void event_interrupt_wq_v11(struct kfd_dev *dev,
const uint32_t *ih_ring_entry)
{
uint16_t source_id, client_id, ring_id, pasid, vmid;
uint32_t context_id0, context_id1;
uint8_t sq_int_enc, sq_int_errtype, sq_int_priv;
struct kfd_vm_fault_info info = {0};
struct kfd_hsa_memory_exception_data exception_data;
source_id = SOC15_SOURCE_ID_FROM_IH_ENTRY(ih_ring_entry);
client_id = SOC15_CLIENT_ID_FROM_IH_ENTRY(ih_ring_entry);
ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
pasid = SOC15_PASID_FROM_IH_ENTRY(ih_ring_entry);
vmid = SOC15_VMID_FROM_IH_ENTRY(ih_ring_entry);
context_id0 = SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
context_id1 = SOC15_CONTEXT_ID1_FROM_IH_ENTRY(ih_ring_entry);
/* VMC, UTCL2 */
if (client_id == SOC21_IH_CLIENTID_VMC ||
((client_id == SOC21_IH_CLIENTID_GFX) &&
(source_id == UTCL2_1_0__SRCID__FAULT))) {
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
(uint64_t)(ih_ring_entry[5] & 0xf) << 32;
info.prot_valid = ring_id & 0x08;
info.prot_read = ring_id & 0x10;
info.prot_write = ring_id & 0x20;
memset(&exception_data, 0, sizeof(exception_data));
exception_data.gpu_id = dev->id;
exception_data.va = (info.page_addr) << PAGE_SHIFT;
exception_data.failure.NotPresent = info.prot_valid ? 1 : 0;
exception_data.failure.NoExecute = info.prot_exec ? 1 : 0;
exception_data.failure.ReadOnly = info.prot_write ? 1 : 0;
exception_data.failure.imprecise = 0;
/*kfd_set_dbg_ev_from_interrupt(dev, pasid, -1,
KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION),
&exception_data, sizeof(exception_data));*/
kfd_smi_event_update_vmfault(dev, pasid);
/* GRBM, SDMA, SE, PMM */
} else if (client_id == SOC21_IH_CLIENTID_GRBM_CP ||
client_id == SOC21_IH_CLIENTID_GFX) {
/* CP */
if (source_id == SOC15_INTSRC_CP_END_OF_PIPE)
kfd_signal_event_interrupt(pasid, context_id0, 32);
/*else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE)
kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),
KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)),
NULL, 0);*/
/* SDMA */
else if (source_id == SOC21_INTSRC_SDMA_TRAP)
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
else if (source_id == SOC21_INTSRC_SDMA_ECC) {
event_interrupt_poison_consumption_v11(dev, pasid, source_id);
return;
}
/* SQ */
else if (source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG) {
sq_int_enc = REG_GET_FIELD(context_id1,
SQ_INTERRUPT_WORD_WAVE_CTXID1, ENCODING);
switch (sq_int_enc) {
case SQ_INTERRUPT_WORD_ENCODING_AUTO:
print_sq_intr_info_auto(context_id0, context_id1);
break;
case SQ_INTERRUPT_WORD_ENCODING_INST:
print_sq_intr_info_inst(context_id0, context_id1);
sq_int_priv = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV);
if (sq_int_priv /*&& (kfd_set_dbg_ev_from_interrupt(dev, pasid,
KFD_CTXID0_DOORBELL_ID(context_id0),
KFD_CTXID0_TRAP_CODE(context_id0),
NULL, 0))*/)
return;
break;
case SQ_INTERRUPT_WORD_ENCODING_ERROR:
print_sq_intr_info_error(context_id0, context_id1);
sq_int_errtype = REG_GET_FIELD(context_id0,
SQ_INTERRUPT_WORD_ERROR_CTXID0, TYPE);
if (sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_int_errtype != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
event_interrupt_poison_consumption_v11(
dev, pasid, source_id);
return;
}
break;
default:
break;
}
kfd_signal_event_interrupt(pasid, context_id0 & 0xffffff, 24);
}
/*} else if (KFD_IRQ_IS_FENCE(client_id, source_id)) {
kfd_process_close_interrupt_drain(pasid);*/
}
}
const struct kfd_event_interrupt_class event_interrupt_class_v11 = {
.interrupt_isr = event_interrupt_isr_v11,
.interrupt_wq = event_interrupt_wq_v11,
};
......@@ -90,7 +90,7 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
static void event_interrupt_poison_consumption(struct kfd_dev *dev,
static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev,
uint16_t pasid, uint16_t client_id)
{
int old_poison, ret = -EINVAL;
......@@ -316,7 +316,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
event_interrupt_poison_consumption(dev, pasid, client_id);
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
break;
......@@ -337,7 +337,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
event_interrupt_poison_consumption(dev, pasid, client_id);
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
......@@ -348,7 +348,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
event_interrupt_poison_consumption(dev, pasid, client_id);
event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
......
......@@ -100,7 +100,7 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
{
struct kfd_cu_info cu_info;
uint32_t cu_per_sh[KFD_MAX_NUM_SE][KFD_MAX_NUM_SH_PER_SE] = {0};
int i, se, sh, cu;
int i, se, sh, cu, cu_bitmap_sh_mul;
amdgpu_amdkfd_get_cu_info(mm->dev->adev, &cu_info);
......@@ -120,6 +120,10 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
cu_info.num_shader_arrays_per_engine * cu_info.num_shader_engines);
return;
}
cu_bitmap_sh_mul = (KFD_GC_VERSION(mm->dev) >= IP_VERSION(11, 0, 0) &&
KFD_GC_VERSION(mm->dev) < IP_VERSION(12, 0, 0)) ? 2 : 1;
/* Count active CUs per SH.
*
* Some CUs in an SH may be disabled. HW expects disabled CUs to be
......@@ -129,10 +133,12 @@ void mqd_symmetrically_map_cu_mask(struct mqd_manager *mm,
* Each half of se_mask must be filled only on bits 0-cu_per_sh[se][sh]-1.
*
* See note on Arcturus cu_bitmap layout in gfx_v9_0_get_cu_info.
* See note on GFX11 cu_bitmap layout in gfx_v11_0_get_cu_info.
*/
for (se = 0; se < cu_info.num_shader_engines; se++)
for (sh = 0; sh < cu_info.num_shader_arrays_per_engine; sh++)
cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
cu_per_sh[se][sh] = hweight32(
cu_info.cu_bitmap[se % 4][sh + (se / 4) * cu_bitmap_sh_mul]);
/* Symmetrically map cu_mask to all SEs & SHs:
* se_mask programs up to 2 SH in the upper and lower 16 bits.
......
/*
* Copyright 2021 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include "kfd_priv.h"
#include "kfd_mqd_manager.h"
#include "v11_structs.h"
#include "gc/gc_11_0_0_offset.h"
#include "gc/gc_11_0_0_sh_mask.h"
#include "amdgpu_amdkfd.h"
static inline struct v11_compute_mqd *get_mqd(void *mqd)
{
return (struct v11_compute_mqd *)mqd;
}
static inline struct v11_sdma_mqd *get_sdma_mqd(void *mqd)
{
return (struct v11_sdma_mqd *)mqd;
}
static void update_cu_mask(struct mqd_manager *mm, void *mqd,
struct mqd_update_info *minfo)
{
struct v11_compute_mqd *m;
uint32_t se_mask[KFD_MAX_NUM_SE] = {0};
if (!minfo || (minfo->update_flag != UPDATE_FLAG_CU_MASK) ||
!minfo->cu_mask.ptr)
return;
mqd_symmetrically_map_cu_mask(mm,
minfo->cu_mask.ptr, minfo->cu_mask.count, se_mask);
m = get_mqd(mqd);
m->compute_static_thread_mgmt_se0 = se_mask[0];
m->compute_static_thread_mgmt_se1 = se_mask[1];
m->compute_static_thread_mgmt_se2 = se_mask[2];
m->compute_static_thread_mgmt_se3 = se_mask[3];
m->compute_static_thread_mgmt_se4 = se_mask[4];
m->compute_static_thread_mgmt_se5 = se_mask[5];
m->compute_static_thread_mgmt_se6 = se_mask[6];
m->compute_static_thread_mgmt_se7 = se_mask[7];
pr_debug("update cu mask to %#x %#x %#x %#x %#x %#x %#x %#x\n",
m->compute_static_thread_mgmt_se0,
m->compute_static_thread_mgmt_se1,
m->compute_static_thread_mgmt_se2,
m->compute_static_thread_mgmt_se3,
m->compute_static_thread_mgmt_se4,
m->compute_static_thread_mgmt_se5,
m->compute_static_thread_mgmt_se6,
m->compute_static_thread_mgmt_se7);
}
static void set_priority(struct v11_compute_mqd *m, struct queue_properties *q)
{
m->cp_hqd_pipe_priority = pipe_priority_map[q->priority];
m->cp_hqd_queue_priority = q->priority;
}
static struct kfd_mem_obj *allocate_mqd(struct kfd_dev *kfd,
struct queue_properties *q)
{
struct kfd_mem_obj *mqd_mem_obj;
int size;
/*
* MES write to areas beyond MQD size. So allocate
* 1 PAGE_SIZE memory for MQD is MES is enabled.
*/
if (kfd->shared_resources.enable_mes)
size = PAGE_SIZE;
else
size = sizeof(struct v11_compute_mqd);
if (kfd_gtt_sa_allocate(kfd, size, &mqd_mem_obj))
return NULL;
return mqd_mem_obj;
}
static void init_mqd(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
uint64_t addr;
struct v11_compute_mqd *m;
int size;
m = (struct v11_compute_mqd *) mqd_mem_obj->cpu_ptr;
addr = mqd_mem_obj->gpu_addr;
if (mm->dev->shared_resources.enable_mes)
size = PAGE_SIZE;
else
size = sizeof(struct v11_compute_mqd);
memset(m, 0, size);
m->header = 0xC0310800;
m->compute_pipelinestat_enable = 1;
m->compute_static_thread_mgmt_se0 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF;
m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF;
m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK |
0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT;
m->cp_mqd_control = 1 << CP_MQD_CONTROL__PRIV_STATE__SHIFT;
m->cp_mqd_base_addr_lo = lower_32_bits(addr);
m->cp_mqd_base_addr_hi = upper_32_bits(addr);
m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT |
1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT |
1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT;
if (q->format == KFD_QUEUE_FORMAT_AQL) {
m->cp_hqd_aql_control =
1 << CP_HQD_AQL_CONTROL__CONTROL0__SHIFT;
}
if (mm->dev->cwsr_enabled) {
m->cp_hqd_persistent_state |=
(1 << CP_HQD_PERSISTENT_STATE__QSWITCH_MODE__SHIFT);
m->cp_hqd_ctx_save_base_addr_lo =
lower_32_bits(q->ctx_save_restore_area_address);
m->cp_hqd_ctx_save_base_addr_hi =
upper_32_bits(q->ctx_save_restore_area_address);
m->cp_hqd_ctx_save_size = q->ctx_save_restore_area_size;
m->cp_hqd_cntl_stack_size = q->ctl_stack_size;
m->cp_hqd_cntl_stack_offset = q->ctl_stack_size;
m->cp_hqd_wg_state_offset = q->ctl_stack_size;
}
*mqd = m;
if (gart_addr)
*gart_addr = addr;
mm->update_mqd(mm, m, q, NULL);
}
static int load_mqd(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
struct queue_properties *p, struct mm_struct *mms)
{
int r = 0;
/* AQL write pointer counts in 64B packets, PM4/CP counts in dwords. */
uint32_t wptr_shift = (p->format == KFD_QUEUE_FORMAT_AQL ? 4 : 0);
r = mm->dev->kfd2kgd->hqd_load(mm->dev->adev, mqd, pipe_id, queue_id,
(uint32_t __user *)p->write_ptr,
wptr_shift, 0, mms);
return r;
}
static int hiq_load_mqd_kiq(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
struct queue_properties *p, struct mm_struct *mms)
{
return mm->dev->kfd2kgd->hiq_mqd_load(mm->dev->adev, mqd, pipe_id,
queue_id, p->doorbell_off);
}
static void update_mqd(struct mqd_manager *mm, void *mqd,
struct queue_properties *q,
struct mqd_update_info *minfo)
{
struct v11_compute_mqd *m;
m = get_mqd(mqd);
m->cp_hqd_pq_control = 5 << CP_HQD_PQ_CONTROL__RPTR_BLOCK_SIZE__SHIFT;
m->cp_hqd_pq_control |=
ffs(q->queue_size / sizeof(unsigned int)) - 1 - 1;
pr_debug("cp_hqd_pq_control 0x%x\n", m->cp_hqd_pq_control);
m->cp_hqd_pq_base_lo = lower_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_base_hi = upper_32_bits((uint64_t)q->queue_address >> 8);
m->cp_hqd_pq_rptr_report_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_rptr_report_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->cp_hqd_pq_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
m->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
m->cp_hqd_pq_doorbell_control =
q->doorbell_off <<
CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
pr_debug("cp_hqd_pq_doorbell_control 0x%x\n",
m->cp_hqd_pq_doorbell_control);
m->cp_hqd_ib_control = 3 << CP_HQD_IB_CONTROL__MIN_IB_AVAIL_SIZE__SHIFT;
/*
* HW does not clamp this field correctly. Maximum EOP queue size
* is constrained by per-SE EOP done signal count, which is 8-bit.
* Limit is 0xFF EOP entries (= 0x7F8 dwords). CP will not submit
* more than (EOP entry count - 1) so a queue size of 0x800 dwords
* is safe, giving a maximum field value of 0xA.
*/
m->cp_hqd_eop_control = min(0xA,
ffs(q->eop_ring_buffer_size / sizeof(unsigned int)) - 1 - 1);
m->cp_hqd_eop_base_addr_lo =
lower_32_bits(q->eop_ring_buffer_address >> 8);
m->cp_hqd_eop_base_addr_hi =
upper_32_bits(q->eop_ring_buffer_address >> 8);
m->cp_hqd_iq_timer = 0;
m->cp_hqd_vmid = q->vmid;
if (q->format == KFD_QUEUE_FORMAT_AQL) {
/* GC 10 removed WPP_CLAMP from PQ Control */
m->cp_hqd_pq_control |= CP_HQD_PQ_CONTROL__NO_UPDATE_RPTR_MASK |
2 << CP_HQD_PQ_CONTROL__SLOT_BASED_WPTR__SHIFT |
1 << CP_HQD_PQ_CONTROL__QUEUE_FULL_EN__SHIFT ;
m->cp_hqd_pq_doorbell_control |=
1 << CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_BIF_DROP__SHIFT;
}
if (mm->dev->cwsr_enabled)
m->cp_hqd_ctx_save_control = 0;
update_cu_mask(mm, mqd, minfo);
set_priority(m, q);
q->is_active = QUEUE_IS_ACTIVE(*q);
}
static uint32_t read_doorbell_id(void *mqd)
{
struct v11_compute_mqd *m = (struct v11_compute_mqd *)mqd;
return m->queue_doorbell_id0;
}
static int destroy_mqd(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_destroy
(mm->dev->adev, mqd, type, timeout,
pipe_id, queue_id);
}
static void free_mqd(struct mqd_manager *mm, void *mqd,
struct kfd_mem_obj *mqd_mem_obj)
{
kfd_gtt_sa_free(mm->dev, mqd_mem_obj);
}
static bool is_occupied(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_is_occupied(
mm->dev->adev, queue_address,
pipe_id, queue_id);
}
static int get_wave_state(struct mqd_manager *mm, void *mqd,
void __user *ctl_stack,
u32 *ctl_stack_used_size,
u32 *save_area_used_size)
{
struct v11_compute_mqd *m;
/*struct mqd_user_context_save_area_header header;*/
m = get_mqd(mqd);
/* Control stack is written backwards, while workgroup context data
* is written forwards. Both starts from m->cp_hqd_cntl_stack_size.
* Current position is at m->cp_hqd_cntl_stack_offset and
* m->cp_hqd_wg_state_offset, respectively.
*/
*ctl_stack_used_size = m->cp_hqd_cntl_stack_size -
m->cp_hqd_cntl_stack_offset;
*save_area_used_size = m->cp_hqd_wg_state_offset -
m->cp_hqd_cntl_stack_size;
/* Control stack is not copied to user mode for GFXv11 because
* it's part of the context save area that is already
* accessible to user mode
*/
/*
header.control_stack_size = *ctl_stack_used_size;
header.wave_state_size = *save_area_used_size;
header.wave_state_offset = m->cp_hqd_wg_state_offset;
header.control_stack_offset = m->cp_hqd_cntl_stack_offset;
if (copy_to_user(ctl_stack, &header, sizeof(header)))
return -EFAULT;
*/
return 0;
}
static void init_mqd_hiq(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct v11_compute_mqd *m;
init_mqd(mm, mqd, mqd_mem_obj, gart_addr, q);
m = get_mqd(*mqd);
m->cp_hqd_pq_control |= 1 << CP_HQD_PQ_CONTROL__PRIV_STATE__SHIFT |
1 << CP_HQD_PQ_CONTROL__KMD_QUEUE__SHIFT;
}
static void init_mqd_sdma(struct mqd_manager *mm, void **mqd,
struct kfd_mem_obj *mqd_mem_obj, uint64_t *gart_addr,
struct queue_properties *q)
{
struct v11_sdma_mqd *m;
m = (struct v11_sdma_mqd *) mqd_mem_obj->cpu_ptr;
memset(m, 0, sizeof(struct v11_sdma_mqd));
*mqd = m;
if (gart_addr)
*gart_addr = mqd_mem_obj->gpu_addr;
mm->update_mqd(mm, m, q, NULL);
}
static int load_mqd_sdma(struct mqd_manager *mm, void *mqd,
uint32_t pipe_id, uint32_t queue_id,
struct queue_properties *p, struct mm_struct *mms)
{
return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->adev, mqd,
(uint32_t __user *)p->write_ptr,
mms);
}
#define SDMA_RLC_DUMMY_DEFAULT 0xf
static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
struct queue_properties *q,
struct mqd_update_info *minfo)
{
struct v11_sdma_mqd *m;
m = get_sdma_mqd(mqd);
m->sdmax_rlcx_rb_cntl = (ffs(q->queue_size / sizeof(unsigned int)) - 1)
<< SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT |
q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT |
1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT |
6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT;
m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8);
m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
m->sdmax_rlcx_doorbell_offset =
q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
m->sdma_engine_id = q->sdma_engine_id;
m->sdma_queue_id = q->sdma_queue_id;
m->sdmax_rlcx_dummy_reg = SDMA_RLC_DUMMY_DEFAULT;
q->is_active = QUEUE_IS_ACTIVE(*q);
}
/*
* * preempt type here is ignored because there is only one way
* * to preempt sdma queue
*/
static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd,
enum kfd_preempt_type type,
unsigned int timeout, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->adev, mqd, timeout);
}
static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd,
uint64_t queue_address, uint32_t pipe_id,
uint32_t queue_id)
{
return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->adev, mqd);
}
#if defined(CONFIG_DEBUG_FS)
static int debugfs_show_mqd(struct seq_file *m, void *data)
{
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
data, sizeof(struct v11_compute_mqd), false);
return 0;
}
static int debugfs_show_mqd_sdma(struct seq_file *m, void *data)
{
seq_hex_dump(m, " ", DUMP_PREFIX_OFFSET, 32, 4,
data, sizeof(struct v11_sdma_mqd), false);
return 0;
}
#endif
struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
struct kfd_dev *dev)
{
struct mqd_manager *mqd;
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
mqd = kzalloc(sizeof(*mqd), GFP_KERNEL);
if (!mqd)
return NULL;
mqd->dev = dev;
switch (type) {
case KFD_MQD_TYPE_CP:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd;
mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v11_compute_mqd);
mqd->get_wave_state = get_wave_state;
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
pr_debug("%s@%i\n", __func__, __LINE__);
break;
case KFD_MQD_TYPE_HIQ:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_hiq_mqd;
mqd->init_mqd = init_mqd_hiq;
mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = hiq_load_mqd_kiq;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v11_compute_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
mqd->read_doorbell_id = read_doorbell_id;
pr_debug("%s@%i\n", __func__, __LINE__);
break;
case KFD_MQD_TYPE_DIQ:
mqd->allocate_mqd = allocate_mqd;
mqd->init_mqd = init_mqd_hiq;
mqd->free_mqd = free_mqd;
mqd->load_mqd = load_mqd;
mqd->update_mqd = update_mqd;
mqd->destroy_mqd = destroy_mqd;
mqd->is_occupied = is_occupied;
mqd->mqd_size = sizeof(struct v11_compute_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd;
#endif
break;
case KFD_MQD_TYPE_SDMA:
pr_debug("%s@%i\n", __func__, __LINE__);
mqd->allocate_mqd = allocate_sdma_mqd;
mqd->init_mqd = init_mqd_sdma;
mqd->free_mqd = free_mqd_hiq_sdma;
mqd->load_mqd = load_mqd_sdma;
mqd->update_mqd = update_mqd_sdma;
mqd->destroy_mqd = destroy_mqd_sdma;
mqd->is_occupied = is_occupied_sdma;
mqd->mqd_size = sizeof(struct v11_sdma_mqd);
#if defined(CONFIG_DEBUG_FS)
mqd->debugfs_show_mqd = debugfs_show_mqd_sdma;
#endif
pr_debug("%s@%i\n", __func__, __LINE__);
break;
default:
kfree(mqd);
return NULL;
}
return mqd;
}
......@@ -228,6 +228,8 @@ struct kfd_device_info {
bool needs_pci_atomics;
uint32_t no_atomic_fw_version;
unsigned int num_sdma_queues_per_engine;
unsigned int num_reserved_sdma_queues_per_engine;
uint64_t reserved_sdma_queues_bitmap;
};
unsigned int kfd_get_num_sdma_engines(struct kfd_dev *kdev);
......@@ -564,6 +566,10 @@ struct queue {
/* procfs */
struct kobject kobj;
void *gang_ctx_bo;
uint64_t gang_ctx_gpu_addr;
void *gang_ctx_cpu_ptr;
};
enum KFD_MQD_TYPE {
......@@ -779,6 +785,10 @@ struct kfd_process_device {
* checkpointed node to refer to this device.
*/
uint32_t user_gpu_id;
void *proc_ctx_bo;
uint64_t proc_ctx_gpu_addr;
void *proc_ctx_cpu_ptr;
};
#define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd)
......@@ -1170,6 +1180,8 @@ struct mqd_manager *mqd_manager_init_v9(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_v10(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type,
struct kfd_dev *dev);
struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev);
void device_queue_manager_uninit(struct device_queue_manager *dqm);
struct kernel_queue *kernel_queue_init(struct kfd_dev *dev,
......@@ -1292,6 +1304,7 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd);
/* Events */
extern const struct kfd_event_interrupt_class event_interrupt_class_cik;
extern const struct kfd_event_interrupt_class event_interrupt_class_v9;
extern const struct kfd_event_interrupt_class event_interrupt_class_v11;
extern const struct kfd_device_global_init_class device_global_init_class_cik;
......
......@@ -1041,6 +1041,9 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)
kfd_free_process_doorbells(pdd->dev, pdd->doorbell_index);
if (pdd->dev->shared_resources.enable_mes)
amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
pdd->proc_ctx_bo);
/*
* before destroying pdd, make sure to report availability
* for auto suspend
......@@ -1484,6 +1487,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
struct kfd_process *p)
{
struct kfd_process_device *pdd = NULL;
int retval = 0;
if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
return NULL;
......@@ -1516,6 +1520,21 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev,
pdd->sdma_past_activity_counter = 0;
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);
if (dev->shared_resources.enable_mes) {
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
AMDGPU_MES_PROC_CTX_SIZE,
&pdd->proc_ctx_bo,
&pdd->proc_ctx_gpu_addr,
&pdd->proc_ctx_cpu_ptr,
false);
if (retval) {
pr_err("failed to allocate process context bo\n");
goto err_free_pdd;
}
memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
}
p->pdds[p->n_pdds++] = pdd;
/* Init idr used for memory handle translation */
......
......@@ -198,8 +198,26 @@ static int init_user_queue(struct process_queue_manager *pqm,
(*q)->device = dev;
(*q)->process = pqm->process;
if (dev->shared_resources.enable_mes) {
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
AMDGPU_MES_GANG_CTX_SIZE,
&(*q)->gang_ctx_bo,
&(*q)->gang_ctx_gpu_addr,
&(*q)->gang_ctx_cpu_ptr,
false);
if (retval) {
pr_err("failed to allocate gang context bo\n");
goto cleanup;
}
memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE);
}
pr_debug("PQM After init queue");
return 0;
cleanup:
if (dev->shared_resources.enable_mes)
uninit_queue(*q);
return retval;
}
......@@ -418,6 +436,9 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
pdd->qpd.num_gws = 0;
}
if (dev->shared_resources.enable_mes)
amdgpu_amdkfd_free_gtt_mem(dev->adev,
pqn->q->gang_ctx_bo);
uninit_queue(pqn->q);
}
......
......@@ -1412,7 +1412,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
dev->node_props.num_sdma_xgmi_engines =
kfd_get_num_xgmi_sdma_engines(gpu);
dev->node_props.num_sdma_queues_per_engine =
gpu->device_info.num_sdma_queues_per_engine;
gpu->device_info.num_sdma_queues_per_engine -
gpu->device_info.num_reserved_sdma_queues_per_engine;
dev->node_props.num_gws = (dev->gpu->gws &&
dev->gpu->dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) ?
dev->gpu->adev->gds.gws_size : 0;
......
......@@ -31,7 +31,8 @@
#define SOC15_INTSRC_VMC_FAULT 0
#define SOC15_INTSRC_SDMA_TRAP 224
#define SOC15_INTSRC_SDMA_ECC 220
#define SOC21_INTSRC_SDMA_TRAP 49
#define SOC21_INTSRC_SDMA_ECC 62
#define SOC15_CLIENT_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) & 0xff)
#define SOC15_SOURCE_ID_FROM_IH_ENTRY(entry) (le32_to_cpu(entry[0]) >> 8 & 0xff)
......
......@@ -152,6 +152,7 @@ struct kgd2kfd_shared_resources {
/* Minor device number of the render node */
int drm_render_minor;
bool enable_mes;
};
struct tile_config {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment