Commit a805889a authored by Mukul Joshi, committed by Alex Deucher

drm/amdkfd: Update SDMA queue management for GFX9.4.3

This patch updates SDMA queue management for multi-XCC partitions in GFX9.4.3:
- Allocate/deallocate SDMA queues from the correct SDMA engines
  based on the partition mode.
- Update the kfd2kgd interface to fetch the correct SDMA register
  addresses.
- Fix debugfs to dump the correct SDMA queue info.

v2: squash in fix "drm/amdkfd: Fix XGMI SDMA user-mode queue allocation"
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f38f147a
@@ -31,6 +31,192 @@
 #include "oss/osssys_4_0_sh_mask.h"
 #include "v9_structs.h"
 #include "soc15.h"
+#include "sdma/sdma_4_4_2_offset.h"
+#include "sdma/sdma_4_4_2_sh_mask.h"
+
+static inline struct v9_sdma_mqd *get_sdma_mqd(void *mqd)
+{
+        return (struct v9_sdma_mqd *)mqd;
+}
+
+static uint32_t get_sdma_rlc_reg_offset(struct amdgpu_device *adev,
+                                unsigned int engine_id,
+                                unsigned int queue_id)
+{
+        uint32_t sdma_engine_reg_base =
+                SOC15_REG_OFFSET(SDMA0, engine_id, regSDMA_RLC0_RB_CNTL) -
+                regSDMA_RLC0_RB_CNTL;
+        uint32_t retval = sdma_engine_reg_base +
+                queue_id * (regSDMA_RLC1_RB_CNTL - regSDMA_RLC0_RB_CNTL);
+
+        pr_debug("RLC register offset for SDMA%d RLC%d: 0x%x\n", engine_id,
+                 queue_id, retval);
+
+        return retval;
+}
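Note: the helper derives a per-engine register aperture by subtracting the relative RLC0_RB_CNTL offset from its absolute SOC15 address, then steps by the RLC1-RLC0 register-bank stride per queue. A quick sanity check with made-up numbers (these are not the real register values):

        /* Hypothetical: SOC15_REG_OFFSET(SDMA0, 1, regSDMA_RLC0_RB_CNTL) = 0x11980,
         * regSDMA_RLC0_RB_CNTL = 0x980, regSDMA_RLC1_RB_CNTL = 0x9d8.
         * base = 0x11980 - 0x980 = 0x11000                 (engine 1 aperture)
         * RLC2 = 0x11000 + 2 * (0x9d8 - 0x980) = 0x110b0
         * so RLC2's RB_CNTL is accessed at 0x110b0 + regSDMA_RLC0_RB_CNTL = 0x11a30.
         */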
+int kgd_gfx_v9_4_3_hqd_sdma_load(struct amdgpu_device *adev, void *mqd,
+                        uint32_t __user *wptr, struct mm_struct *mm)
+{
+        struct v9_sdma_mqd *m;
+        uint32_t sdma_rlc_reg_offset;
+        unsigned long end_jiffies;
+        uint32_t data;
+        uint64_t data64;
+        uint64_t __user *wptr64 = (uint64_t __user *)wptr;
+
+        m = get_sdma_mqd(mqd);
+        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                        m->sdma_queue_id);
+
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
+                m->sdmax_rlcx_rb_cntl & (~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK));
+
+        end_jiffies = msecs_to_jiffies(2000) + jiffies;
+        while (true) {
+                data = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
+                if (data & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
+                        break;
+                if (time_after(jiffies, end_jiffies)) {
+                        pr_err("SDMA RLC not idle in %s\n", __func__);
+                        return -ETIME;
+                }
+                usleep_range(500, 1000);
+        }
+
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL_OFFSET,
+                m->sdmax_rlcx_doorbell_offset);
+
+        data = REG_SET_FIELD(m->sdmax_rlcx_doorbell, SDMA_RLC0_DOORBELL,
+                             ENABLE, 1);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, data);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR,
+                m->sdmax_rlcx_rb_rptr);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI,
+                m->sdmax_rlcx_rb_rptr_hi);
+
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 1);
+        if (read_user_wptr(mm, wptr64, data64)) {
+                WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
+                        lower_32_bits(data64));
+                WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
+                        upper_32_bits(data64));
+        } else {
+                WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR,
+                        m->sdmax_rlcx_rb_rptr);
+                WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_WPTR_HI,
+                        m->sdmax_rlcx_rb_rptr_hi);
+        }
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_MINOR_PTR_UPDATE, 0);
+
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE, m->sdmax_rlcx_rb_base);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_BASE_HI,
+                m->sdmax_rlcx_rb_base_hi);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_LO,
+                m->sdmax_rlcx_rb_rptr_addr_lo);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_ADDR_HI,
+                m->sdmax_rlcx_rb_rptr_addr_hi);
+
+        data = REG_SET_FIELD(m->sdmax_rlcx_rb_cntl, SDMA_RLC0_RB_CNTL,
+                             RB_ENABLE, 1);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, data);
+
+        return 0;
+}
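Note: read_user_wptr() (a macro from kfd_priv.h) evaluates to true only if the 64-bit user-mode write pointer could be read safely from the given mm; on failure the queue is loaded with wptr == rptr, so it starts out empty rather than replaying stale ring contents. A simplified sketch of the single-mm case (not the actual macro body, which also handles kernel threads temporarily adopting the mm):

        /* Roughly: only dereference the user pointer when the target mm is
         * the current task's, and treat a get_user() fault as failure. */
        bool ok = false;

        if (mm && wptr64 && mm == current->mm)
                ok = !get_user(data64, wptr64);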
+int kgd_gfx_v9_4_3_hqd_sdma_dump(struct amdgpu_device *adev,
+                        uint32_t engine_id, uint32_t queue_id,
+                        uint32_t (**dump)[2], uint32_t *n_regs)
+{
+        uint32_t sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev,
+                        engine_id, queue_id);
+        uint32_t i = 0, reg;
+#undef HQD_N_REGS
+#define HQD_N_REGS (19+6+7+12)
+#define DUMP_REG(addr) do {                             \
+                if (WARN_ON_ONCE(i >= HQD_N_REGS))      \
+                        break;                          \
+                (*dump)[i][0] = (addr) << 2;            \
+                (*dump)[i++][1] = RREG32(addr);         \
+        } while (0)
+
+        *dump = kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t), GFP_KERNEL);
+        if (*dump == NULL)
+                return -ENOMEM;
+
+        for (reg = regSDMA_RLC0_RB_CNTL; reg <= regSDMA_RLC0_DOORBELL; reg++)
+                DUMP_REG(sdma_rlc_reg_offset + reg);
+        for (reg = regSDMA_RLC0_STATUS; reg <= regSDMA_RLC0_CSA_ADDR_HI; reg++)
+                DUMP_REG(sdma_rlc_reg_offset + reg);
+        for (reg = regSDMA_RLC0_IB_SUB_REMAIN;
+             reg <= regSDMA_RLC0_MINOR_PTR_UPDATE; reg++)
+                DUMP_REG(sdma_rlc_reg_offset + reg);
+        for (reg = regSDMA_RLC0_MIDCMD_DATA0;
+             reg <= regSDMA_RLC0_MIDCMD_CNTL; reg++)
+                DUMP_REG(sdma_rlc_reg_offset + reg);
+
+        WARN_ON_ONCE(i != HQD_N_REGS);
+        *n_regs = i;
+
+        return 0;
+}
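Note: HQD_N_REGS works out to 19+6+7+12 = 44 entries, matching the four register ranges dumped above (presumably 19, 6, 7 and 12 registers wide; the enum values themselves are not shown here). Each entry is an (offset, value) pair, so kmalloc_array(HQD_N_REGS * 2, sizeof(uint32_t)) allocates exactly 44 pairs, and DUMP_REG's (addr) << 2 converts the dword register offset into the byte address reported to debugfs.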
+bool kgd_gfx_v9_4_3_hqd_sdma_is_occupied(struct amdgpu_device *adev, void *mqd)
+{
+        struct v9_sdma_mqd *m;
+        uint32_t sdma_rlc_reg_offset;
+        uint32_t sdma_rlc_rb_cntl;
+
+        m = get_sdma_mqd(mqd);
+        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                        m->sdma_queue_id);
+        sdma_rlc_rb_cntl = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
+
+        if (sdma_rlc_rb_cntl & SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK)
+                return true;
+
+        return false;
+}
+int kgd_gfx_v9_4_3_hqd_sdma_destroy(struct amdgpu_device *adev, void *mqd,
+                        unsigned int utimeout)
+{
+        struct v9_sdma_mqd *m;
+        uint32_t sdma_rlc_reg_offset;
+        uint32_t temp;
+        unsigned long end_jiffies = (utimeout * HZ / 1000) + jiffies;
+
+        m = get_sdma_mqd(mqd);
+        sdma_rlc_reg_offset = get_sdma_rlc_reg_offset(adev, m->sdma_engine_id,
+                                        m->sdma_queue_id);
+
+        temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL);
+        temp = temp & ~SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK;
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL, temp);
+
+        while (true) {
+                temp = RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_CONTEXT_STATUS);
+                if (temp & SDMA_RLC0_CONTEXT_STATUS__IDLE_MASK)
+                        break;
+                if (time_after(jiffies, end_jiffies)) {
+                        pr_err("SDMA RLC not idle in %s\n", __func__);
+                        return -ETIME;
+                }
+                usleep_range(500, 1000);
+        }
+
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_DOORBELL, 0);
+        WREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL,
+                RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_CNTL) |
+                SDMA_RLC0_RB_CNTL__RB_ENABLE_MASK);
+
+        m->sdmax_rlcx_rb_rptr =
+                RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR);
+        m->sdmax_rlcx_rb_rptr_hi =
+                RREG32(sdma_rlc_reg_offset + regSDMA_RLC0_RB_RPTR_HI);
+
+        return 0;
+}
+
 static int kgd_gfx_v9_4_3_set_pasid_vmid_mapping(struct amdgpu_device *adev,
                 u32 pasid, unsigned int vmid, uint32_t inst)
@@ -166,13 +352,13 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = {
 	.init_interrupts = kgd_gfx_v9_init_interrupts,
 	.hqd_load = kgd_gfx_v9_4_3_hqd_load,
 	.hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
-	.hqd_sdma_load = kgd_arcturus_hqd_sdma_load,
+	.hqd_sdma_load = kgd_gfx_v9_4_3_hqd_sdma_load,
 	.hqd_dump = kgd_gfx_v9_hqd_dump,
-	.hqd_sdma_dump = kgd_arcturus_hqd_sdma_dump,
+	.hqd_sdma_dump = kgd_gfx_v9_4_3_hqd_sdma_dump,
 	.hqd_is_occupied = kgd_gfx_v9_hqd_is_occupied,
-	.hqd_sdma_is_occupied = kgd_arcturus_hqd_sdma_is_occupied,
+	.hqd_sdma_is_occupied = kgd_gfx_v9_4_3_hqd_sdma_is_occupied,
 	.hqd_destroy = kgd_gfx_v9_hqd_destroy,
-	.hqd_sdma_destroy = kgd_arcturus_hqd_sdma_destroy,
+	.hqd_sdma_destroy = kgd_gfx_v9_4_3_hqd_sdma_destroy,
 	.wave_control_execute = kgd_gfx_v9_wave_control_execute,
 	.get_atc_vmid_pasid_mapping_info =
 			kgd_gfx_v9_get_atc_vmid_pasid_mapping_info,
@@ -741,6 +741,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 		if (!node)
 			goto node_alloc_error;
 		node->node_id = i;
+		node->adev = kfd->adev;
 		node->kfd = kfd;
 		node->kfd2kgd = kfd->kfd2kgd;
@@ -1323,15 +1324,16 @@ unsigned int kfd_get_num_sdma_engines(struct kfd_node *node)
 {
 	/* If XGMI is not supported, all SDMA engines are PCIe */
 	if (!node->adev->gmc.xgmi.supported)
-		return node->adev->sdma.num_instances;
+		return node->adev->sdma.num_instances/(int)node->kfd->num_nodes;

-	return min(node->adev->sdma.num_instances, 2);
+	return min(node->adev->sdma.num_instances/(int)node->kfd->num_nodes, 2);
 }

 unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
 {
 	/* After reserved for PCIe, the rest of engines are XGMI */
-	return node->adev->sdma.num_instances - kfd_get_num_sdma_engines(node);
+	return node->adev->sdma.num_instances/(int)node->kfd->num_nodes -
+			kfd_get_num_sdma_engines(node);
 }
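Note: with partitioning, both helpers now return per-node counts. A worked example with hypothetical numbers: a device exposing adev->sdma.num_instances = 16 split into kfd->num_nodes = 4 partitions has 16/4 = 4 SDMA engines per node; min(4, 2) = 2 of those are reserved as PCIe-optimized engines, leaving 4 - 2 = 2 XGMI engines per node.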
 #if defined(CONFIG_DEBUG_FS)
@@ -124,6 +124,15 @@ static inline uint64_t get_reserved_sdma_queues_bitmap(struct device_queue_manager *dqm)
 	return dqm->dev->kfd->device_info.reserved_sdma_queues_bitmap;
 }

+static void init_sdma_bitmaps(struct device_queue_manager *dqm)
+{
+	bitmap_zero(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	bitmap_set(dqm->sdma_bitmap, 0, get_num_sdma_queues(dqm));
+
+	bitmap_zero(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	bitmap_set(dqm->xgmi_sdma_bitmap, 0, get_num_xgmi_sdma_queues(dqm));
+}
+
 void program_sh_mem_settings(struct device_queue_manager *dqm,
 				struct qcm_process_device *qpd)
 {
@@ -1268,24 +1277,6 @@ static void init_interrupts(struct device_queue_manager *dqm)
 	}
 }

-static void init_sdma_bitmaps(struct device_queue_manager *dqm)
-{
-	unsigned int num_sdma_queues =
-		min_t(unsigned int, sizeof(dqm->sdma_bitmap)*8,
-		      get_num_sdma_queues(dqm));
-	unsigned int num_xgmi_sdma_queues =
-		min_t(unsigned int, sizeof(dqm->xgmi_sdma_bitmap)*8,
-		      get_num_xgmi_sdma_queues(dqm));
-
-	if (num_sdma_queues)
-		dqm->sdma_bitmap = GENMASK_ULL(num_sdma_queues-1, 0);
-	if (num_xgmi_sdma_queues)
-		dqm->xgmi_sdma_bitmap = GENMASK_ULL(num_xgmi_sdma_queues-1, 0);
-
-	dqm->sdma_bitmap &= ~get_reserved_sdma_queues_bitmap(dqm);
-	pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap);
-}
-
 static int initialize_nocpsch(struct device_queue_manager *dqm)
 {
 	int pipe, queue;
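Note: the removed implementation kept each bitmap in a plain u64, so GENMASK_ULL() capped it at 64 queue slots per type; the bitmap_*() version added near the top of the file scales to KFD_MAX_SDMA_QUEUES (128), which partitioned multi-XCC devices can now need. A minimal standalone sketch of the new allocate/free pattern (96 slots is a made-up number, purely for illustration):

        DECLARE_BITMAP(map, KFD_MAX_SDMA_QUEUES);       /* 128 bits */
        unsigned long bit;

        bitmap_zero(map, KFD_MAX_SDMA_QUEUES);
        bitmap_set(map, 0, 96);         /* 96 free slots would overflow a u64 */
        bit = find_first_bit(map, KFD_MAX_SDMA_QUEUES); /* -> 0 */
        clear_bit(bit, map);            /* allocate slot 0 */
        set_bit(bit, map);              /* free it again */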
@@ -1375,46 +1366,49 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 	int bit;

 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
-		if (dqm->sdma_bitmap == 0) {
+		if (bitmap_empty(dqm->sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
 			pr_err("No more SDMA queue to allocate\n");
 			return -ENOMEM;
 		}

 		if (restore_sdma_id) {
 			/* Re-use existing sdma_id */
-			if (!(dqm->sdma_bitmap & (1ULL << *restore_sdma_id))) {
+			if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
 				pr_err("SDMA queue already in use\n");
 				return -EBUSY;
 			}
-			dqm->sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+			clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
 			q->sdma_id = *restore_sdma_id;
 		} else {
 			/* Find first available sdma_id */
-			bit = __ffs64(dqm->sdma_bitmap);
-			dqm->sdma_bitmap &= ~(1ULL << bit);
+			bit = find_first_bit(dqm->sdma_bitmap,
+					     get_num_sdma_queues(dqm));
+			clear_bit(bit, dqm->sdma_bitmap);
 			q->sdma_id = bit;
 		}

-		q->properties.sdma_engine_id = q->sdma_id %
-				kfd_get_num_sdma_engines(dqm->dev);
+		q->properties.sdma_engine_id =
+			dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
+			q->sdma_id % kfd_get_num_sdma_engines(dqm->dev);
 		q->properties.sdma_queue_id = q->sdma_id /
 				kfd_get_num_sdma_engines(dqm->dev);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
-		if (dqm->xgmi_sdma_bitmap == 0) {
+		if (bitmap_empty(dqm->xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES)) {
 			pr_err("No more XGMI SDMA queue to allocate\n");
 			return -ENOMEM;
 		}
 		if (restore_sdma_id) {
 			/* Re-use existing sdma_id */
-			if (!(dqm->xgmi_sdma_bitmap & (1ULL << *restore_sdma_id))) {
+			if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
 				pr_err("SDMA queue already in use\n");
 				return -EBUSY;
 			}
-			dqm->xgmi_sdma_bitmap &= ~(1ULL << *restore_sdma_id);
+			clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
 			q->sdma_id = *restore_sdma_id;
 		} else {
-			bit = __ffs64(dqm->xgmi_sdma_bitmap);
-			dqm->xgmi_sdma_bitmap &= ~(1ULL << bit);
+			bit = find_first_bit(dqm->xgmi_sdma_bitmap,
+					     get_num_xgmi_sdma_queues(dqm));
+			clear_bit(bit, dqm->xgmi_sdma_bitmap);
 			q->sdma_id = bit;
 		}
 		/* sdma_engine_id is sdma id including
@@ -1424,6 +1418,7 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
 		 * PCIe-optimized ones
 		 */
 		q->properties.sdma_engine_id =
+			dqm->dev->node_id * get_num_all_sdma_engines(dqm) +
 			kfd_get_num_sdma_engines(dqm->dev) +
 			q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
 		q->properties.sdma_queue_id = q->sdma_id /
@@ -1442,11 +1437,11 @@ static void deallocate_sdma_queue(struct device_queue_manager *dqm,
 	if (q->properties.type == KFD_QUEUE_TYPE_SDMA) {
 		if (q->sdma_id >= get_num_sdma_queues(dqm))
 			return;
-		dqm->sdma_bitmap |= (1ULL << q->sdma_id);
+		set_bit(q->sdma_id, dqm->sdma_bitmap);
 	} else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) {
 		if (q->sdma_id >= get_num_xgmi_sdma_queues(dqm))
 			return;
-		dqm->xgmi_sdma_bitmap |= (1ULL << q->sdma_id);
+		set_bit(q->sdma_id, dqm->xgmi_sdma_bitmap);
 	}
 }
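Note: sdma_engine_id is now a device-global engine index rather than a node-local one. A worked example with hypothetical counts: if get_num_all_sdma_engines() returns 4 engines per node (say 2 PCIe-optimized plus 2 XGMI), then on node_id = 1 an XGMI queue with sdma_id = 3 gets sdma_engine_id = 1*4 + 2 + (3 % 2) = 7 and sdma_queue_id = 3 / 2 = 1, i.e. queue 1 on the second XGMI engine of the second partition.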
@@ -239,8 +239,8 @@ struct device_queue_manager {
 	unsigned int		total_queue_count;
 	unsigned int		next_pipe_to_allocate;
 	unsigned int		*allocated_queues;
-	uint64_t		sdma_bitmap;
-	uint64_t		xgmi_sdma_bitmap;
+	DECLARE_BITMAP(sdma_bitmap, KFD_MAX_SDMA_QUEUES);
+	DECLARE_BITMAP(xgmi_sdma_bitmap, KFD_MAX_SDMA_QUEUES);
 	/* the pasid mapping for each kfd vmid */
 	uint16_t		vmid_pasid[VMID_NUM];
 	uint64_t		pipelines_addr;
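Note: DECLARE_BITMAP sizes the arrays at compile time, so no separate allocation or cleanup is needed. Its definition in include/linux/types.h is, paraphrased:

        /* one unsigned long per word of bits, so each 128-bit queue bitmap
         * here occupies two unsigned longs on a 64-bit kernel */
        #define DECLARE_BITMAP(name, bits) \
                unsigned long name[BITS_TO_LONGS(bits)]

test_bit()/set_bit()/clear_bit()/find_first_bit() in the queue-manager changes above all operate directly on this array layout.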
@@ -113,6 +113,8 @@
 #define KFD_UNMAP_LATENCY_MS	(4000)

+#define KFD_MAX_SDMA_QUEUES	128
+
 /*
  * 512 = 0x200
  * The doorbell index distance between SDMA RLC (2*i) and (2*i+1) in the
@@ -260,6 +262,7 @@ struct kfd_vmid_info {
 struct kfd_dev;

 struct kfd_node {
 	unsigned int node_id;
+	struct amdgpu_device *adev;	/* Duplicated here along with keeping
+					 * a copy in kfd_dev to save a hop
+					 */