Commit 3a87177e authored by Harish Kasiviswanathan, committed by Oded Gabbay

drm/amdkfd: Add topology support for dGPUs

Generate and parse VCRAT tables for dGPUs in kfd_topology_add_device.

Some information that isn't available in the CRAT table is patched
into the topology after parsing.

HSA_CAP_DOORBELL_TYPE_1_0 is dependent on the ASIC feature
CP_HQD_PQ_CONTROL.SLOT_BASED_WPTR, which was not introduced in VI
until Carrizo. Report HSA_CAP_DOORBELL_TYPE_PRE_1_0 on Tonga ASICs.

v2: Added #include <linux/pci.h> to kfd_crat.c to make it compile
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Signed-off-by: Ben Goz <ben.goz@amd.com>
Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Kent Russell <kent.russell@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Acked-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent 520b8fb7
This diff is collapsed.
...@@ -109,7 +109,7 @@ struct crat_subtype_computeunit { ...@@ -109,7 +109,7 @@ struct crat_subtype_computeunit {
uint8_t wave_front_size; uint8_t wave_front_size;
uint8_t num_banks; uint8_t num_banks;
uint16_t micro_engine_id; uint16_t micro_engine_id;
uint8_t num_arrays; uint8_t array_count;
uint8_t num_cu_per_array; uint8_t num_cu_per_array;
uint8_t num_simd_per_cu; uint8_t num_simd_per_cu;
uint8_t max_slots_scatch_cu; uint8_t max_slots_scatch_cu;
...@@ -137,7 +137,8 @@ struct crat_subtype_memory { ...@@ -137,7 +137,8 @@ struct crat_subtype_memory {
uint32_t length_low; uint32_t length_low;
uint32_t length_high; uint32_t length_high;
uint32_t width; uint32_t width;
uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH]; uint8_t visibility_type; /* for virtual (dGPU) CRAT */
uint8_t reserved2[CRAT_MEMORY_RESERVED_LENGTH - 1];
}; };
/* /*
......
...@@ -668,6 +668,8 @@ int kfd_topology_init(void); ...@@ -668,6 +668,8 @@ int kfd_topology_init(void);
void kfd_topology_shutdown(void); void kfd_topology_shutdown(void);
int kfd_topology_add_device(struct kfd_dev *gpu); int kfd_topology_add_device(struct kfd_dev *gpu);
int kfd_topology_remove_device(struct kfd_dev *gpu); int kfd_topology_remove_device(struct kfd_dev *gpu);
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain);
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id); struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev); struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev); int kfd_topology_enum_kfd_devices(uint8_t idx, struct kfd_dev **kdev);
......
...@@ -43,6 +43,25 @@ static struct kfd_system_properties sys_props; ...@@ -43,6 +43,25 @@ static struct kfd_system_properties sys_props;
static DECLARE_RWSEM(topology_lock); static DECLARE_RWSEM(topology_lock);
static atomic_t topology_crat_proximity_domain; static atomic_t topology_crat_proximity_domain;
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
uint32_t proximity_domain)
{
struct kfd_topology_device *top_dev;
struct kfd_topology_device *device = NULL;
down_read(&topology_lock);
list_for_each_entry(top_dev, &topology_device_list, list)
if (top_dev->proximity_domain == proximity_domain) {
device = top_dev;
break;
}
up_read(&topology_lock);
return device;
}
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id) struct kfd_dev *kfd_device_by_id(uint32_t gpu_id)
{ {
struct kfd_topology_device *top_dev; struct kfd_topology_device *top_dev;
...@@ -79,6 +98,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev) ...@@ -79,6 +98,7 @@ struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev)
return device; return device;
} }
/* Called with write topology_lock acquired */
static void kfd_release_topology_device(struct kfd_topology_device *dev) static void kfd_release_topology_device(struct kfd_topology_device *dev)
{ {
struct kfd_mem_properties *mem; struct kfd_mem_properties *mem;
...@@ -394,8 +414,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, ...@@ -394,8 +414,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr,
} }
sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute",
dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz( dev->node_props.max_engine_clk_fcompute);
dev->gpu->kgd));
sysfs_show_64bit_prop(buffer, "local_mem_size", sysfs_show_64bit_prop(buffer, "local_mem_size",
(unsigned long long int) 0); (unsigned long long int) 0);
...@@ -597,6 +616,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, ...@@ -597,6 +616,7 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev,
return 0; return 0;
} }
/* Called with write topology lock acquired */
static int kfd_build_sysfs_node_tree(void) static int kfd_build_sysfs_node_tree(void)
{ {
struct kfd_topology_device *dev; struct kfd_topology_device *dev;
...@@ -613,6 +633,7 @@ static int kfd_build_sysfs_node_tree(void) ...@@ -613,6 +633,7 @@ static int kfd_build_sysfs_node_tree(void)
return 0; return 0;
} }
/* Called with write topology lock acquired */
static void kfd_remove_sysfs_node_tree(void) static void kfd_remove_sysfs_node_tree(void)
{ {
struct kfd_topology_device *dev; struct kfd_topology_device *dev;
...@@ -908,19 +929,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu) ...@@ -908,19 +929,26 @@ static uint32_t kfd_generate_gpu_id(struct kfd_dev *gpu)
return hashout; return hashout;
} }
/* kfd_assign_gpu - Attach @gpu to the correct kfd topology device. If
* the GPU device is not already present in the topology device
* list then return NULL. This means a new topology device has to
* be created for this GPU.
* TODO: Rather than assigning @gpu to the first topology device without
* a gpu attached, it would be better to have a more stringent check.
*/
static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu) static struct kfd_topology_device *kfd_assign_gpu(struct kfd_dev *gpu)
{ {
struct kfd_topology_device *dev; struct kfd_topology_device *dev;
struct kfd_topology_device *out_dev = NULL; struct kfd_topology_device *out_dev = NULL;
down_write(&topology_lock);
list_for_each_entry(dev, &topology_device_list, list) list_for_each_entry(dev, &topology_device_list, list)
if (!dev->gpu && (dev->node_props.simd_count > 0)) { if (!dev->gpu && (dev->node_props.simd_count > 0)) {
dev->gpu = gpu; dev->gpu = gpu;
out_dev = dev; out_dev = dev;
break; break;
} }
up_write(&topology_lock);
return out_dev; return out_dev;
} }
...@@ -932,6 +960,45 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival) ...@@ -932,6 +960,45 @@ static void kfd_notify_gpu_change(uint32_t gpu_id, int arrival)
*/ */
} }
/* kfd_fill_mem_clk_max_info - Since CRAT doesn't have memory clock info,
 *	patch this after CRAT parsing.
 * @dev: topology device whose memory banks are patched; nothing is done
 *	when @dev is NULL or has no GPU attached
 *
 * Queries the local memory info through the kfd2kgd interface of the
 * attached GPU and copies its mem_clk_max into every entry of
 * @dev->mem_props.
 */
static void kfd_fill_mem_clk_max_info(struct kfd_topology_device *dev)
{
	struct kfd_mem_properties *mem;
	struct kfd_local_mem_info local_mem_info;

	/* dev->gpu is dereferenced below, so guard it as well as dev; this
	 * matches the check done by kfd_fill_iolink_non_crat_info().
	 */
	if (!dev || !dev->gpu)
		return;

	/* Currently, amdgpu driver (amdgpu_mc) deals only with GPUs with
	 *	single bank of VRAM local memory.
	 * for dGPUs - VCRAT reports only one bank of Local Memory
	 * for APUs - If CRAT from ACPI reports more than one bank, then
	 *	all the banks will report the same mem_clk_max information
	 */
	dev->gpu->kfd2kgd->get_local_mem_info(dev->gpu->kgd,
		&local_mem_info);

	list_for_each_entry(mem, &dev->mem_props, list)
		mem->mem_clk_max = local_mem_info.mem_clk_max;
}
static void kfd_fill_iolink_non_crat_info(struct kfd_topology_device *dev)
{
struct kfd_iolink_properties *link;
if (!dev || !dev->gpu)
return;
/* GPU only creates direck links so apply flags setting to all */
if (dev->gpu->device_info->asic_family == CHIP_HAWAII)
list_for_each_entry(link, &dev->io_link_props, list)
link->flags = CRAT_IOLINK_FLAGS_ENABLED |
CRAT_IOLINK_FLAGS_NO_ATOMICS_32_BIT |
CRAT_IOLINK_FLAGS_NO_ATOMICS_64_BIT;
}
int kfd_topology_add_device(struct kfd_dev *gpu) int kfd_topology_add_device(struct kfd_dev *gpu)
{ {
uint32_t gpu_id; uint32_t gpu_id;
...@@ -939,6 +1006,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu) ...@@ -939,6 +1006,9 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
struct kfd_cu_info cu_info; struct kfd_cu_info cu_info;
int res = 0; int res = 0;
struct list_head temp_topology_device_list; struct list_head temp_topology_device_list;
void *crat_image = NULL;
size_t image_size = 0;
int proximity_domain;
INIT_LIST_HEAD(&temp_topology_device_list); INIT_LIST_HEAD(&temp_topology_device_list);
...@@ -946,27 +1016,33 @@ int kfd_topology_add_device(struct kfd_dev *gpu) ...@@ -946,27 +1016,33 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id); pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
/* proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
* Try to assign the GPU to existing topology device (generated from
* CRAT table /* Check to see if this gpu device exists in the topology_device_list.
* If so, assign the gpu to that device,
* else create a Virtual CRAT for this gpu device and then parse that
* CRAT to create a new topology device. Once created assign the gpu to
* that topology device
*/ */
dev = kfd_assign_gpu(gpu); dev = kfd_assign_gpu(gpu);
if (!dev) { if (!dev) {
pr_info("GPU was not found in the current topology. Extending.\n"); res = kfd_create_crat_image_virtual(&crat_image, &image_size,
kfd_debug_print_topology(); COMPUTE_UNIT_GPU, gpu,
dev = kfd_create_topology_device(&temp_topology_device_list); proximity_domain);
if (!dev) { if (res) {
res = -ENOMEM; pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
return res;
}
res = kfd_parse_crat_table(crat_image,
&temp_topology_device_list,
proximity_domain);
if (res) {
pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
gpu_id);
goto err; goto err;
} }
dev->gpu = gpu;
/*
* TODO: Make a call to retrieve topology information from the
* GPU vBIOS
*/
down_write(&topology_lock); down_write(&topology_lock);
kfd_topology_update_device_list(&temp_topology_device_list, kfd_topology_update_device_list(&temp_topology_device_list,
&topology_device_list); &topology_device_list);
...@@ -974,34 +1050,86 @@ int kfd_topology_add_device(struct kfd_dev *gpu) ...@@ -974,34 +1050,86 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
/* Update the SYSFS tree, since we added another topology /* Update the SYSFS tree, since we added another topology
* device * device
*/ */
if (kfd_topology_update_sysfs() < 0) res = kfd_topology_update_sysfs();
kfd_topology_release_sysfs();
up_write(&topology_lock); up_write(&topology_lock);
if (!res)
sys_props.generation_count++;
else
pr_err("Failed to update GPU (ID: 0x%x) to sysfs topology. res=%d\n",
gpu_id, res);
dev = kfd_assign_gpu(gpu);
if (WARN_ON(!dev)) {
res = -ENODEV;
goto err;
}
} }
dev->gpu_id = gpu_id; dev->gpu_id = gpu_id;
gpu->id = gpu_id; gpu->id = gpu_id;
/* TODO: Move the following lines to function
* kfd_add_non_crat_information
*/
/* Fill-in additional information that is not available in CRAT but
* needed for the topology
*/
dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info); dev->gpu->kfd2kgd->get_cu_info(dev->gpu->kgd, &cu_info);
dev->node_props.simd_count = dev->node_props.simd_per_cu * dev->node_props.simd_arrays_per_engine =
cu_info.cu_active_number; cu_info.num_shader_arrays_per_engine;
dev->node_props.vendor_id = gpu->pdev->vendor; dev->node_props.vendor_id = gpu->pdev->vendor;
dev->node_props.device_id = gpu->pdev->device; dev->node_props.device_id = gpu->pdev->device;
dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number, dev->node_props.location_id = PCI_DEVID(gpu->pdev->bus->number,
gpu->pdev->devfn); gpu->pdev->devfn);
/* dev->node_props.max_engine_clk_fcompute =
* TODO: Retrieve max engine clock values from KGD dev->gpu->kfd2kgd->get_max_engine_clock_in_mhz(dev->gpu->kgd);
*/ dev->node_props.max_engine_clk_ccompute =
cpufreq_quick_get_max(0) / 1000;
if (dev->gpu->device_info->asic_family == CHIP_CARRIZO) {
dev->node_props.capability |= HSA_CAP_DOORBELL_PACKET_TYPE; kfd_fill_mem_clk_max_info(dev);
kfd_fill_iolink_non_crat_info(dev);
switch (dev->gpu->device_info->asic_family) {
case CHIP_KAVERI:
case CHIP_HAWAII:
case CHIP_TONGA:
dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_PRE_1_0 <<
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
break;
case CHIP_CARRIZO:
case CHIP_FIJI:
case CHIP_POLARIS10:
case CHIP_POLARIS11:
pr_debug("Adding doorbell packet type capability\n"); pr_debug("Adding doorbell packet type capability\n");
dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_1_0 <<
HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) &
HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK);
break;
default:
WARN(1, "Unexpected ASIC family %u",
dev->gpu->device_info->asic_family);
} }
/* Fix errors in CZ CRAT.
* simd_count: Carrizo CRAT reports wrong simd_count, probably
* because it doesn't consider masked out CUs
* capability flag: Carrizo CRAT doesn't report IOMMU
* flags. TODO: Fix this.
*/
if (dev->gpu->device_info->asic_family == CHIP_CARRIZO)
dev->node_props.simd_count =
cu_info.simd_per_cu * cu_info.cu_active_number;
kfd_debug_print_topology();
if (!res) if (!res)
kfd_notify_gpu_change(gpu_id, 1); kfd_notify_gpu_change(gpu_id, 1);
err: err:
kfd_destroy_crat_image(crat_image);
return res; return res;
} }
......
...@@ -39,8 +39,12 @@ ...@@ -39,8 +39,12 @@
#define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080 #define HSA_CAP_WATCH_POINTS_SUPPORTED 0x00000080
#define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00 #define HSA_CAP_WATCH_POINTS_TOTALBITS_MASK 0x00000f00
#define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8 #define HSA_CAP_WATCH_POINTS_TOTALBITS_SHIFT 8
#define HSA_CAP_RESERVED 0xfffff000 #define HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK 0x00003000
#define HSA_CAP_DOORBELL_PACKET_TYPE 0x00001000 #define HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT 12
#define HSA_CAP_RESERVED 0xffffc000
#define HSA_CAP_DOORBELL_TYPE_PRE_1_0 0x0
#define HSA_CAP_DOORBELL_TYPE_1_0 0x1
struct kfd_node_properties { struct kfd_node_properties {
uint32_t cpu_cores_count; uint32_t cpu_cores_count;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment