Commit 5f248462 authored by David Francis, committed by Alex Deucher

drm/amdgpu: Add EXT_COHERENT memory allocation flags

These flags (for GEM and SVM allocations) allocate
memory that allows for system-scope atomic semantics.

On GFX943 these flags cause caches to be avoided on
non-local memory.

On all other ASICs they are identical in functionality to the
equivalent COHERENT flags.

Corresponding Thunk patch is at
https://github.com/RadeonOpenCompute/ROCT-Thunk-Interface/pull/88

Reviewed-by: David Yat Sin <David.YatSin@amd.com>
Signed-off-by: David Francis <David.Francis@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d92e5556

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1691,6 +1691,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_COHERENT)
 		alloc_flags |= AMDGPU_GEM_CREATE_COHERENT;
+	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT)
+		alloc_flags |= AMDGPU_GEM_CREATE_EXT_COHERENT;
 	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED)
 		alloc_flags |= AMDGPU_GEM_CREATE_UNCACHED;

drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c
@@ -331,6 +331,7 @@ amdgpu_dma_buf_create_obj(struct drm_device *dev, struct dma_buf *dma_buf)
 		flags |= other->flags & (AMDGPU_GEM_CREATE_CPU_GTT_USWC |
 					 AMDGPU_GEM_CREATE_COHERENT |
+					 AMDGPU_GEM_CREATE_EXT_COHERENT |
 					 AMDGPU_GEM_CREATE_UNCACHED);
 	}

drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -635,6 +635,7 @@ static void gmc_v10_0_get_vm_pte(struct amdgpu_device *adev,
 	}
 	if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+			       AMDGPU_GEM_CREATE_EXT_COHERENT |
 			       AMDGPU_GEM_CREATE_UNCACHED))
 		*flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
 			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);

drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -543,6 +543,7 @@ static void gmc_v11_0_get_vm_pte(struct amdgpu_device *adev,
 	}
 	if (bo && bo->flags & (AMDGPU_GEM_CREATE_COHERENT |
+			       AMDGPU_GEM_CREATE_EXT_COHERENT |
 			       AMDGPU_GEM_CREATE_UNCACHED))
 		*flags = (*flags & ~AMDGPU_PTE_MTYPE_NV10_MASK) |
 			 AMDGPU_PTE_MTYPE_NV10(MTYPE_UC);

drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1187,7 +1187,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
 {
 	struct amdgpu_device *bo_adev = amdgpu_ttm_adev(bo->tbo.bdev);
 	bool is_vram = bo->tbo.resource->mem_type == TTM_PL_VRAM;
-	bool coherent = bo->flags & AMDGPU_GEM_CREATE_COHERENT;
+	bool coherent = bo->flags & (AMDGPU_GEM_CREATE_COHERENT | AMDGPU_GEM_CREATE_EXT_COHERENT);
+	bool ext_coherent = bo->flags & AMDGPU_GEM_CREATE_EXT_COHERENT;
 	bool uncached = bo->flags & AMDGPU_GEM_CREATE_UNCACHED;
 	struct amdgpu_vm *vm = mapping->bo_va->base.vm;
 	unsigned int mtype_local, mtype;
@@ -1257,6 +1258,8 @@ static void gmc_v9_0_get_coherence_flags(struct amdgpu_device *adev,
 		snoop = true;
 		if (uncached) {
 			mtype = MTYPE_UC;
+		} else if (ext_coherent) {
+			mtype = is_local ? MTYPE_CC : MTYPE_UC;
 		} else if (adev->flags & AMD_IS_APU) {
 			mtype = is_local ? mtype_local : MTYPE_NC;
 		} else {
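
For readers following the PTE logic: is_local is computed earlier in gmc_v9_0_get_coherence_flags(), outside this hunk. Under that assumption, the cache policy selected by the code above can be summarized as follows; this is a reading aid, not code from the patch:

/*
 * Resulting MTYPE selection (summary of the hunk above):
 *
 *   uncached                  -> MTYPE_UC  (never cached)
 *   ext_coherent && is_local  -> MTYPE_CC  (cache coherent local memory)
 *   ext_coherent && !is_local -> MTYPE_UC  (caches avoided on non-local memory)
 *   otherwise                 -> existing COHERENT/NC handling, unchanged
 */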

drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1189,7 +1189,8 @@ svm_range_get_pte_flags(struct kfd_node *node,
 	uint32_t mapping_flags = 0;
 	uint64_t pte_flags;
 	bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN);
-	bool coherent = flags & KFD_IOCTL_SVM_FLAG_COHERENT;
+	bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT);
+	bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT;
 	bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/
 	unsigned int mtype_local;
@@ -1237,6 +1238,13 @@ svm_range_get_pte_flags(struct kfd_node *node,
 		snoop = true;
 		if (uncached) {
 			mapping_flags |= AMDGPU_VM_MTYPE_UC;
+		} else if (ext_coherent) {
+			/* local HBM region close to partition */
+			if (bo_node->adev == node->adev &&
+			    (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id))
+				mapping_flags |= AMDGPU_VM_MTYPE_CC;
+			else
+				mapping_flags |= AMDGPU_VM_MTYPE_UC;
 		} else if (domain == SVM_RANGE_VRAM_DOMAIN) {
 			/* local HBM region close to partition */
 			if (bo_node->adev == node->adev &&

include/uapi/drm/amdgpu_drm.h
@@ -150,7 +150,7 @@ extern "C" {
  */
 #define AMDGPU_GEM_CREATE_DISCARDABLE		(1 << 12)
 /* Flag that BO is shared coherently between multiple devices or CPU threads.
- * May depend on GPU instructions to flush caches explicitly
+ * May depend on GPU instructions to flush caches to system scope explicitly.
  *
  * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
  * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
@@ -163,6 +163,14 @@ extern "C" {
  * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
  */
 #define AMDGPU_GEM_CREATE_UNCACHED		(1 << 14)
+/* Flag that BO should be coherent across devices when using device-level
+ * atomics. May depend on GPU instructions to flush caches to device scope
+ * explicitly, promoting them to system scope automatically.
+ *
+ * This influences the choice of MTYPE in the PTEs on GFXv9 and later GPUs and
+ * may override the MTYPE selected in AMDGPU_VA_OP_MAP.
+ */
+#define AMDGPU_GEM_CREATE_EXT_COHERENT		(1 << 15)
 struct drm_amdgpu_gem_create_in {
 	/** the requested memory size */
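
For illustration only (not part of the patch): a minimal userspace sketch of requesting the new flag through the existing DRM_IOCTL_AMDGPU_GEM_CREATE ioctl. The helper name alloc_ext_coherent_bo and the VRAM domain choice are assumptions; error handling is elided.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/amdgpu_drm.h>

static int alloc_ext_coherent_bo(int drm_fd, uint64_t size, uint32_t *handle)
{
	union drm_amdgpu_gem_create args;

	memset(&args, 0, sizeof(args));
	args.in.bo_size = size;
	args.in.alignment = 4096;
	args.in.domains = AMDGPU_GEM_DOMAIN_VRAM;
	/* New flag from this patch: system-scope atomic semantics;
	 * on GFX943, caches are avoided on non-local memory. */
	args.in.domain_flags = AMDGPU_GEM_CREATE_EXT_COHERENT;

	if (ioctl(drm_fd, DRM_IOCTL_AMDGPU_GEM_CREATE, &args))
		return -1;

	*handle = args.out.handle;
	return 0;
}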

include/uapi/linux/kfd_ioctl.h
@@ -405,6 +405,7 @@ struct kfd_ioctl_acquire_vm_args {
 #define KFD_IOC_ALLOC_MEM_FLAGS_AQL_QUEUE_MEM	(1 << 27)
 #define KFD_IOC_ALLOC_MEM_FLAGS_COHERENT	(1 << 26)
 #define KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED	(1 << 25)
+#define KFD_IOC_ALLOC_MEM_FLAGS_EXT_COHERENT	(1 << 24)
 /* Allocate memory for later SVM (shared virtual memory) mapping.
  *
@@ -659,6 +660,8 @@ enum kfd_mmio_remap {
 #define KFD_IOCTL_SVM_FLAG_GPU_READ_MOSTLY	0x00000020
 /* Keep GPU memory mapping always valid as if XNACK is disable */
 #define KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED	0x00000040
+/* Fine grained coherency between all devices using device-scope atomics */
+#define KFD_IOCTL_SVM_FLAG_EXT_COHERENT		0x00000080
 /**
  * kfd_ioctl_svm_op - SVM ioctl operations
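
Likewise for the SVM side, a hedged sketch of applying the new range flag through the existing AMDKFD_IOC_SVM ioctl. set_svm_ext_coherent is a hypothetical helper; it assumes an already-open /dev/kfd file descriptor and a valid address range, with error handling elided.

#include <stdint.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

static int set_svm_ext_coherent(int kfd_fd, void *addr, uint64_t size)
{
	struct kfd_ioctl_svm_args *args;
	int ret;

	/* One attribute appended to the variable-length args struct. */
	args = calloc(1, sizeof(*args) + sizeof(struct kfd_ioctl_svm_attribute));
	if (!args)
		return -1;

	args->start_addr = (uint64_t)(uintptr_t)addr;
	args->size = size;
	args->op = KFD_IOCTL_SVM_OP_SET_ATTR;
	args->nattr = 1;
	args->attrs[0].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS;
	/* New flag from this patch: fine-grained coherence across devices. */
	args->attrs[0].val = KFD_IOCTL_SVM_FLAG_EXT_COHERENT;

	ret = ioctl(kfd_fd, AMDKFD_IOC_SVM, args);
	free(args);
	return ret;
}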