Commit fbf6dc7a authored by Alex Deucher

drm/radeon: add fault decode function for SI (v2)

Helpful for debugging GPUVM errors as we can see what
hw block and page generated the fault in the log.

v2: simplify fault decoding
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
parent 54e2e49c
......@@ -4389,6 +4389,270 @@ void si_vm_fini(struct radeon_device *rdev)
{
}
/*
 * si_vm_mc_client_name - map a memory client id to a hw block name
 *
 * @mc_id: memory client id extracted from the fault status value
 * @is_tahiti: true when the ids are being decoded for CHIP_TAHITI
 *
 * Tahiti decodes a few extra ids (0, 64, 128, 192 as TC and 190 as VGT);
 * every other id is decoded the same way regardless of the family.
 * Returns a string literal, "unknown" when the id is not in the tables.
 */
static const char *si_vm_mc_client_name(u32 mc_id, bool is_tahiti)
{
	/* ids that are only recognised on Tahiti */
	if (is_tahiti) {
		switch (mc_id) {
		case 0:
		case 64:
		case 128:
		case 192:
			return "TC";
		case 190:
			return "VGT";
		default:
			break;
		}
	}

	/* ids shared by all families handled here */
	switch (mc_id) {
	case 16:
	case 32:
	case 80:
	case 96:
	case 144:
	case 160:
	case 208:
	case 224:
		return "CB";
	case 17:
	case 33:
	case 81:
	case 97:
	case 145:
	case 161:
	case 209:
	case 225:
		return "CB_FMASK";
	case 18:
	case 34:
	case 82:
	case 98:
	case 146:
	case 162:
	case 210:
	case 226:
		return "CB_CMASK";
	case 19:
	case 35:
	case 83:
	case 99:
	case 147:
	case 163:
	case 211:
	case 227:
		return "CB_IMMED";
	case 20:
	case 36:
	case 84:
	case 100:
	case 148:
	case 164:
	case 212:
	case 228:
		return "DB";
	case 21:
	case 37:
	case 85:
	case 101:
	case 149:
	case 165:
	case 213:
	case 229:
		return "DB_HTILE";
	case 23:
	case 39:
	case 87:
	case 103:
	case 151:
	case 167:
	case 215:
	case 231:
		return "DB_STEN";
	case 4:
	case 8:
	case 68:
	case 72:
	case 132:
	case 136:
	case 196:
	case 200:
		return "TC";
	case 48:
	case 112:
		return "CP";
	case 49:
	case 50:
	case 177:
	case 178:
		return "SH";
	case 53:
		return "VGT";
	case 117:
		return "IH";
	case 51:
	case 115:
		return "RLC";
	case 119:
	case 183:
		return "DMA0";
	case 61:
		return "DMA1";
	case 120:
	case 248:
		return "HDP";
	default:
		return "unknown";
	}
}

/**
 * si_vm_decode_fault - print human readable fault info
 *
 * @rdev: radeon_device pointer
 * @status: VM_CONTEXT1_PROTECTION_FAULT_STATUS register value
 * @addr: VM_CONTEXT1_PROTECTION_FAULT_ADDR register value
 *
 * Splits @status into its protections, vmid, read/write and memory
 * client id fields, translates the client id into a hw block name and
 * prints a single line with the result (SI).
 */
static void si_vm_decode_fault(struct radeon_device *rdev,
			       u32 status, u32 addr)
{
	u32 mc_id = (status & MEMORY_CLIENT_ID_MASK) >> MEMORY_CLIENT_ID_SHIFT;
	u32 vmid = (status & FAULT_VMID_MASK) >> FAULT_VMID_SHIFT;
	u32 protections = (status & PROTECTIONS_MASK) >> PROTECTIONS_SHIFT;
	const char *block;

	block = si_vm_mc_client_name(mc_id, rdev->family == CHIP_TAHITI);

	/* vmid and mc_id are unsigned, so print them with %u */
	printk("VM fault (0x%02x, vmid %u) at page %u, %s from %s (%u)\n",
	       protections, vmid, addr,
	       (status & MEMORY_CLIENT_RW_MASK) ? "write" : "read",
	       block, mc_id);
}
/**
* si_vm_set_page - update the page tables using the CP
*
......@@ -5755,6 +6019,7 @@ int si_irq_process(struct radeon_device *rdev)
u32 ring_index;
bool queue_hotplug = false;
bool queue_thermal = false;
u32 status, addr;
if (!rdev->ih.enabled || rdev->shutdown)
return IRQ_NONE;
......@@ -5990,11 +6255,14 @@ int si_irq_process(struct radeon_device *rdev)
break;
case 146:
case 147:
addr = RREG32(VM_CONTEXT1_PROTECTION_FAULT_ADDR);
status = RREG32(VM_CONTEXT1_PROTECTION_FAULT_STATUS);
dev_err(rdev->dev, "GPU fault detected: %d 0x%08x\n", src_id, src_data);
dev_err(rdev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x%08X\n",
RREG32(VM_CONTEXT1_PROTECTION_FAULT_ADDR));
addr);
dev_err(rdev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x%08X\n",
RREG32(VM_CONTEXT1_PROTECTION_FAULT_STATUS));
status);
si_vm_decode_fault(rdev, status, addr);
/* reset addr and status */
WREG32_P(VM_CONTEXT1_CNTL2, 1, ~1);
break;
......
......@@ -367,6 +367,20 @@
/* VM context 1 protection fault registers */
#define VM_CONTEXT1_PROTECTION_FAULT_ADDR 0x14FC
#define VM_CONTEXT1_PROTECTION_FAULT_STATUS 0x14DC
/*
 * Field layout used to decode a protection fault status value; every
 * mask is the field's bit pattern moved to the field's shift position.
 */
#define PROTECTIONS_SHIFT 0
#define PROTECTIONS_MASK (0xf << PROTECTIONS_SHIFT)
/* Protection bits:
 * bit 0: range
 * bit 1: pde0
 * bit 2: valid
 * bit 3: read
 * bit 4: write
 *
 * NOTE(review): five bits (0-4) are listed here, but PROTECTIONS_MASK
 * is 0xf and therefore keeps only bits 0-3 - confirm the field width.
 */
#define MEMORY_CLIENT_ID_SHIFT 12
#define MEMORY_CLIENT_ID_MASK (0xff << MEMORY_CLIENT_ID_SHIFT)
#define MEMORY_CLIENT_RW_SHIFT 24
#define MEMORY_CLIENT_RW_MASK (1 << MEMORY_CLIENT_RW_SHIFT)
#define FAULT_VMID_SHIFT 25
#define FAULT_VMID_MASK (0xf << FAULT_VMID_SHIFT)
/* VM invalidate request/response registers */
#define VM_INVALIDATE_REQUEST 0x1478
#define VM_INVALIDATE_RESPONSE 0x147c
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment