drm/amdgpu: add gang submit frontend v6

Allows submitting jobs as a gang which needs to run on multiple engines at the same time. All members of the gang get the same implicit, explicit and VM dependencies, so no gang member will start running until everything else is ready. The last job is considered the gang leader (usually a submission to the GFX ring) and is used for signaling output dependencies. Each job is remembered individually as a user of a buffer object, so there is no joining of work at the end. v2: rebase and fix review comments from Andrey and Yogesh v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang leader only when necessary v4: fix order of pushing jobs and adding fences found by Trigger. v5: fix job index calculation and adding IBs to jobs v6: fix typo found by Alex Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdgpu: add gang submit frontend v6
Allows submitting jobs as a gang which needs to run on multiple engines at the same time. All members of the gang get the same implicit, explicit and VM dependencies, so no gang member will start running until everything else is ready. The last job is considered the gang leader (usually a submission to the GFX ring) and is used for signaling output dependencies. Each job is remembered individually as a user of a buffer object, so there is no joining of work at the end. v2: rebase and fix review comments from Andrey and Yogesh v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang leader only when necessary v4: fix order of pushing jobs and adding fences found by Trigger. v5: fix job index calculation and adding IBs to jobs v6: fix typo found by Alex Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
4624459c · Christian König · Alex Deucher · 68ce8b24 · 4624459c · 4624459c
Commit 4624459c authored Mar 02, 2022 by Christian König Committed by Alex Deucher Sep 20, 2022
5 changed files
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
 	ib->length_dw = ib_len;
 	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
 	job->vmid = vmid;
+	job->num_ibs = 1;

 	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);


--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -27,6 +27,8 @@
 #include "amdgpu_bo_list.h"
 #include "amdgpu_ring.h"

+#define AMDGPU_CS_GANG_SIZE	4
+
 struct amdgpu_bo_va_mapping;

 struct amdgpu_cs_chunk {
@@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
 	unsigned		nchunks;
 	struct amdgpu_cs_chunk	*chunks;

-	/* scheduler job object */
-	struct amdgpu_job	*job;
-	struct drm_sched_entity	*entity;
+	/* scheduler job objects */
+	unsigned int		gang_size;
+	struct drm_sched_entity	*entities[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job	*jobs[AMDGPU_CS_GANG_SIZE];
+	struct amdgpu_job	*gang_leader;

 	/* buffer objects */
 	struct ww_acquire_ctx		ticket;

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
 	 */
 	(*job)->base.sched = &adev->rings[0]->sched;
 	(*job)->vm = vm;
-	(*job)->num_ibs = num_ibs;

 	amdgpu_sync_create(&(*job)->sync);
 	amdgpu_sync_create(&(*job)->sched_sync);
@@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
 	if (r)
 		return r;

+	(*job)->num_ibs = 1;
 	r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
 	if (r)
 		kfree(*job);

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
 );

 TRACE_EVENT(amdgpu_cs,
-	    TP_PROTO(struct amdgpu_cs_parser *p, int i),
-	    TP_ARGS(p, i),
+	    TP_PROTO(struct amdgpu_cs_parser *p,
+		     struct amdgpu_job *job,
+		     struct amdgpu_ib *ib),
+	    TP_ARGS(p, job, ib),
 	    TP_STRUCT__entry(
 			     __field(struct amdgpu_bo_list *, bo_list)
 			     __field(u32, ring)
@@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,

 	    TP_fast_assign(
 			   __entry->bo_list = p->bo_list;
-			   __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
-			   __entry->dw = p->job->ibs[i].length_dw;
+			   __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
+			   __entry->dw = ib->length_dw;
 			   __entry->fences = amdgpu_fence_count_emitted(
-				to_amdgpu_ring(p->entity->rq->sched));
+				to_amdgpu_ring(job->base.sched));
 			   ),
 	    TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
 		      __entry->bo_list, __entry->ring, __entry->dw,