Commit cb596aee authored by Tomer Tayar, committed by Oded Gabbay

habanalabs: Add a new H/W queue type

This patch adds support for a new H/W queue type.
This queue type serves jobs for the DMA and compute engines, for which
completion notifications are sent by the H/W.
A command buffer for this queue type can be created either through the CB
IOCTL, using the retrieved CB handle, or by preparing a buffer on the
host or on the device SRAM/DRAM and using the device address of that
buffer.
The patch includes the handling of these two options, as well as the
initialization of the H/W queue and the scheduling of its jobs.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
parent df762375
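
Before the diff itself, a minimal sketch (not part of the patch) of the core dispatch it introduces: how the driver resolves the buffer pointer for a H/W queue job, distilled from hw_queue_schedule_job() below. resolve_job_ptr() is a hypothetical helper name used only for illustration:

static u64 resolve_job_ptr(struct hl_cs_job *job)
{
	/* Patched CB: user CB was allocated by driver and MMU is disabled */
	if (job->patched_cb)
		return job->patched_cb->bus_address;

	/* Driver-allocated CB with MMU enabled: use the CB directly */
	if (job->is_kernel_allocated_cb)
		return job->user_cb->bus_address;

	/* Otherwise the "CB" is a raw SRAM/DRAM/host address */
	return (u64) (uintptr_t) job->user_cb;
}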
@@ -65,6 +65,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
+{
+	/*
+	 * Patched CB is created for external queues jobs, and for H/W queues
+	 * jobs if the user CB was allocated by driver and MMU is disabled.
+	 */
+	return (job->queue_type == QUEUE_TYPE_EXT ||
+			(job->queue_type == QUEUE_TYPE_HW &&
+				job->is_kernel_allocated_cb &&
+				!hdev->mmu_enable));
+}
+
 /*
  * cs_parser - parse the user command submission
  *
@@ -91,11 +103,13 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	parser.patched_cb = NULL;
 	parser.user_cb = job->user_cb;
 	parser.user_cb_size = job->user_cb_size;
-	parser.ext_queue = job->ext_queue;
+	parser.queue_type = job->queue_type;
+	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
 	job->patched_cb = NULL;
 
 	rc = hdev->asic_funcs->cs_parser(hdev, &parser);
-	if (job->ext_queue) {
+
+	if (is_cb_patched(hdev, job)) {
 		if (!rc) {
 			job->patched_cb = parser.patched_cb;
 			job->job_cb_size = parser.patched_cb_size;
@@ -124,7 +138,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job)) {
 		hl_userptr_delete_list(hdev, &job->userptr_list);
 
 		/*
@@ -140,6 +154,19 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 		}
 	}
 
+	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
+	 * enabled, the user CB isn't released in cs_parser() and thus should be
+	 * released here.
+	 */
+	if (job->queue_type == QUEUE_TYPE_HW &&
+			job->is_kernel_allocated_cb && hdev->mmu_enable) {
+		spin_lock(&job->user_cb->lock);
+		job->user_cb->cs_cnt--;
+		spin_unlock(&job->user_cb->lock);
+
+		hl_cb_put(job->user_cb);
+	}
+
 	/*
 	 * This is the only place where there can be multiple threads
 	 * modifying the list at the same time
@@ -150,7 +177,8 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 
 	hl_debugfs_remove_job(hdev, job);
 
-	if (job->ext_queue)
+	if (job->queue_type == QUEUE_TYPE_EXT ||
+			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
 	kfree(job);
@@ -387,18 +415,13 @@ static void job_wq_completion(struct work_struct *work)
 	free_job(hdev, job);
 }
 
-static struct hl_cb *validate_queue_index(struct hl_device *hdev,
-					struct hl_cb_mgr *cb_mgr,
-					struct hl_cs_chunk *chunk,
-					bool *ext_queue)
+static int validate_queue_index(struct hl_device *hdev,
+				struct hl_cs_chunk *chunk,
+				enum hl_queue_type *queue_type,
+				bool *is_kernel_allocated_cb)
 {
 	struct asic_fixed_properties *asic = &hdev->asic_prop;
 	struct hw_queue_properties *hw_queue_prop;
-	u32 cb_handle;
-	struct hl_cb *cb;
-
-	/* Assume external queue */
-	*ext_queue = true;
 
 	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];
 
@@ -406,22 +429,29 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 	    (hw_queue_prop->type == QUEUE_TYPE_NA)) {
 		dev_err(hdev->dev, "Queue index %d is invalid\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
 	if (hw_queue_prop->driver_only) {
 		dev_err(hdev->dev,
 			"Queue index %d is restricted for the kernel driver\n",
 			chunk->queue_index);
-		return NULL;
+		return -EINVAL;
 	}
 
-	if (!hw_queue_prop->requires_kernel_cb) {
-		*ext_queue = false;
-		return (struct hl_cb *) (uintptr_t) chunk->cb_handle;
-	}
+	*queue_type = hw_queue_prop->type;
+	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+
+	return 0;
+}
+
+static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
+					struct hl_cb_mgr *cb_mgr,
+					struct hl_cs_chunk *chunk)
+{
+	struct hl_cb *cb;
+	u32 cb_handle;
 
-	/* Retrieve CB object */
 	cb_handle = (u32) (chunk->cb_handle >> PAGE_SHIFT);
 
 	cb = hl_cb_get(hdev, cb_mgr, cb_handle);
@@ -446,7 +476,8 @@ static struct hl_cb *validate_queue_index(struct hl_device *hdev,
 		return NULL;
 	}
 
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
 {
 	struct hl_cs_job *job;
 
@@ -454,12 +485,14 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue)
 	if (!job)
 		return NULL;
 
-	job->ext_queue = ext_queue;
+	job->queue_type = queue_type;
+	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
-	if (job->ext_queue) {
+	if (is_cb_patched(hdev, job))
 		INIT_LIST_HEAD(&job->userptr_list);
+
+	if (job->queue_type == QUEUE_TYPE_EXT)
 		INIT_WORK(&job->finish_work, job_wq_completion);
-	}
 
 	return job;
 }
@@ -472,7 +505,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	struct hl_cs_job *job;
 	struct hl_cs *cs;
 	struct hl_cb *cb;
-	bool ext_queue_present = false;
+	bool int_queues_only = true;
 	u32 size_to_copy;
 	int rc, i, parse_cnt;
@@ -516,23 +549,33 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 	/* Validate ALL the CS chunks before submitting the CS */
 	for (i = 0, parse_cnt = 0 ; i < num_chunks ; i++, parse_cnt++) {
 		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
-		bool ext_queue;
+		enum hl_queue_type queue_type;
+		bool is_kernel_allocated_cb;
 
-		cb = validate_queue_index(hdev, &hpriv->cb_mgr, chunk,
-					&ext_queue);
-		if (ext_queue) {
-			ext_queue_present = true;
+		rc = validate_queue_index(hdev, chunk, &queue_type,
+				&is_kernel_allocated_cb);
+		if (rc)
+			goto free_cs_object;
+
+		if (is_kernel_allocated_cb) {
+			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
 			if (!cb) {
 				rc = -EINVAL;
 				goto free_cs_object;
 			}
+		} else {
+			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
 		}
 
-		job = hl_cs_allocate_job(hdev, ext_queue);
+		if (queue_type == QUEUE_TYPE_EXT || queue_type == QUEUE_TYPE_HW)
+			int_queues_only = false;
+
+		job = hl_cs_allocate_job(hdev, queue_type,
+						is_kernel_allocated_cb);
 		if (!job) {
 			dev_err(hdev->dev, "Failed to allocate a new job\n");
 			rc = -ENOMEM;
-			if (ext_queue)
+			if (is_kernel_allocated_cb)
 				goto release_cb;
 			else
 				goto free_cs_object;
@@ -542,7 +585,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		job->cs = cs;
 		job->user_cb = cb;
 		job->user_cb_size = chunk->cb_size;
-		if (job->ext_queue)
+		if (is_kernel_allocated_cb)
 			job->job_cb_size = cb->size;
 		else
 			job->job_cb_size = chunk->cb_size;
@@ -555,10 +598,11 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		/*
 		 * Increment CS reference. When CS reference is 0, CS is
 		 * done and can be signaled to user and free all its resources
-		 * Only increment for JOB on external queues, because only
-		 * for those JOBs we get completion
+		 * Only increment for JOB on external or H/W queues, because
+		 * only for those JOBs we get completion
 		 */
-		if (job->ext_queue)
+		if (job->queue_type == QUEUE_TYPE_EXT ||
+				job->queue_type == QUEUE_TYPE_HW)
 			cs_get(cs);
 
 		hl_debugfs_add_job(hdev, job);
@@ -572,9 +616,9 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
 		}
 	}
 
-	if (!ext_queue_present) {
+	if (int_queues_only) {
 		dev_err(hdev->dev,
-			"Reject CS %d.%llu because no external queues jobs\n",
+			"Reject CS %d.%llu because only internal queues jobs are present\n",
 			cs->ctx->asid, cs->sequence);
 		rc = -EINVAL;
 		goto free_cs_object;
......
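To make the two CB options above concrete, here is a hedged userspace-side sketch. It assumes the driver's existing uAPI chunk layout (struct hl_cs_chunk with queue_index, cb_handle and cb_size fields, as consumed by _hl_cs_ioctl() above); dma_queue_idx, use_kernel_cb, cb_handle, sram_buf_device_addr and cb_size are hypothetical variables:

struct hl_cs_chunk chunk = {
	.queue_index = dma_queue_idx,	/* a queue whose type is QUEUE_TYPE_HW */
	/* Option 1: a handle retrieved from the CB IOCTL (requires_kernel_cb).
	 * Option 2: a raw device address of a buffer on the host or in
	 * SRAM/DRAM.
	 */
	.cb_handle = use_kernel_cb ? cb_handle : sram_buf_device_addr,
	.cb_size = cb_size,
};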
@@ -3943,7 +3943,7 @@ int goya_cs_parser(struct hl_device *hdev, struct hl_cs_parser *parser)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (!parser->ext_queue)
+	if (parser->queue_type == QUEUE_TYPE_INT)
 		return goya_parse_cb_no_ext_queue(hdev, parser);
 
 	if (goya->hw_cap_initialized & HW_CAP_MMU)
@@ -4614,7 +4614,7 @@ static int goya_memset_device_memory(struct hl_device *hdev, u64 addr, u64 size,
 		lin_dma_pkt++;
 	} while (--lin_dma_pkts_cnt);
 
-	job = hl_cs_allocate_job(hdev, true);
+	job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
 	if (!job) {
 		dev_err(hdev->dev, "Failed to allocate a new job\n");
 		rc = -ENOMEM;
......
@@ -85,12 +85,15 @@ struct hl_fpriv;
  * @QUEUE_TYPE_INT: internal queue that performs DMA inside the device's
  *                  memories and/or operates the compute engines.
  * @QUEUE_TYPE_CPU: S/W queue for communication with the device's CPU.
+ * @QUEUE_TYPE_HW: queue of DMA and compute engines jobs, for which completion
+ *                 notifications are sent by H/W.
  */
 enum hl_queue_type {
 	QUEUE_TYPE_NA,
 	QUEUE_TYPE_EXT,
 	QUEUE_TYPE_INT,
-	QUEUE_TYPE_CPU
+	QUEUE_TYPE_CPU,
+	QUEUE_TYPE_HW
 };
 
 /**
@@ -755,11 +758,14 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *                wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @job_cb_size: the actual size of the CB that we put on the queue.
- * @ext_queue: whether the job is for external queue or internal queue.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_job {
 	struct list_head cs_node;
@@ -769,11 +775,12 @@ struct hl_cs_job {
 	struct work_struct finish_work;
 	struct list_head userptr_list;
 	struct list_head debugfs_list;
+	enum hl_queue_type queue_type;
 	u32 id;
 	u32 hw_queue_id;
 	u32 user_cb_size;
 	u32 job_cb_size;
-	u8 ext_queue;
+	u8 is_kernel_allocated_cb;
 };
 
 /**
@@ -784,24 +791,28 @@ struct hl_cs_job {
  * @job_userptr_list: linked-list of userptr mappings that belong to the related
  *                    job and wait for completion.
  * @cs_sequence: the sequence number of the related CS.
+ * @queue_type: the type of the H/W queue this job is submitted to.
  * @ctx_id: the ID of the context the related CS belongs to.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
  * @user_cb_size: the actual size of the CB we got from the user.
  * @patched_cb_size: the size of the CB after parsing.
- * @ext_queue: whether the job is for external queue or internal queue.
  * @job_id: the id of the related job inside the related CS.
+ * @is_kernel_allocated_cb: true if the CB handle we got from the user holds a
+ *                          handle to a kernel-allocated CB object, false
+ *                          otherwise (SRAM/DRAM/host address).
  */
 struct hl_cs_parser {
 	struct hl_cb *user_cb;
 	struct hl_cb *patched_cb;
 	struct list_head *job_userptr_list;
 	u64 cs_sequence;
+	enum hl_queue_type queue_type;
 	u32 ctx_id;
 	u32 hw_queue_id;
 	u32 user_cb_size;
 	u32 patched_cb_size;
-	u8 ext_queue;
 	u8 job_id;
+	u8 is_kernel_allocated_cb;
 };
@@ -1504,7 +1515,8 @@ int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
-struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, bool ext_queue);
+struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
+		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
......
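For context, queue types are declared per ASIC in hw_queues_props. A hypothetical properties entry for a H/W queue might look as follows (illustrative only; this patch adds no such entry), using the hw_queue_properties fields consulted by validate_queue_index():

static const struct hw_queue_properties example_hw_queue_props = {
	.type = QUEUE_TYPE_HW,
	.driver_only = 0,		/* reachable from the CS IOCTL */
	.requires_kernel_cb = 0,	/* user passes a raw buffer address */
};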
@@ -58,8 +58,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
 }
 
 /*
- * ext_queue_submit_bd - Submit a buffer descriptor to an external queue
- *
+ * ext_and_hw_queue_submit_bd() - Submit a buffer descriptor to an external or a
+ *                                H/W queue.
  * @hdev: pointer to habanalabs device structure
 * @q: pointer to habanalabs queue structure
 * @ctl: BD's control word
@@ -73,8 +73,8 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
 * This function must be called when the scheduler mutex is taken
 *
 */
-static void ext_queue_submit_bd(struct hl_device *hdev, struct hl_hw_queue *q,
-				u32 ctl, u32 len, u64 ptr)
+static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
+			struct hl_hw_queue *q, u32 ctl, u32 len, u64 ptr)
 {
 	struct hl_bd *bd;
 
@@ -173,6 +173,45 @@ static int int_queue_sanity_checks(struct hl_device *hdev,
 	return 0;
 }
 
+/*
+ * hw_queue_sanity_checks() - Perform some sanity checks on a H/W queue.
+ * @hdev: Pointer to hl_device structure.
+ * @q: Pointer to hl_hw_queue structure.
+ * @num_of_entries: How many entries to check for space.
+ *
+ * Perform the following:
+ * - Make sure we have enough space in the completion queue.
+ *   This check also ensures that there is enough space in the h/w queue, as
+ *   both queues are of the same size.
+ * - Reserve space in the completion queue (needs to be reversed if there
+ *   is a failure down the road before the actual submission of work).
+ *
+ * Both operations are done using the "free_slots_cnt" field of the completion
+ * queue. The CI counters of the queue and the completion queue are not
+ * needed/used for the H/W queue type.
+ */
+static int hw_queue_sanity_checks(struct hl_device *hdev, struct hl_hw_queue *q,
+					int num_of_entries)
+{
+	atomic_t *free_slots =
+			&hdev->completion_queue[q->hw_queue_id].free_slots_cnt;
+
+	/*
+	 * Check we have enough space in the completion queue.
+	 * Subtract num_of_entries from the counter unless the result would
+	 * be negative. In that case, CQ is full so we can't submit a new CB;
+	 * atomic_add_negative() reports the underflow and the reservation is
+	 * rolled back.
+	 */
+	if (atomic_add_negative(num_of_entries * -1, free_slots)) {
+		dev_dbg(hdev->dev, "No space for %d entries on CQ %d\n",
+			num_of_entries, q->hw_queue_id);
+		atomic_add(num_of_entries, free_slots);
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
 /*
  * hl_hw_queue_send_cb_no_cmpl - send a single CB (not a JOB) without completion
  *
@@ -188,7 +227,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 				u32 cb_size, u64 cb_ptr)
 {
 	struct hl_hw_queue *q = &hdev->kernel_queues[hw_queue_id];
-	int rc;
+	int rc = 0;
 
 	/*
 	 * The CPU queue is a synchronous queue with an effective depth of
@@ -206,11 +245,18 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 		goto out;
 	}
 
-	rc = ext_queue_sanity_checks(hdev, q, 1, false);
-	if (rc)
-		goto out;
+	/*
+	 * hl_hw_queue_send_cb_no_cmpl() is called for queues of a H/W queue
+	 * type only on init phase, when the queues are empty and being tested,
+	 * so there is no need for sanity checks.
+	 */
+	if (q->queue_type != QUEUE_TYPE_HW) {
+		rc = ext_queue_sanity_checks(hdev, q, 1, false);
+		if (rc)
+			goto out;
+	}
 
-	ext_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
+	ext_and_hw_queue_submit_bd(hdev, q, 0, cb_size, cb_ptr);
 
 out:
 	if (q->queue_type != QUEUE_TYPE_CPU)
@@ -220,14 +266,14 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
 }
 
 /*
- * ext_hw_queue_schedule_job - submit a JOB to an external queue
+ * ext_queue_schedule_job - submit a JOB to an external queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
-static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
+static void ext_queue_schedule_job(struct hl_cs_job *job)
 {
 	struct hl_device *hdev = job->cs->ctx->hdev;
 	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -260,7 +306,7 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 	 * H/W queues is done under the scheduler mutex
 	 *
 	 * No need to check if CQ is full because it was already
-	 * checked in hl_queue_sanity_checks
+	 * checked in ext_queue_sanity_checks
 	 */
 	cq = &hdev->completion_queue[q->hw_queue_id];
 	cq_addr = cq->bus_address + cq->pi * sizeof(struct hl_cq_entry);
@@ -274,18 +320,18 @@ static void ext_hw_queue_schedule_job(struct hl_cs_job *job)
 
 	cq->pi = hl_cq_inc_ptr(cq->pi);
 
-	ext_queue_submit_bd(hdev, q, ctl, len, ptr);
+	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
 /*
- * int_hw_queue_schedule_job - submit a JOB to an internal queue
+ * int_queue_schedule_job - submit a JOB to an internal queue
  *
  * @job: pointer to the job that needs to be submitted to the queue
 *
 * This function must be called when the scheduler mutex is taken
 *
 */
-static void int_hw_queue_schedule_job(struct hl_cs_job *job)
+static void int_queue_schedule_job(struct hl_cs_job *job)
 {
 	struct hl_device *hdev = job->cs->ctx->hdev;
 	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
@@ -307,6 +353,60 @@ static void int_hw_queue_schedule_job(struct hl_cs_job *job)
 	hdev->asic_funcs->ring_doorbell(hdev, q->hw_queue_id, q->pi);
 }
 
+/*
+ * hw_queue_schedule_job - submit a JOB to a H/W queue
+ *
+ * @job: pointer to the job that needs to be submitted to the queue
+ *
+ * This function must be called when the scheduler mutex is taken
+ *
+ */
+static void hw_queue_schedule_job(struct hl_cs_job *job)
+{
+	struct hl_device *hdev = job->cs->ctx->hdev;
+	struct hl_hw_queue *q = &hdev->kernel_queues[job->hw_queue_id];
+	struct hl_cq *cq;
+	u64 ptr;
+	u32 offset, ctl, len;
+
+	/*
+	 * Upon PQE completion, COMP_DATA is used as the write data to the
+	 * completion queue (QMAN HBW message), and COMP_OFFSET is used as the
+	 * write address offset in the SM block (QMAN LBW message).
+	 * The write address offset is calculated as "COMP_OFFSET << 2".
+	 */
+	offset = job->cs->sequence & (HL_MAX_PENDING_CS - 1);
+	ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
+		((q->pi << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
+
+	len = job->job_cb_size;
+
+	/*
+	 * A patched CB is created only if a user CB was allocated by driver and
+	 * MMU is disabled. If MMU is enabled, the user CB should be used
+	 * instead. If the user CB wasn't allocated by driver, assume that it
+	 * holds an address.
+	 */
+	if (job->patched_cb)
+		ptr = job->patched_cb->bus_address;
+	else if (job->is_kernel_allocated_cb)
+		ptr = job->user_cb->bus_address;
+	else
+		ptr = (u64) (uintptr_t) job->user_cb;
+
+	/*
+	 * No need to protect pi_offset because scheduling to the
+	 * H/W queues is done under the scheduler mutex
+	 *
+	 * No need to check if CQ is full because it was already
+	 * checked in hw_queue_sanity_checks
+	 */
+	cq = &hdev->completion_queue[q->hw_queue_id];
+	cq->pi = hl_cq_inc_ptr(cq->pi);
+
+	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
+}
+
 /*
  * hl_hw_queue_schedule_cs - schedule a command submission
  *
@@ -330,23 +430,34 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 	}
 
 	q = &hdev->kernel_queues[0];
 
-	/* This loop assumes all external queues are consecutive */
 	for (i = 0, cq_cnt = 0 ; i < HL_MAX_QUEUES ; i++, q++) {
-		if (q->queue_type == QUEUE_TYPE_EXT) {
-			if (cs->jobs_in_queue_cnt[i]) {
+		if (cs->jobs_in_queue_cnt[i]) {
+			switch (q->queue_type) {
+			case QUEUE_TYPE_EXT:
 				rc = ext_queue_sanity_checks(hdev, q,
-						cs->jobs_in_queue_cnt[i], true);
-				if (rc)
-					goto unroll_cq_resv;
-				cq_cnt++;
-			}
-		} else if (q->queue_type == QUEUE_TYPE_INT) {
-			if (cs->jobs_in_queue_cnt[i]) {
+					cs->jobs_in_queue_cnt[i], true);
+				break;
+			case QUEUE_TYPE_INT:
 				rc = int_queue_sanity_checks(hdev, q,
-						cs->jobs_in_queue_cnt[i]);
-				if (rc)
-					goto unroll_cq_resv;
-			}
+					cs->jobs_in_queue_cnt[i]);
+				break;
+			case QUEUE_TYPE_HW:
+				rc = hw_queue_sanity_checks(hdev, q,
+					cs->jobs_in_queue_cnt[i]);
+				break;
+			default:
+				dev_err(hdev->dev, "Queue type %d is invalid\n",
+					q->queue_type);
+				rc = -EINVAL;
+				break;
+			}
+
+			if (rc)
+				goto unroll_cq_resv;
+
+			if (q->queue_type == QUEUE_TYPE_EXT ||
+					q->queue_type == QUEUE_TYPE_HW)
+				cq_cnt++;
 		}
 	}
@@ -373,21 +484,30 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 	}
 
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		if (job->ext_queue)
-			ext_hw_queue_schedule_job(job);
-		else
-			int_hw_queue_schedule_job(job);
+		switch (job->queue_type) {
+		case QUEUE_TYPE_EXT:
+			ext_queue_schedule_job(job);
+			break;
+		case QUEUE_TYPE_INT:
+			int_queue_schedule_job(job);
+			break;
+		case QUEUE_TYPE_HW:
+			hw_queue_schedule_job(job);
+			break;
+		default:
+			break;
+		}
 
 	cs->submitted = true;
 
 	goto out;
 
 unroll_cq_resv:
-	/* This loop assumes all external queues are consecutive */
 	q = &hdev->kernel_queues[0];
 	for (i = 0 ; (i < HL_MAX_QUEUES) && (cq_cnt > 0) ; i++, q++) {
-		if ((q->queue_type == QUEUE_TYPE_EXT) &&
-				(cs->jobs_in_queue_cnt[i])) {
+		if ((q->queue_type == QUEUE_TYPE_EXT ||
+				q->queue_type == QUEUE_TYPE_HW) &&
+				cs->jobs_in_queue_cnt[i]) {
 			atomic_t *free_slots =
 				&hdev->completion_queue[i].free_slots_cnt;
 			atomic_add(cs->jobs_in_queue_cnt[i], free_slots);
@@ -414,8 +534,8 @@ void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id)
 	q->ci = hl_queue_inc_ptr(q->ci);
 }
 
-static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
-				struct hl_hw_queue *q, bool is_cpu_queue)
+static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+					bool is_cpu_queue)
 {
 	void *p;
 	int rc;
@@ -465,7 +585,7 @@ static int ext_and_cpu_hw_queue_init(struct hl_device *hdev,
 	return rc;
 }
 
-static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
 	void *p;
 
@@ -485,18 +605,38 @@ static int int_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 	return 0;
 }
 
-static int cpu_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+{
+	return ext_and_cpu_queue_init(hdev, q, true);
+}
+
+static int ext_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-	return ext_and_cpu_hw_queue_init(hdev, q, true);
+	return ext_and_cpu_queue_init(hdev, q, false);
 }
 
-static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
+static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 {
-	return ext_and_cpu_hw_queue_init(hdev, q, false);
+	void *p;
+
+	p = hdev->asic_funcs->asic_dma_alloc_coherent(hdev,
+						HL_QUEUE_SIZE_IN_BYTES,
+						&q->bus_address,
+						GFP_KERNEL | __GFP_ZERO);
+	if (!p)
+		return -ENOMEM;
+
+	q->kernel_address = (u64) (uintptr_t) p;
+
+	/* Make sure read/write pointers are initialized to start of queue */
+	q->ci = 0;
+	q->pi = 0;
+
+	return 0;
 }
 /*
- * hw_queue_init - main initialization function for H/W queue object
+ * queue_init - main initialization function for H/W queue object
  *
  * @hdev: pointer to hl_device device structure
  * @q: pointer to hl_hw_queue queue structure
@@ -505,7 +645,7 @@ static int ext_hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
  * Allocate dma-able memory for the queue and initialize fields
  * Returns 0 on success
  */
-static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
+static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 			u32 hw_queue_id)
 {
 	int rc;
@@ -516,21 +656,20 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 	switch (q->queue_type) {
 	case QUEUE_TYPE_EXT:
-		rc = ext_hw_queue_init(hdev, q);
+		rc = ext_queue_init(hdev, q);
 		break;
 	case QUEUE_TYPE_INT:
-		rc = int_hw_queue_init(hdev, q);
+		rc = int_queue_init(hdev, q);
 		break;
 	case QUEUE_TYPE_CPU:
-		rc = cpu_hw_queue_init(hdev, q);
+		rc = cpu_queue_init(hdev, q);
+		break;
+	case QUEUE_TYPE_HW:
+		rc = hw_queue_init(hdev, q);
 		break;
 	case QUEUE_TYPE_NA:
 		q->valid = 0;
 		return 0;
 	default:
 		dev_crit(hdev->dev, "wrong queue type %d during init\n",
 			q->queue_type);
@@ -554,7 +693,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
  *
  * Free the queue memory
  */
-static void hw_queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
+static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
 {
 	if (!q->valid)
 		return;
@@ -612,7 +751,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 			i < HL_MAX_QUEUES ; i++, q_ready_cnt++, q++) {
 		q->queue_type = asic->hw_queues_props[i].type;
 
-		rc = hw_queue_init(hdev, q, i);
+		rc = queue_init(hdev, q, i);
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to initialize queue %d\n", i);
@@ -624,7 +763,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 
 release_queues:
 	for (i = 0, q = hdev->kernel_queues ; i < q_ready_cnt ; i++, q++)
-		hw_queue_fini(hdev, q);
+		queue_fini(hdev, q);
 
 	kfree(hdev->kernel_queues);
@@ -637,7 +776,7 @@ void hl_hw_queues_destroy(struct hl_device *hdev)
 	int i;
 
 	for (i = 0, q = hdev->kernel_queues ; i < HL_MAX_QUEUES ; i++, q++)
-		hw_queue_fini(hdev, q);
+		queue_fini(hdev, q);
 
 	kfree(hdev->kernel_queues);
 }
......
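The completion-queue space reservation in hw_queue_sanity_checks() above relies on atomic_add_negative() both performing the decrement and reporting an underflow in a single step. A standalone sketch of the pattern (not from the patch; reserve_cq_slots() is a hypothetical name):

static int reserve_cq_slots(atomic_t *free_slots, int num_of_entries)
{
	/* Decrement; atomic_add_negative() returns true if the result went
	 * below zero, i.e. there was not enough room for the reservation.
	 */
	if (atomic_add_negative(-num_of_entries, free_slots)) {
		atomic_add(num_of_entries, free_slots);	/* roll back */
		return -EAGAIN;
	}

	/* num_of_entries slots are now reserved; a later failure must
	 * return them with atomic_add(), as unroll_cq_resv does.
	 */
	return 0;
}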
@@ -23,6 +23,8 @@ struct hl_bd {
 #define HL_BD_SIZE	sizeof(struct hl_bd)
 
 /*
+ * S/W CTL FIELDS.
+ *
  * BD_CTL_REPEAT_VALID tells the CP whether the repeat field in the BD CTL is
  * valid. 1 means the repeat field is valid, 0 means not-valid,
  * i.e. repeat == 1
@@ -33,6 +35,16 @@ struct hl_bd {
 #define BD_CTL_SHADOW_INDEX_SHIFT	0
 #define BD_CTL_SHADOW_INDEX_MASK	0x00000FFF
 
+/*
+ * H/W CTL FIELDS
+ */
+
+#define BD_CTL_COMP_OFFSET_SHIFT	16
+#define BD_CTL_COMP_OFFSET_MASK		0x00FF0000
+
+#define BD_CTL_COMP_DATA_SHIFT		0
+#define BD_CTL_COMP_DATA_MASK		0x0000FFFF
+
 /*
  * COMPLETION QUEUE
  */
......
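A short worked example (illustrative, not part of the patch) of how hw_queue_schedule_job() packs these fields, assuming HL_MAX_PENDING_CS is 64 as defined in habanalabs.h at this point: a CS sequence of 0x141 and a queue PI of 5 yield:

u32 offset = 0x141 & (64 - 1);		/* 0x141 % 64 == 0x01 */
u32 ctl = ((offset << BD_CTL_COMP_OFFSET_SHIFT) & BD_CTL_COMP_OFFSET_MASK) |
		((5 << BD_CTL_COMP_DATA_SHIFT) & BD_CTL_COMP_DATA_MASK);
/* ctl == 0x00010005: SM write offset 0x01 (shifted left by 2 by the H/W),
 * CQ write data 5
 */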