Commit 2da9f8d8 authored by farah kassabri, committed by Oded Gabbay

accel/habanalabs: fix wait_for_interrupt abortion flow

When the driver needs to abort waiters for interrupts (for example,
when a critical event occurs and the driver needs to do a hard reset),
it completes the fence to wake up the waiting thread and sets the
fence error indication.
In that case the return value of the completion API is greater than 0,
since it returns the remaining timeout. A positive return value
normally indicates successful completion, so the driver must check the
fence error in that path and mark the wait as aborted instead of
completed.
Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent eaa43a06
...@@ -3449,7 +3449,15 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -3449,7 +3449,15 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion, completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
timeout); timeout);
if (completion_rc > 0) { if (completion_rc > 0) {
if (pend->fence.error == -EIO) {
dev_err_ratelimited(hdev->dev,
"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
pend->fence.error);
rc = -EIO;
*status = HL_WAIT_CS_STATUS_ABORTED;
} else {
*status = HL_WAIT_CS_STATUS_COMPLETED; *status = HL_WAIT_CS_STATUS_COMPLETED;
}
} else { } else {
if (completion_rc == -ERESTARTSYS) { if (completion_rc == -ERESTARTSYS) {
dev_err_ratelimited(hdev->dev, dev_err_ratelimited(hdev->dev,
...@@ -3457,16 +3465,9 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -3457,16 +3465,9 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
data->interrupt->interrupt_id); data->interrupt->interrupt_id);
rc = -EINTR; rc = -EINTR;
*status = HL_WAIT_CS_STATUS_ABORTED; *status = HL_WAIT_CS_STATUS_ABORTED;
} else {
if (pend->fence.error == -EIO) {
dev_err_ratelimited(hdev->dev,
"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
pend->fence.error);
rc = -EIO;
*status = HL_WAIT_CS_STATUS_ABORTED;
} else { } else {
/* The wait has timed-out. We don't know anything beyond that /* The wait has timed-out. We don't know anything beyond that
* because the workload wasn't submitted through the driver. * because the workload was not submitted through the driver.
* Therefore, from driver's perspective, the workload is still * Therefore, from driver's perspective, the workload is still
* executing. * executing.
*/ */
...@@ -3474,7 +3475,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, ...@@ -3474,7 +3475,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
*status = HL_WAIT_CS_STATUS_BUSY; *status = HL_WAIT_CS_STATUS_BUSY;
} }
} }
}
/* /*
* We keep removing the node from list here, and not at the irq handler * We keep removing the node from list here, and not at the irq handler
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment