Commit 6b4b87f2 authored by Dave Jiang's avatar Dave Jiang Committed by Vinod Koul

dmaengine: idxd: fix submission race window

Konstantin observed that when descriptors are submitted, the descriptor is
added to the pending list after the submission. This creates a race window
with the slight possibility that the descriptor can complete before it
gets added to the pending list and this window would cause the completion
handler to miss processing the descriptor.

To address the issue, the addition of the descriptor to the pending list
must be done before it gets submitted to the hardware. However, submitting
to swq with ENQCMDS instruction can cause a failure with the condition of
either wq is full or wq is not "active".

With the descriptor allocation being the gate to the wq capacity, it is not
possible to hit a retry with ENQCMDS submission to the swq. The only
possible failure can happen is when wq is no longer "active" due to hw
error and therefore we are moving towards taking down the portal. Given
this is a rare condition and there's no longer concern over I/O
performance, the driver can walk the completion lists in order to retrieve
and abort the descriptor.

The error path will set the descriptor to aborted status. It will take the
work list lock to prevent further processing of worklist. It will do a
delete_all on the pending llist to retrieve all descriptors on the pending
llist. The delete_all action does not require a lock. It will walk through
the acquired llist to find the aborted descriptor while add all remaining
descriptors to the work list since it holds the lock. If it does not find
the aborted descriptor on the llist, it will walk through the work
list. And if it still does not find the descriptor, then it means the
interrupt handler has removed the desc from the llist but is pending on
the work list lock and will process it once the error path releases the
lock.

Fixes: eb15e715 ("dmaengine: idxd: add interrupt handle request and release support")
Reported-by: default avatarKonstantin Ananyev <konstantin.ananyev@intel.com>
Signed-off-by: default avatarDave Jiang <dave.jiang@intel.com>
Link: https://lore.kernel.org/r/162628855747.360485.10101925573082466530.stgit@djiang5-desk3.ch.intel.comSigned-off-by: default avatarVinod Koul <vkoul@kernel.org>
parent 7eb25da1
...@@ -294,6 +294,14 @@ struct idxd_desc { ...@@ -294,6 +294,14 @@ struct idxd_desc {
struct idxd_wq *wq; struct idxd_wq *wq;
}; };
/*
* This is software defined error for the completion status. We overload the error code
* that will never appear in completion status and only SWERR register.
*/
enum idxd_completion_status {
IDXD_COMP_DESC_ABORT = 0xff,
};
#define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev) #define confdev_to_idxd(dev) container_of(dev, struct idxd_device, conf_dev)
#define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev) #define confdev_to_wq(dev) container_of(dev, struct idxd_wq, conf_dev)
...@@ -482,4 +490,10 @@ static inline void perfmon_init(void) {} ...@@ -482,4 +490,10 @@ static inline void perfmon_init(void) {}
static inline void perfmon_exit(void) {} static inline void perfmon_exit(void) {}
#endif #endif
static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason)
{
idxd_dma_complete_txd(desc, reason);
idxd_free_desc(desc->wq, desc);
}
#endif #endif
...@@ -245,12 +245,6 @@ static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr) ...@@ -245,12 +245,6 @@ static inline bool match_fault(struct idxd_desc *desc, u64 fault_addr)
return false; return false;
} }
static inline void complete_desc(struct idxd_desc *desc, enum idxd_complete_type reason)
{
idxd_dma_complete_txd(desc, reason);
idxd_free_desc(desc->wq, desc);
}
static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
enum irq_work_type wtype, enum irq_work_type wtype,
int *processed, u64 data) int *processed, u64 data)
...@@ -272,8 +266,16 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry, ...@@ -272,8 +266,16 @@ static int irq_process_pending_llist(struct idxd_irq_entry *irq_entry,
reason = IDXD_COMPLETE_DEV_FAIL; reason = IDXD_COMPLETE_DEV_FAIL;
llist_for_each_entry_safe(desc, t, head, llnode) { llist_for_each_entry_safe(desc, t, head, llnode) {
if (desc->completion->status) { u8 status = desc->completion->status & DSA_COMP_STATUS_MASK;
if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS)
if (status) {
if (unlikely(status == IDXD_COMP_DESC_ABORT)) {
complete_desc(desc, IDXD_COMPLETE_ABORT);
(*processed)++;
continue;
}
if (unlikely(status != DSA_COMP_SUCCESS))
match_fault(desc, data); match_fault(desc, data);
complete_desc(desc, reason); complete_desc(desc, reason);
(*processed)++; (*processed)++;
...@@ -329,7 +331,14 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry, ...@@ -329,7 +331,14 @@ static int irq_process_work_list(struct idxd_irq_entry *irq_entry,
spin_unlock_irqrestore(&irq_entry->list_lock, flags); spin_unlock_irqrestore(&irq_entry->list_lock, flags);
list_for_each_entry(desc, &flist, list) { list_for_each_entry(desc, &flist, list) {
if ((desc->completion->status & DSA_COMP_STATUS_MASK) != DSA_COMP_SUCCESS) u8 status = desc->completion->status & DSA_COMP_STATUS_MASK;
if (unlikely(status == IDXD_COMP_DESC_ABORT)) {
complete_desc(desc, IDXD_COMPLETE_ABORT);
continue;
}
if (unlikely(status != DSA_COMP_SUCCESS))
match_fault(desc, data); match_fault(desc, data);
complete_desc(desc, reason); complete_desc(desc, reason);
} }
......
...@@ -87,9 +87,64 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc) ...@@ -87,9 +87,64 @@ void idxd_free_desc(struct idxd_wq *wq, struct idxd_desc *desc)
sbitmap_queue_clear(&wq->sbq, desc->id, cpu); sbitmap_queue_clear(&wq->sbq, desc->id, cpu);
} }
static struct idxd_desc *list_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
struct idxd_desc *desc)
{
struct idxd_desc *d, *n;
lockdep_assert_held(&ie->list_lock);
list_for_each_entry_safe(d, n, &ie->work_list, list) {
if (d == desc) {
list_del(&d->list);
return d;
}
}
/*
* At this point, the desc needs to be aborted is held by the completion
* handler where it has taken it off the pending list but has not added to the
* work list. It will be cleaned up by the interrupt handler when it sees the
* IDXD_COMP_DESC_ABORT for completion status.
*/
return NULL;
}
static void llist_abort_desc(struct idxd_wq *wq, struct idxd_irq_entry *ie,
struct idxd_desc *desc)
{
struct idxd_desc *d, *t, *found = NULL;
struct llist_node *head;
unsigned long flags;
desc->completion->status = IDXD_COMP_DESC_ABORT;
/*
* Grab the list lock so it will block the irq thread handler. This allows the
* abort code to locate the descriptor need to be aborted.
*/
spin_lock_irqsave(&ie->list_lock, flags);
head = llist_del_all(&ie->pending_llist);
if (head) {
llist_for_each_entry_safe(d, t, head, llnode) {
if (d == desc) {
found = desc;
continue;
}
list_add_tail(&desc->list, &ie->work_list);
}
}
if (!found)
found = list_abort_desc(wq, ie, desc);
spin_unlock_irqrestore(&ie->list_lock, flags);
if (found)
complete_desc(found, IDXD_COMPLETE_ABORT);
}
int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
{ {
struct idxd_device *idxd = wq->idxd; struct idxd_device *idxd = wq->idxd;
struct idxd_irq_entry *ie = NULL;
void __iomem *portal; void __iomem *portal;
int rc; int rc;
...@@ -107,6 +162,16 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) ...@@ -107,6 +162,16 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
* even on UP because the recipient is a device. * even on UP because the recipient is a device.
*/ */
wmb(); wmb();
/*
* Pending the descriptor to the lockless list for the irq_entry
* that we designated the descriptor to.
*/
if (desc->hw->flags & IDXD_OP_FLAG_RCI) {
ie = &idxd->irq_entries[desc->vector];
llist_add(&desc->llnode, &ie->pending_llist);
}
if (wq_dedicated(wq)) { if (wq_dedicated(wq)) {
iosubmit_cmds512(portal, desc->hw, 1); iosubmit_cmds512(portal, desc->hw, 1);
} else { } else {
...@@ -117,18 +182,13 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc) ...@@ -117,18 +182,13 @@ int idxd_submit_desc(struct idxd_wq *wq, struct idxd_desc *desc)
* device is not accepting descriptor at all. * device is not accepting descriptor at all.
*/ */
rc = enqcmds(portal, desc->hw); rc = enqcmds(portal, desc->hw);
if (rc < 0) if (rc < 0) {
if (ie)
llist_abort_desc(wq, ie, desc);
return rc; return rc;
} }
}
percpu_ref_put(&wq->wq_active); percpu_ref_put(&wq->wq_active);
/*
* Pending the descriptor to the lockless list for the irq_entry
* that we designated the descriptor to.
*/
if (desc->hw->flags & IDXD_OP_FLAG_RCI)
llist_add(&desc->llnode, &idxd->irq_entries[desc->vector].pending_llist);
return 0; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment