Commit f5419a42 authored by Damian Muszynski, committed by Herbert Xu

crypto: qat - add auto reset on error

Expose the `auto_reset` sysfs attribute to configure the driver to reset
the device when a fatal error is detected.

When auto reset is enabled, the driver resets the device when it detects
either a heartbeat failure or a fatal error through an interrupt.

This patch is based on earlier work done by Shashank Gupta.
Signed-off-by: Damian Muszynski <damian.muszynski@intel.com>
Reviewed-by: Ahsan Atta <ahsan.atta@intel.com>
Reviewed-by: Markas Rapoportas <markas.rapoportas@intel.com>
Reviewed-by: Giovanni Cabiddu <giovanni.cabiddu@intel.com>
Signed-off-by: Mun Chun Yep <mun.chun.yep@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 2aaa1995
...@@ -141,3 +141,23 @@ Description: ...@@ -141,3 +141,23 @@ Description:
64 64
This attribute is only available for qat_4xxx devices. This attribute is only available for qat_4xxx devices.
What: /sys/bus/pci/devices/<BDF>/qat/auto_reset
Date: March 2024
KernelVersion: 6.8
Contact: qat-linux@intel.com
Description: (RW) Reports the current state of the autoreset feature
for a QAT device.
Write to the attribute to enable or disable device auto reset.
Device auto reset is disabled by default.
The values are::
* 1/Yy/on: auto reset enabled. If the device encounters an
unrecoverable error, it will be reset automatically.
* 0/Nn/off: auto reset disabled. If the device encounters an
unrecoverable error, it will not be reset.
This attribute is only available for qat_4xxx devices.
...@@ -402,6 +402,7 @@ struct adf_accel_dev { ...@@ -402,6 +402,7 @@ struct adf_accel_dev {
struct adf_error_counters ras_errors; struct adf_error_counters ras_errors;
struct mutex state_lock; /* protect state of the device */ struct mutex state_lock; /* protect state of the device */
bool is_vf; bool is_vf;
bool autoreset_on_error;
u32 accel_id; u32 accel_id;
}; };
#endif #endif
...@@ -204,6 +204,14 @@ const struct pci_error_handlers adf_err_handler = { ...@@ -204,6 +204,14 @@ const struct pci_error_handlers adf_err_handler = {
}; };
EXPORT_SYMBOL_GPL(adf_err_handler); EXPORT_SYMBOL_GPL(adf_err_handler);
/*
 * adf_dev_autoreset() - schedule a device reset if auto reset is enabled
 * @accel_dev: acceleration device to act on
 *
 * When accel_dev->autoreset_on_error is not set, nothing is scheduled.
 * Otherwise adf_dev_aer_schedule_reset() is called with ADF_DEV_RESET_ASYNC.
 *
 * Return: 0 if auto reset is disabled, otherwise the value returned by
 * adf_dev_aer_schedule_reset().
 */
int adf_dev_autoreset(struct adf_accel_dev *accel_dev)
{
	/* Auto reset is opt-in: bail out early when it is switched off. */
	if (!accel_dev->autoreset_on_error)
		return 0;

	return adf_dev_aer_schedule_reset(accel_dev, ADF_DEV_RESET_ASYNC);
}
static void adf_notify_fatal_error_worker(struct work_struct *work) static void adf_notify_fatal_error_worker(struct work_struct *work)
{ {
struct adf_fatal_error_data *wq_data = struct adf_fatal_error_data *wq_data =
...@@ -215,10 +223,11 @@ static void adf_notify_fatal_error_worker(struct work_struct *work) ...@@ -215,10 +223,11 @@ static void adf_notify_fatal_error_worker(struct work_struct *work)
if (!accel_dev->is_vf) { if (!accel_dev->is_vf) {
/* Disable arbitration to stop processing of new requests */ /* Disable arbitration to stop processing of new requests */
if (hw_device->exit_arb) if (accel_dev->autoreset_on_error && hw_device->exit_arb)
hw_device->exit_arb(accel_dev); hw_device->exit_arb(accel_dev);
if (accel_dev->pf.vf_info) if (accel_dev->pf.vf_info)
adf_pf2vf_notify_fatal_error(accel_dev); adf_pf2vf_notify_fatal_error(accel_dev);
adf_dev_autoreset(accel_dev);
} }
kfree(wq_data); kfree(wq_data);
......
...@@ -87,6 +87,7 @@ int adf_ae_stop(struct adf_accel_dev *accel_dev); ...@@ -87,6 +87,7 @@ int adf_ae_stop(struct adf_accel_dev *accel_dev);
extern const struct pci_error_handlers adf_err_handler; extern const struct pci_error_handlers adf_err_handler;
void adf_reset_sbr(struct adf_accel_dev *accel_dev); void adf_reset_sbr(struct adf_accel_dev *accel_dev);
void adf_reset_flr(struct adf_accel_dev *accel_dev); void adf_reset_flr(struct adf_accel_dev *accel_dev);
int adf_dev_autoreset(struct adf_accel_dev *accel_dev);
void adf_dev_restore(struct adf_accel_dev *accel_dev); void adf_dev_restore(struct adf_accel_dev *accel_dev);
int adf_init_aer(void); int adf_init_aer(void);
void adf_exit_aer(void); void adf_exit_aer(void);
......
...@@ -204,6 +204,42 @@ static ssize_t pm_idle_enabled_store(struct device *dev, struct device_attribute ...@@ -204,6 +204,42 @@ static ssize_t pm_idle_enabled_store(struct device *dev, struct device_attribute
} }
static DEVICE_ATTR_RW(pm_idle_enabled); static DEVICE_ATTR_RW(pm_idle_enabled);
/*
 * auto_reset_show() - sysfs read handler for the auto_reset attribute
 * @dev: device the attribute belongs to
 * @attr: attribute being read (unused)
 * @buf: output buffer handed to sysfs_emit()
 *
 * Emits "on" or "off", followed by a newline, depending on the value of
 * accel_dev->autoreset_on_error.
 *
 * Return: the value returned by sysfs_emit(), or -EINVAL if no accel device
 * is found for the PCI device.
 */
static ssize_t auto_reset_show(struct device *dev, struct device_attribute *attr,
			       char *buf)
{
	struct adf_accel_dev *qat_dev =
		adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));

	if (!qat_dev)
		return -EINVAL;

	return sysfs_emit(buf, "%s\n",
			  qat_dev->autoreset_on_error ? "on" : "off");
}
/*
 * auto_reset_store() - sysfs write handler for the auto_reset attribute
 * @dev: device the attribute belongs to
 * @attr: attribute being written (unused)
 * @buf: user-supplied string, parsed with kstrtobool()
 * @count: number of bytes written by the caller
 *
 * Parses @buf as a boolean and stores the result in
 * accel_dev->autoreset_on_error.
 *
 * Return: @count on success, the kstrtobool() error if @buf cannot be
 * parsed, or -EINVAL if no accel device is found for the PCI device.
 */
static ssize_t auto_reset_store(struct device *dev, struct device_attribute *attr,
				const char *buf, size_t count)
{
	struct adf_accel_dev *qat_dev;
	bool want_reset = false;
	int err;

	/* Validate the input before looking up the device. */
	err = kstrtobool(buf, &want_reset);
	if (err)
		return err;

	qat_dev = adf_devmgr_pci_to_accel_dev(to_pci_dev(dev));
	if (!qat_dev)
		return -EINVAL;

	qat_dev->autoreset_on_error = want_reset;

	return count;
}
static DEVICE_ATTR_RW(auto_reset);
static DEVICE_ATTR_RW(state); static DEVICE_ATTR_RW(state);
static DEVICE_ATTR_RW(cfg_services); static DEVICE_ATTR_RW(cfg_services);
...@@ -291,6 +327,7 @@ static struct attribute *qat_attrs[] = { ...@@ -291,6 +327,7 @@ static struct attribute *qat_attrs[] = {
&dev_attr_pm_idle_enabled.attr, &dev_attr_pm_idle_enabled.attr,
&dev_attr_rp2srv.attr, &dev_attr_rp2srv.attr,
&dev_attr_num_rps.attr, &dev_attr_num_rps.attr,
&dev_attr_auto_reset.attr,
NULL, NULL,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment