Commit a95cadb9 authored by Bart Van Assche, committed by Roland Dreier

IB/srp: Add periodic reconnect functionality

After a transport layer error has occurred, periodically try to reconnect
to the target until the dev_loss timer expires.  Protect the
callback functions that can be invoked from inside the SCSI EH
against concurrent invocation with srp_reconnect_rport() via the
rport mutex. Change the default dev_loss_tmo from 60s to 600s
to give the reconnect mechanism a chance to kick in.
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: David Dillow <dillowda@ornl.gov>
Signed-off-by: Roland Dreier <roland@purestorage.com>
parent 8c64e453
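
The periodic retry loop itself is not part of this diff: the patch below only wires the new reconnect_delay knob and the existing srp_rport_reconnect() callback into the SRP transport class via ib_srp_transport_functions (last hunk), and the transport class drives the reconnect attempts. A rough sketch of that loop, assuming the reconnect_work / reconnect_delay / state members that the companion scsi_transport_srp patch in this series adds to struct srp_rport (simplified; not the transport code verbatim):

/* Sketch only -- assumes struct srp_rport has reconnect_work, reconnect_delay
 * and state members (added by the scsi_transport_srp side of this series) and
 * the usual <linux/workqueue.h> helpers. */
static void srp_reconnect_work(struct work_struct *work)
{
        struct srp_rport *rport = container_of(to_delayed_work(work),
                                               struct srp_rport,
                                               reconnect_work);

        /* Try to reconnect; as long as the attempt fails and the rport has
         * not been declared lost (dev_loss timer not yet expired), requeue
         * this work after another reconnect_delay seconds. */
        if (srp_reconnect_rport(rport) != 0 && rport->state != SRP_RPORT_LOST)
                queue_delayed_work(system_long_wq, &rport->reconnect_work,
                                   rport->reconnect_delay * HZ);
}
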
@@ -88,6 +88,11 @@ MODULE_PARM_DESC(topspin_workarounds,
 
 static struct kernel_param_ops srp_tmo_ops;
 
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+                S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
 static int srp_fast_io_fail_tmo = 15;
 module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
                 S_IRUGO | S_IWUSR);
@@ -96,7 +101,7 @@ MODULE_PARM_DESC(fast_io_fail_tmo,
                  " layer error and failing all I/O. \"off\" means that this"
                  " functionality is disabled.");
 
-static int srp_dev_loss_tmo = 60;
+static int srp_dev_loss_tmo = 600;
 module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
                 S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(dev_loss_tmo,
@@ -144,10 +149,14 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp)
         } else {
                 tmo = -1;
         }
-        if (kp->arg == &srp_fast_io_fail_tmo)
-                res = srp_tmo_valid(-1, tmo, srp_dev_loss_tmo);
+        if (kp->arg == &srp_reconnect_delay)
+                res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+                                    srp_dev_loss_tmo);
+        else if (kp->arg == &srp_fast_io_fail_tmo)
+                res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
         else
-                res = srp_tmo_valid(-1, srp_fast_io_fail_tmo, tmo);
+                res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+                                    tmo);
         if (res)
                 goto out;
         *(int *)kp->arg = tmo;
@@ -1426,18 +1435,29 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr)
 static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 {
         struct srp_target_port *target = host_to_target(shost);
+        struct srp_rport *rport = target->rport;
         struct srp_request *req;
         struct srp_iu *iu;
         struct srp_cmd *cmd;
         struct ib_device *dev;
         unsigned long flags;
         int len, result;
+        const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
+
+        /*
+         * The SCSI EH thread is the only context from which srp_queuecommand()
+         * can get invoked for blocked devices (SDEV_BLOCK /
+         * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+         * locking the rport mutex if invoked from inside the SCSI EH.
+         */
+        if (in_scsi_eh)
+                mutex_lock(&rport->mutex);
 
         result = srp_chkready(target->rport);
         if (unlikely(result)) {
                 scmnd->result = result;
                 scmnd->scsi_done(scmnd);
-                return 0;
+                goto unlock_rport;
         }
 
         spin_lock_irqsave(&target->lock, flags);
@@ -1482,6 +1502,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
                 goto err_unmap;
         }
 
+unlock_rport:
+        if (in_scsi_eh)
+                mutex_unlock(&rport->mutex);
+
         return 0;
 
 err_unmap:
@@ -1496,6 +1520,9 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
 err_unlock:
         spin_unlock_irqrestore(&target->lock, flags);
 
+        if (in_scsi_eh)
+                mutex_unlock(&rport->mutex);
+
         return SCSI_MLQUEUE_HOST_BUSY;
 }
@@ -1780,6 +1807,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
 static int srp_send_tsk_mgmt(struct srp_target_port *target,
                              u64 req_tag, unsigned int lun, u8 func)
 {
+        struct srp_rport *rport = target->rport;
         struct ib_device *dev = target->srp_host->srp_dev->dev;
         struct srp_iu *iu;
         struct srp_tsk_mgmt *tsk_mgmt;
@@ -1789,12 +1817,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
 
         init_completion(&target->tsk_mgmt_done);
 
+        /*
+         * Lock the rport mutex to avoid that srp_create_target_ib() is
+         * invoked while a task management function is being sent.
+         */
+        mutex_lock(&rport->mutex);
         spin_lock_irq(&target->lock);
         iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT);
         spin_unlock_irq(&target->lock);
 
-        if (!iu)
+        if (!iu) {
+                mutex_unlock(&rport->mutex);
                 return -1;
+        }
 
         ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
                                    DMA_TO_DEVICE);
@@ -1811,8 +1847,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target,
                                       DMA_TO_DEVICE);
         if (srp_post_send(target, iu, sizeof *tsk_mgmt)) {
                 srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT);
+                mutex_unlock(&rport->mutex);
                 return -1;
         }
+        mutex_unlock(&rport->mutex);
 
         if (!wait_for_completion_timeout(&target->tsk_mgmt_done,
                                          msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
@@ -2713,6 +2752,7 @@ static void srp_remove_one(struct ib_device *device)
 static struct srp_function_template ib_srp_transport_functions = {
         .has_rport_state         = true,
         .reset_timer_if_blocked  = true,
+        .reconnect_delay         = &srp_reconnect_delay,
         .fast_io_fail_tmo        = &srp_fast_io_fail_tmo,
         .dev_loss_tmo            = &srp_dev_loss_tmo,
         .reconnect               = srp_rport_reconnect,
......
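
About the srp_tmo_set() hunk above: the three timeouts are now validated together because they only make sense relative to one another. A rough sketch of the kind of cross-checks srp_tmo_valid() performs (the actual checks live in the SRP transport class and may differ in detail):

/* Sketch only -- illustrates the cross-parameter validation, not verbatim code. */
int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo)
{
        if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0)
                return -EINVAL; /* at least one recovery mechanism must stay enabled */
        if (reconnect_delay == 0)
                return -EINVAL; /* a zero delay would retry in a tight loop */
        if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 &&
            fast_io_fail_tmo >= dev_loss_tmo)
                return -EINVAL; /* fast_io_fail must trigger before dev_loss */
        return 0;
}

Because all three parameters are registered through module_param_cb() with S_IRUGO | S_IWUSR, the same validation applies whether they are set at module load time or changed later through /sys/module/ib_srp/parameters/.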