Commit 29c0fa0f, authored by Kurt Hackel, committed by Mark Fasheh

ocfs2: handle network errors during recovery

Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
parent c3187ce5
...@@ -757,6 +757,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) ...@@ -757,6 +757,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
struct list_head *iter; struct list_head *iter;
int ret; int ret;
u8 dead_node, reco_master; u8 dead_node, reco_master;
int skip_all_done = 0;
dlm = item->dlm; dlm = item->dlm;
dead_node = item->u.ral.dead_node; dead_node = item->u.ral.dead_node;
...@@ -793,12 +794,18 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) ...@@ -793,12 +794,18 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
dlm_move_reco_locks_to_list(dlm, &resources, dead_node); dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
/* now we can begin blasting lockreses without the dlm lock */ /* now we can begin blasting lockreses without the dlm lock */
/* any errors returned will be due to the new_master dying,
* the dlm_reco_thread should detect this */
list_for_each(iter, &resources) { list_for_each(iter, &resources) {
res = list_entry (iter, struct dlm_lock_resource, recovering); res = list_entry (iter, struct dlm_lock_resource, recovering);
ret = dlm_send_one_lockres(dlm, res, mres, reco_master, ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
DLM_MRES_RECOVERY); DLM_MRES_RECOVERY);
if (ret < 0) if (ret < 0) {
mlog_errno(ret); mlog_errno(ret);
skip_all_done = 1;
break;
}
} }
/* move the resources back to the list */ /* move the resources back to the list */
...@@ -806,9 +813,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) ...@@ -806,9 +813,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
list_splice_init(&resources, &dlm->reco.resources); list_splice_init(&resources, &dlm->reco.resources);
spin_unlock(&dlm->spinlock); spin_unlock(&dlm->spinlock);
if (!skip_all_done) {
ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
if (ret < 0) if (ret < 0) {
mlog_errno(ret); mlog_errno(ret);
}
}
free_page((unsigned long)data); free_page((unsigned long)data);
} }
...@@ -828,8 +838,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ...@@ -828,8 +838,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret); sizeof(done_msg), send_to, &tmpret);
/* negative status is ignored by the caller */ if (ret < 0) {
if (ret >= 0) if (!dlm_is_host_down(ret)) {
mlog_errno(ret);
mlog(ML_ERROR, "%s: unknown error sending data-done "
"to %u\n", dlm->name, send_to);
BUG();
}
} else
ret = tmpret; ret = tmpret;
return ret; return ret;
} }
...@@ -1109,22 +1125,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, ...@@ -1109,22 +1125,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* we must send it immediately. */ * we must send it immediately. */
ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
res, total_locks); res, total_locks);
if (ret < 0) { if (ret < 0)
// TODO goto error;
mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
"returned %d, TODO\n", ret);
BUG();
}
} }
} }
/* flush any remaining locks */ /* flush any remaining locks */
ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
if (ret < 0) { if (ret < 0)
// TODO goto error;
mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " return ret;
"TODO\n", ret);
error:
mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
dlm->name, ret);
if (!dlm_is_host_down(ret))
BUG(); BUG();
} mlog(0, "%s: node %u went down while sending %s "
"lockres %.*s\n", dlm->name, send_to,
flags & DLM_MRES_RECOVERY ? "recovery" : "migration",
res->lockname.len, res->lockname.name);
return ret; return ret;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment