Commit 3bfaf333 authored by unknown's avatar unknown

ndb - bug#16772

  don't allow a node to join the cluster until all nodes have completed failure handling


ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  When receiving CM_ADD for a node whose failure handling I have not yet completed, do _not_ just override its state.
  Instead, record the pending state and send CM_ACK_ADD from execCONNECT_REP (much later).
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase for bug#16772
ndb/test/run-test/daily-basic-tests.txt:
  Run test in basic suite
parent 6ac6b08c
...@@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout) ...@@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout)
void Qmgr::execCONNECT_REP(Signal* signal) void Qmgr::execCONNECT_REP(Signal* signal)
{ {
jamEntry();
const Uint32 nodeId = signal->theData[0]; const Uint32 nodeId = signal->theData[0];
c_connectedNodes.set(nodeId); c_connectedNodes.set(nodeId);
NodeRecPtr nodePtr; NodeRecPtr nodePtr;
...@@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal) ...@@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal)
ptrCheckGuard(nodePtr, MAX_NODES, nodeRec); ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
switch(nodePtr.p->phase){ switch(nodePtr.p->phase){
case ZSTARTING: case ZSTARTING:
case ZRUNNING:
jam();
if(!c_start.m_nodes.isWaitingFor(nodeId)){
jam(); jam();
return;
}
break; break;
case ZRUNNING:
case ZPREPARE_FAIL: case ZPREPARE_FAIL:
case ZFAIL_CLOSING: case ZFAIL_CLOSING:
jam(); jam();
...@@ -278,20 +283,27 @@ void Qmgr::execCONNECT_REP(Signal* signal) ...@@ -278,20 +283,27 @@ void Qmgr::execCONNECT_REP(Signal* signal)
return; return;
} }
if(!c_start.m_nodes.isWaitingFor(nodeId)){
jam();
return;
}
switch(c_start.m_gsn){ switch(c_start.m_gsn){
case GSN_CM_REGREQ: case GSN_CM_REGREQ:
jam(); jam();
sendCmRegReq(signal, nodeId); sendCmRegReq(signal, nodeId);
return; return;
case GSN_CM_NODEINFOREQ:{ case GSN_CM_NODEINFOREQ:
jam(); jam();
sendCmNodeInfoReq(signal, nodeId, nodePtr.p); sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
return; return;
case GSN_CM_ADD:{
jam();
ndbrequire(getOwnNodeId() != cpresident);
c_start.m_nodes.clearWaitingFor(nodeId);
c_start.m_gsn = RNIL;
NodeRecPtr addNodePtr;
addNodePtr.i = nodeId;
ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
cmAddPrepare(signal, addNodePtr, nodePtr.p);
return;
} }
default: default:
return; return;
...@@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ ...@@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){
return; return;
case ZFAIL_CLOSING: case ZFAIL_CLOSING:
jam(); jam();
#ifdef VM_TRACE
ndbout_c("Enabling communication to CM_ADD node state=%d", #if 1
warningEvent("Recieved request to incorperate node %u, "
"while error handling has not yet completed",
nodePtr.i);
ndbrequire(getOwnNodeId() != cpresident);
ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
c_start.m_nodes.clearWaitingFor();
c_start.m_nodes.setWaitingFor(nodePtr.i);
c_start.m_gsn = GSN_CM_ADD;
#else
warningEvent("Enabling communication to CM_ADD node %u state=%d",
nodePtr.i,
nodePtr.p->phase); nodePtr.p->phase);
#endif
nodePtr.p->phase = ZSTARTING; nodePtr.p->phase = ZSTARTING;
nodePtr.p->failState = NORMAL; nodePtr.p->failState = NORMAL;
signal->theData[0] = 0; signal->theData[0] = 0;
signal->theData[1] = nodePtr.i; signal->theData[1] = nodePtr.i;
sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA); sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
#endif
return; return;
case ZSTARTING: case ZSTARTING:
break; break;
...@@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal) ...@@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal)
jamEntry(); jamEntry();
failedNodePtr.i = signal->theData[0]; failedNodePtr.i = signal->theData[0];
if (ERROR_INSERTED(930))
{
CLEAR_ERROR_INSERT_VALUE;
infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
return;
}
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){ if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
failedNodePtr.p->failState = NORMAL; failedNodePtr.p->failState = NORMAL;
} else { } else {
jam(); jam();
char buf[100];
BaseString::snprintf(buf, 100,
"Received NDB_FAILCONF for node %u with state: %d %d",
failedNodePtr.i,
failedNodePtr.p->phase,
failedNodePtr.p->failState);
progError(__LINE__, 0, buf);
systemErrorLab(signal, __LINE__); systemErrorLab(signal, __LINE__);
}//if }//if
if (cpresident == getOwnNodeId()) { if (cpresident == getOwnNodeId()) {
...@@ -2077,7 +2117,39 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, ...@@ -2077,7 +2117,39 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.i == getOwnNodeId()) { if (failedNodePtr.i == getOwnNodeId()) {
jam(); jam();
systemErrorLab(signal, __LINE__);
const char * msg = 0;
switch(aFailCause){
case FailRep::ZOWN_FAILURE:
msg = "Own failure";
break;
case FailRep::ZOTHER_NODE_WHEN_WE_START:
case FailRep::ZOTHERNODE_FAILED_DURING_START:
msg = "Other node died during start";
break;
case FailRep::ZIN_PREP_FAIL_REQ:
msg = "Prep fail";
break;
case FailRep::ZSTART_IN_REGREQ:
msg = "Start timeout";
break;
case FailRep::ZHEARTBEAT_FAILURE:
msg = "Hearbeat failure";
break;
case FailRep::ZLINK_FAILURE:
msg = "Connection failure";
break;
}
char buf[100];
BaseString::snprintf(buf, 100,
"We(%u) have been declared dead by %u reason: %s(%u)",
getOwnNodeId(),
refToNode(signal->getSendersBlockRef()),
aFailCause,
msg ? msg : "<Unknown>");
progError(__LINE__, 0, buf);
return; return;
}//if }//if
...@@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal, ...@@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal,
cfailureNr = cprepareFailureNr; cfailureNr = cprepareFailureNr;
ctoFailureNr = 0; ctoFailureNr = 0;
ctoStatus = Q_ACTIVE; ctoStatus = Q_ACTIVE;
c_start.reset(); // Don't take over nodes being started
if (cnoCommitFailedNodes > 0) { if (cnoCommitFailedNodes > 0) {
jam(); jam();
/**----------------------------------------------------------------- /**-----------------------------------------------------------------
......
...@@ -535,6 +535,52 @@ int runBug15685(NDBT_Context* ctx, NDBT_Step* step){ ...@@ -535,6 +535,52 @@ int runBug15685(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_FAILED; return NDBT_FAILED;
} }
/**
 * Regression test for bug#16772: a restarted node must not be able to join
 * the cluster while another node still has failure handling outstanding
 * for it.
 *
 * Error insert 930 is armed on a surviving non-master node, a different
 * node is crash-restarted, and the restarted node is then expected to
 * stall before start phase 3. The test finally injects the missing
 * NDB_FAILCONF through a dump command and waits for the restart to finish.
 *
 * @param ctx   test context; only used to stop the test when the cluster
 *              has fewer than two data nodes
 * @param step  not used
 * @return NDBT_OK when the start was observed to stall and the node later
 *         started, or when the test is not applicable; otherwise NDBT_FAILED
 */
int
runBug16772(NDBT_Context* ctx, NDBT_Step* step){

  NdbRestarter res;
  if (res.getNumDbNodes() < 2)
  {
    // One survivor and one victim are required; nothing to test otherwise.
    ctx->stopTest();
    return NDBT_OK;
  }

  const int survivor = res.getRandomNotMasterNodeId(rand());

  // Keep drawing until the victim is a different node than the survivor.
  int victim;
  do
  {
    victim = res.getDbNodeId(rand() % res.getNumDbNodes());
  } while (victim == survivor);

  // Arm error insert 930 on the survivor before taking the victim down.
  if (res.insertErrorInNode(survivor, 930) != 0)
  {
    return NDBT_FAILED;
  }

  const bool initial = false;
  const bool nostart = true;
  const bool abort = true;
  if (res.restartOneDbNode(victim, initial, nostart, abort) != 0)
  {
    return NDBT_FAILED;
  }

  // Wait for the victim to reach no-start, then let it start again.
  if (res.waitNodesNoStart(&victim, 1) != 0 ||
      res.startNodes(&victim, 1) != 0)
  {
    return NDBT_FAILED;
  }

  // The start should now hang because NDB_FAILCONF is thrown away, so this
  // wait is expected to fail, i.e. start phase 3 must not be reached.
  // NOTE(review): the trailing 10 is presumably a timeout - confirm.
  const int stalled = res.waitNodesStartPhase(&victim, 1, 3, 10);

  // Now send an NDB_FAILCONF for the victim to the survivor.
  // NOTE(review): 323 / 252 are presumably the signal and block numbers
  // understood by dump 7020 - confirm.
  int dumpArgs[] = { 7020, 323, 252, victim };
  if (res.dumpStateOneNode(survivor, dumpArgs, 4) != 0)
  {
    return NDBT_FAILED;
  }

  if (res.waitNodesStarted(&victim, 1) != 0)
  {
    return NDBT_FAILED;
  }

  // Passing requires that the earlier start-phase wait did NOT succeed.
  if (stalled != 0)
  {
    return NDBT_OK;
  }
  return NDBT_FAILED;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
...@@ -820,6 +866,10 @@ TESTCASE("Bug15685", ...@@ -820,6 +866,10 @@ TESTCASE("Bug15685",
STEP(runBug15685); STEP(runBug15685);
FINALIZER(runClearTable); FINALIZER(runClearTable);
} }
TESTCASE("Bug16772",
"Test bug with restarting before NF handling is complete"){
STEP(runBug16772);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -446,6 +446,10 @@ max-time: 500 ...@@ -446,6 +446,10 @@ max-time: 500
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug15685 T1 args: -n Bug15685 T1
max-time: 500
cmd: testNodeRestart
args: -n Bug16772 T1
# OLD FLEX # OLD FLEX
max-time: 500 max-time: 500
cmd: flexBench cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment