Commit 9b18a0c7 authored by unknown's avatar unknown

ndb - bug#16772

  dont't allow node to join cluster until all nodes has completed failure handling


ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  When getting CM_ADD for node that I haven't completed failure handling for do _not_ just override.
  But instead set state...and send CM_ACK_ADD on execCONNECT_REP (much...later)
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase for bug#16772
ndb/test/run-test/daily-basic-tests.txt:
  Run test in basic suite
parent 1db2668d
......@@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout)
void Qmgr::execCONNECT_REP(Signal* signal)
{
jamEntry();
const Uint32 nodeId = signal->theData[0];
c_connectedNodes.set(nodeId);
NodeRecPtr nodePtr;
......@@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal)
ptrCheckGuard(nodePtr, MAX_NODES, nodeRec);
switch(nodePtr.p->phase){
case ZSTARTING:
case ZRUNNING:
jam();
if(!c_start.m_nodes.isWaitingFor(nodeId)){
jam();
return;
}
break;
case ZRUNNING:
case ZPREPARE_FAIL:
case ZFAIL_CLOSING:
jam();
......@@ -278,20 +283,27 @@ void Qmgr::execCONNECT_REP(Signal* signal)
return;
}
if(!c_start.m_nodes.isWaitingFor(nodeId)){
jam();
return;
}
switch(c_start.m_gsn){
case GSN_CM_REGREQ:
jam();
sendCmRegReq(signal, nodeId);
return;
case GSN_CM_NODEINFOREQ:{
case GSN_CM_NODEINFOREQ:
jam();
sendCmNodeInfoReq(signal, nodeId, nodePtr.p);
return;
case GSN_CM_ADD:{
jam();
ndbrequire(getOwnNodeId() != cpresident);
c_start.m_nodes.clearWaitingFor(nodeId);
c_start.m_gsn = RNIL;
NodeRecPtr addNodePtr;
addNodePtr.i = nodeId;
ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec);
cmAddPrepare(signal, addNodePtr, nodePtr.p);
return;
}
default:
return;
......@@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){
return;
case ZFAIL_CLOSING:
jam();
#ifdef VM_TRACE
ndbout_c("Enabling communication to CM_ADD node state=%d",
#if 1
warningEvent("Recieved request to incorperate node %u, "
"while error handling has not yet completed",
nodePtr.i);
ndbrequire(getOwnNodeId() != cpresident);
ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD);
c_start.m_nodes.clearWaitingFor();
c_start.m_nodes.setWaitingFor(nodePtr.i);
c_start.m_gsn = GSN_CM_ADD;
#else
warningEvent("Enabling communication to CM_ADD node %u state=%d",
nodePtr.i,
nodePtr.p->phase);
#endif
nodePtr.p->phase = ZSTARTING;
nodePtr.p->failState = NORMAL;
signal->theData[0] = 0;
signal->theData[1] = nodePtr.i;
sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA);
#endif
return;
case ZSTARTING:
break;
......@@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal)
jamEntry();
failedNodePtr.i = signal->theData[0];
if (ERROR_INSERTED(930))
{
CLEAR_ERROR_INSERT_VALUE;
infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i);
return;
}
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){
failedNodePtr.p->failState = NORMAL;
} else {
jam();
char buf[100];
BaseString::snprintf(buf, 100,
"Received NDB_FAILCONF for node %u with state: %d %d",
failedNodePtr.i,
failedNodePtr.p->phase,
failedNodePtr.p->failState);
progError(__LINE__, 0, buf);
systemErrorLab(signal, __LINE__);
}//if
if (cpresident == getOwnNodeId()) {
......@@ -2077,7 +2117,39 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
if (failedNodePtr.i == getOwnNodeId()) {
jam();
systemErrorLab(signal, __LINE__);
const char * msg = 0;
switch(aFailCause){
case FailRep::ZOWN_FAILURE:
msg = "Own failure";
break;
case FailRep::ZOTHER_NODE_WHEN_WE_START:
case FailRep::ZOTHERNODE_FAILED_DURING_START:
msg = "Other node died during start";
break;
case FailRep::ZIN_PREP_FAIL_REQ:
msg = "Prep fail";
break;
case FailRep::ZSTART_IN_REGREQ:
msg = "Start timeout";
break;
case FailRep::ZHEARTBEAT_FAILURE:
msg = "Hearbeat failure";
break;
case FailRep::ZLINK_FAILURE:
msg = "Connection failure";
break;
}
char buf[100];
BaseString::snprintf(buf, 100,
"We(%u) have been declared dead by %u reason: %s(%u)",
getOwnNodeId(),
refToNode(signal->getSendersBlockRef()),
aFailCause,
msg ? msg : "<Unknown>");
progError(__LINE__, 0, buf);
return;
}//if
......@@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal,
cfailureNr = cprepareFailureNr;
ctoFailureNr = 0;
ctoStatus = Q_ACTIVE;
c_start.reset(); // Don't take over nodes being started
if (cnoCommitFailedNodes > 0) {
jam();
/**-----------------------------------------------------------------
......
......@@ -535,6 +535,52 @@ int runBug15685(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_FAILED;
}
int
runBug16772(NDBT_Context* ctx, NDBT_Step* step){
NdbRestarter restarter;
if (restarter.getNumDbNodes() < 2)
{
ctx->stopTest();
return NDBT_OK;
}
int aliveNodeId = restarter.getRandomNotMasterNodeId(rand());
int deadNodeId = aliveNodeId;
while (deadNodeId == aliveNodeId)
deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes());
if (restarter.insertErrorInNode(aliveNodeId, 930))
return NDBT_FAILED;
if (restarter.restartOneDbNode(deadNodeId,
/** initial */ false,
/** nostart */ true,
/** abort */ true))
return NDBT_FAILED;
if (restarter.waitNodesNoStart(&deadNodeId, 1))
return NDBT_FAILED;
if (restarter.startNodes(&deadNodeId, 1))
return NDBT_FAILED;
// It should now be hanging since we throw away NDB_FAILCONF
int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10);
// So this should fail...i.e it should not reach startphase 3
// Now send a NDB_FAILCONF for deadNo
int dump[] = { 7020, 323, 252, 0 };
dump[3] = deadNodeId;
if (restarter.dumpStateOneNode(aliveNodeId, dump, 4))
return NDBT_FAILED;
if (restarter.waitNodesStarted(&deadNodeId, 1))
return NDBT_FAILED;
return ret ? NDBT_OK : NDBT_FAILED;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -820,6 +866,10 @@ TESTCASE("Bug15685",
STEP(runBug15685);
FINALIZER(runClearTable);
}
TESTCASE("Bug16772",
"Test bug with restarting before NF handling is complete"){
STEP(runBug16772);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -446,6 +446,10 @@ max-time: 500
cmd: testNodeRestart
args: -n Bug15685 T1
max-time: 500
cmd: testNodeRestart
args: -n Bug16772 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment