Commit 162a2e41 authored by unknown's avatar unknown

ndb - bug#25364

  On master node failure during QMGR COMMIT_FAILREQ handling,
    make sure all committed failed nodes are removed from the failed/prepfailed arrays


ndb/include/kernel/signaldata/DumpStateOrd.hpp:
  new error code
ndb/src/kernel/blocks/ERROR_codes.txt:
  new error code
ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  extra error insert variable
ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  make sure to remove all committed failed nodes from failed/prepfailed arrays
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 2e39b557
...@@ -68,6 +68,7 @@ public: ...@@ -68,6 +68,7 @@ public:
// 100-105 TUP and ACC // 100-105 TUP and ACC
// 200-240 UTIL // 200-240 UTIL
// 300-305 TRIX // 300-305 TRIX
QmgrErr935 = 935,
NdbfsDumpFileStat = 400, NdbfsDumpFileStat = 400,
NdbfsDumpAllFiles = 401, NdbfsDumpAllFiles = 401,
NdbfsDumpOpenFiles = 402, NdbfsDumpOpenFiles = 402,
......
...@@ -21,6 +21,9 @@ Crash president when he starts to run in ArbitState 1-9. ...@@ -21,6 +21,9 @@ Crash president when he starts to run in ArbitState 1-9.
910: Crash new president after node crash 910: Crash new president after node crash
935 : Crash master on node failure (delayed)
and skip sending GSN_COMMIT_FAILREQ to specified node
ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, GLOBAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -426,6 +426,10 @@ private: ...@@ -426,6 +426,10 @@ private:
StopReq c_stopReq; StopReq c_stopReq;
bool check_multi_node_shutdown(Signal* signal); bool check_multi_node_shutdown(Signal* signal);
#ifdef ERROR_INSERT
Uint32 c_error_insert_extra;
#endif
}; };
#endif #endif
...@@ -3110,6 +3110,18 @@ Qmgr::sendCommitFailReq(Signal* signal) ...@@ -3110,6 +3110,18 @@ Qmgr::sendCommitFailReq(Signal* signal)
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
jam(); jam();
ptrAss(nodePtr, nodeRec); ptrAss(nodePtr, nodeRec);
#ifdef ERROR_INSERT
if (ERROR_INSERTED(935) && nodePtr.i == c_error_insert_extra)
{
ndbout_c("skipping node %d", c_error_insert_extra);
CLEAR_ERROR_INSERT_VALUE;
signal->theData[0] = 9999;
sendSignalWithDelay(CMVMI_REF, GSN_NDB_TAMPER, signal, 1000, 1);
continue;
}
#endif
if (nodePtr.p->phase == ZRUNNING) { if (nodePtr.p->phase == ZRUNNING) {
jam(); jam();
nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE; nodePtr.p->sendCommitFailReqStatus = Q_ACTIVE;
...@@ -3180,6 +3192,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal) ...@@ -3180,6 +3192,33 @@ void Qmgr::execPREP_FAILREF(Signal* signal)
return; return;
}//Qmgr::execPREP_FAILREF() }//Qmgr::execPREP_FAILREF()
/**
 * Remove from dst[0..dstcnt) every node id that also occurs in
 * src[0..srccnt). Surviving entries are compacted to the front of dst
 * in their original relative order.
 *
 * @param dstcnt  number of valid entries in dst
 * @param dst     array filtered in place
 * @param srccnt  number of valid entries in src
 * @param src     node ids that must be removed from dst
 * @return        number of entries remaining in dst
 */
static
Uint32
clear_nodes(Uint32 dstcnt, Uint16 dst[], Uint32 srccnt, const Uint16 src[])
{
  // Nothing to remove: dst is left untouched.
  if (srccnt == 0)
    return dstcnt;

  Uint32 pos = 0;
  for (Uint32 i = 0; i < dstcnt; i++)
  {
    const Uint16 node = dst[i];
    bool remove = false;
    for (Uint32 j = 0; j < srccnt; j++)
    {
      // Must be matched against src (the nodes to remove). Comparing with
      // dst[j] would test dst against itself, drop the wrong entries and
      // read past dstcnt whenever srccnt > dstcnt.
      if (node == src[j])
      {
        remove = true;
        break;
      }
    }
    if (!remove)
    {
      // pos <= i always holds, so this never overwrites an unread entry.
      dst[pos++] = node;
    }
  }
  return pos;
}
/*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/
/* THE PRESIDENT IS NOW COMMITTING THE PREVIOUSLY PREPARED NODE FAILURE. */ /* THE PRESIDENT IS NOW COMMITTING THE PREVIOUSLY PREPARED NODE FAILURE. */
/*---------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/
...@@ -3267,19 +3306,18 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* signal) ...@@ -3267,19 +3306,18 @@ void Qmgr::execCOMMIT_FAILREQ(Signal* signal)
NodeFailRep::SignalLength, JBB); NodeFailRep::SignalLength, JBB);
}//if }//if
}//for }//for
if (cpresident != getOwnNodeId()) {
jam(); /**
cnoFailedNodes = cnoCommitFailedNodes - cnoFailedNodes; * Remove committed nodes from failed/prepared
if (cnoFailedNodes > 0) { */
jam(); cnoFailedNodes = clear_nodes(cnoFailedNodes,
guard0 = cnoFailedNodes - 1; cfailedNodes,
arrGuard(guard0 + cnoCommitFailedNodes, MAX_NDB_NODES); cnoCommitFailedNodes,
for (Tj = 0; Tj <= guard0; Tj++) { ccommitFailedNodes);
jam(); cnoPrepFailedNodes = clear_nodes(cnoPrepFailedNodes,
cfailedNodes[Tj] = cfailedNodes[Tj + cnoCommitFailedNodes]; cprepFailedNodes,
}//for cnoCommitFailedNodes,
}//if ccommitFailedNodes);
}//if
cnoCommitFailedNodes = 0; cnoCommitFailedNodes = 0;
}//if }//if
/**----------------------------------------------------------------------- /**-----------------------------------------------------------------------
...@@ -4658,6 +4696,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal) ...@@ -4658,6 +4696,14 @@ Qmgr::execDUMP_STATE_ORD(Signal* signal)
default: default:
; ;
}//switch }//switch
#ifdef ERROR_INSERT
if (signal->theData[0] == 935 && signal->getLength() == 2)
{
SET_ERROR_INSERT_VALUE(935);
c_error_insert_extra = signal->theData[1];
}
#endif
}//Qmgr::execDUMP_STATE_ORD() }//Qmgr::execDUMP_STATE_ORD()
void Qmgr::execSET_VAR_REQ(Signal* signal) void Qmgr::execSET_VAR_REQ(Signal* signal)
......
...@@ -955,6 +955,46 @@ int runBug24717(NDBT_Context* ctx, NDBT_Step* step){ ...@@ -955,6 +955,46 @@ int runBug24717(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK; return NDBT_OK;
} }
/**
 * Test case for bug#25364.
 *
 * Skipped (returns NDBT_OK) when the cluster has fewer than 4 data nodes.
 * Each loop:
 *  - picks the master, a "victim" node from another node group than the
 *    master, and a "second" node from the victim's node group
 *  - sends dump 935 with the victim as argument to the master, followed by
 *    CmvmiSetRestartOnErrorInsert
 *  - restarts "second", waits for master and "second" to reach no-start,
 *    starts them again and waits until both are started
 *
 * @return NDBT_OK on success, NDBT_FAILED as soon as any restarter call fails
 */
int runBug25364(NDBT_Context* ctx, NDBT_Step* step){
  NdbRestarter restarter;
  int loops = ctx->getNumLoops();

  if (restarter.getNumDbNodes() < 4)
    return NDBT_OK;

  // NOTE(review): value 1 is presumably the restart type applied when an
  // error insert fires - confirm against the CMVMI dump handler.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };

  for (; loops; loops --)
  {
    int master = restarter.getMasterNodeId();
    int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
    int second = restarter.getRandomNodeSameNodeGroup(victim, rand());

    // Dump 935 with two words arms the QMGR error insert and records the
    // victim as the extra argument.
    int dump[] = { 935, victim } ;
    if (restarter.dumpStateOneNode(master, dump, 2))
      return NDBT_FAILED;

    if (restarter.dumpStateOneNode(master, val2, 2))
      return NDBT_FAILED;

    // NOTE(review): the flags are presumably (initial=false, nostart=true,
    // abort=true) - confirm against NdbRestarter::restartOneDbNode.
    if (restarter.restartOneDbNode(second, false, true, true))
      return NDBT_FAILED;

    int nodes[2] = { master, second };
    if (restarter.waitNodesNoStart(nodes, 2))
      return NDBT_FAILED;

    // Fail right away if the start request itself is rejected, consistent
    // with the other restarter calls, instead of only noticing it through
    // the subsequent wait.
    if (restarter.startNodes(nodes, 2))
      return NDBT_FAILED;

    if (restarter.waitNodesStarted(nodes, 2))
      return NDBT_FAILED;
  }

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
...@@ -1271,6 +1311,9 @@ TESTCASE("Bug20185", ...@@ -1271,6 +1311,9 @@ TESTCASE("Bug20185",
TESTCASE("Bug24717", ""){ TESTCASE("Bug24717", ""){
INITIALIZER(runBug24717); INITIALIZER(runBug24717);
} }
TESTCASE("Bug25364", ""){
INITIALIZER(runBug25364);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -469,6 +469,10 @@ max-time: 1000 ...@@ -469,6 +469,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug24717 T1 args: -n Bug24717 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug25364 T1
# OLD FLEX # OLD FLEX
max-time: 500 max-time: 500
cmd: flexBench cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment