Commit 7223be24 authored by unknown's avatar unknown

ndb - bug#26457

  master failure during master take over


ndb/src/kernel/blocks/ERROR_codes.txt:
  new error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure to clear NF_XX_LCP if master fails during master take-over
ndb/test/include/NdbRestarter.hpp:
  Add support for querying next master and node group
    (for multi node failure testing)
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
ndb/test/src/NdbRestarter.cpp:
  Add support for querying next master and node group
    (for multi node failure testing)
parent 0d3877bb
...@@ -5,7 +5,7 @@ Next DBACC 3002 ...@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014 Next DBTUP 4014
Next DBLQH 5043 Next DBLQH 5043
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7178 Next DBDIH 7181
Next DBTC 8039 Next DBTC 8039
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10022 Next BACKUP 10022
...@@ -71,6 +71,8 @@ Delay GCP_SAVEREQ by 10 secs ...@@ -71,6 +71,8 @@ Delay GCP_SAVEREQ by 10 secs
7177: Delay copying of sysfileData in execCOPY_GCIREQ 7177: Delay copying of sysfileData in execCOPY_GCIREQ
7180: Crash master during master-take-over in execMASTER_LCPCONF
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -4612,6 +4612,8 @@ void ...@@ -4612,6 +4612,8 @@ void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
jam(); jam();
Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;
c_lcpMasterTakeOverState.minTableId = ~0; c_lcpMasterTakeOverState.minTableId = ~0;
c_lcpMasterTakeOverState.minFragId = ~0; c_lcpMasterTakeOverState.minFragId = ~0;
c_lcpMasterTakeOverState.failedNodeId = nodeId; c_lcpMasterTakeOverState.failedNodeId = nodeId;
...@@ -4630,7 +4632,20 @@ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){ ...@@ -4630,7 +4632,20 @@ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
/** /**
* Node failure during master take over... * Node failure during master take over...
*/ */
ndbout_c("Nodefail during master take over"); ndbout_c("Nodefail during master take over (old: %d)", oldNode);
}
NodeRecordPtr nodePtr;
nodePtr.i = oldNode;
if (oldNode > 0 && oldNode < MAX_NDB_NODES)
{
jam();
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
{
jam();
checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
}
} }
setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER); setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
...@@ -5646,6 +5661,14 @@ void Dbdih::execMASTER_LCPREQ(Signal* signal) ...@@ -5646,6 +5661,14 @@ void Dbdih::execMASTER_LCPREQ(Signal* signal)
jamEntry(); jamEntry();
const BlockReference newMasterBlockref = req->masterRef; const BlockReference newMasterBlockref = req->masterRef;
if (newMasterBlockref != cmasterdihref)
{
jam();
ndbout_c("resending GSN_MASTER_LCPREQ");
sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
signal->getLength(), 50);
return;
}
Uint32 failedNodeId = req->failedNodeId; Uint32 failedNodeId = req->failedNodeId;
/** /**
...@@ -5946,6 +5969,8 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal) ...@@ -5946,6 +5969,8 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
nodePtr.p->lcpStateAtTakeOver = lcpState; nodePtr.p->lcpStateAtTakeOver = lcpState;
CRASH_INSERTION(7180);
#ifdef VM_TRACE #ifdef VM_TRACE
ndbout_c("MASTER_LCPCONF"); ndbout_c("MASTER_LCPCONF");
printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0); printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);
......
...@@ -62,6 +62,8 @@ public: ...@@ -62,6 +62,8 @@ public:
int dumpStateAllNodes(int * _args, int _num_args); int dumpStateAllNodes(int * _args, int _num_args);
int getMasterNodeId(); int getMasterNodeId();
int getNextMasterNodeId(int nodeId);
int getNodeGroup(int nodeId);
int getRandomNodeSameNodeGroup(int nodeId, int randomNumber); int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber); int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
int getRandomNotMasterNodeId(int randomNumber); int getRandomNotMasterNodeId(int randomNumber);
......
...@@ -1045,6 +1045,45 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){ ...@@ -1045,6 +1045,45 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK; return NDBT_OK;
} }
int
runBug26457(NDBT_Context* ctx, NDBT_Step* step)
{
NdbRestarter res;
if (res.getNumDbNodes() < 4)
return NDBT_OK;
int loops = ctx->getNumLoops();
while (loops --)
{
retry:
int master = res.getMasterNodeId();
int next = res.getNextMasterNodeId(master);
ndbout_c("master: %d next: %d", master, next);
if (res.getNodeGroup(master) == res.getNodeGroup(next))
{
res.restartOneDbNode(next, false, false, true);
if (res.waitClusterStarted())
return NDBT_FAILED;
goto retry;
}
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 2 };
if (res.dumpStateOneNode(next, val2, 2))
return NDBT_FAILED;
if (res.insertErrorInNode(next, 7180))
return NDBT_FAILED;
res.restartOneDbNode(master, false, false, true);
if (res.waitClusterStarted())
return NDBT_FAILED;
}
return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
...@@ -1367,6 +1406,9 @@ TESTCASE("Bug25364", ""){ ...@@ -1367,6 +1406,9 @@ TESTCASE("Bug25364", ""){
TESTCASE("Bug25554", ""){ TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554); INITIALIZER(runBug25554);
} }
TESTCASE("Bug26457", ""){
INITIALIZER(runBug26457);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -477,6 +477,10 @@ max-time: 1000 ...@@ -477,6 +477,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug25554 T1 args: -n Bug25554 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug26457 T1
# OLD FLEX # OLD FLEX
max-time: 500 max-time: 500
cmd: flexBench cmd: flexBench
......
...@@ -128,6 +128,68 @@ NdbRestarter::getMasterNodeId(){ ...@@ -128,6 +128,68 @@ NdbRestarter::getMasterNodeId(){
return node; return node;
} }
int
NdbRestarter::getNodeGroup(int nodeId){
if (!isConnected())
return -1;
if (getStatus() != 0)
return -1;
for(size_t i = 0; i < ndbNodes.size(); i++)
{
if(ndbNodes[i].node_id == nodeId)
{
return ndbNodes[i].node_group;
}
}
return -1;
}
int
NdbRestarter::getNextMasterNodeId(int nodeId){
if (!isConnected())
return -1;
if (getStatus() != 0)
return -1;
size_t i;
for(i = 0; i < ndbNodes.size(); i++)
{
if(ndbNodes[i].node_id == nodeId)
{
break;
}
}
assert(i < ndbNodes.size());
if (i == ndbNodes.size())
return -1;
int dynid = ndbNodes[i].dynamic_id;
int minid = dynid;
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id > minid)
minid = ndbNodes[i].dynamic_id;
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id > dynid &&
ndbNodes[i].dynamic_id < minid)
{
minid = ndbNodes[i].dynamic_id;
}
if (minid != ~0)
{
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id == minid)
return ndbNodes[i].node_id;
}
return getMasterNodeId();
}
int int
NdbRestarter::getRandomNotMasterNodeId(int rand){ NdbRestarter::getRandomNotMasterNodeId(int rand){
int master = getMasterNodeId(); int master = getMasterNodeId();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment