Commit 7223be24 authored by unknown's avatar unknown

ndb - bug#26457

  master failure during master take over


ndb/src/kernel/blocks/ERROR_codes.txt:
  new error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure to clear NF_XX_LCP if master fails during master take-over
ndb/test/include/NdbRestarter.hpp:
  Add support for querying next master and node group
    (for multi node failure testing)
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
ndb/test/src/NdbRestarter.cpp:
  Add support for querying next master and node group
    (for multi node failure testing)
parent 0d3877bb
......@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014
Next DBLQH 5043
Next DBDICT 6007
Next DBDIH 7178
Next DBDIH 7181
Next DBTC 8039
Next CMVMI 9000
Next BACKUP 10022
......@@ -71,6 +71,8 @@ Delay GCP_SAVEREQ by 10 secs
7177: Delay copying of sysfileData in execCOPY_GCIREQ
7180: Crash master during master-take-over in execMASTER_LCPCONF
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
......
......@@ -4612,6 +4612,8 @@ void
Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
jam();
Uint32 oldNode = c_lcpMasterTakeOverState.failedNodeId;
c_lcpMasterTakeOverState.minTableId = ~0;
c_lcpMasterTakeOverState.minFragId = ~0;
c_lcpMasterTakeOverState.failedNodeId = nodeId;
......@@ -4630,7 +4632,20 @@ Dbdih::startLcpMasterTakeOver(Signal* signal, Uint32 nodeId){
/**
* Node failure during master take over...
*/
ndbout_c("Nodefail during master take over");
ndbout_c("Nodefail during master take over (old: %d)", oldNode);
}
NodeRecordPtr nodePtr;
nodePtr.i = oldNode;
if (oldNode > 0 && oldNode < MAX_NDB_NODES)
{
jam();
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (nodePtr.p->m_nodefailSteps.get(NF_LCP_TAKE_OVER))
{
jam();
checkLocalNodefailComplete(signal, oldNode, NF_LCP_TAKE_OVER);
}
}
setLocalNodefailHandling(signal, nodeId, NF_LCP_TAKE_OVER);
......@@ -5646,6 +5661,14 @@ void Dbdih::execMASTER_LCPREQ(Signal* signal)
jamEntry();
const BlockReference newMasterBlockref = req->masterRef;
if (newMasterBlockref != cmasterdihref)
{
jam();
ndbout_c("resending GSN_MASTER_LCPREQ");
sendSignalWithDelay(reference(), GSN_MASTER_LCPREQ, signal,
signal->getLength(), 50);
return;
}
Uint32 failedNodeId = req->failedNodeId;
/**
......@@ -5946,6 +5969,8 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
nodePtr.p->lcpStateAtTakeOver = lcpState;
CRASH_INSERTION(7180);
#ifdef VM_TRACE
ndbout_c("MASTER_LCPCONF");
printMASTER_LCP_CONF(stdout, &signal->theData[0], 0, 0);
......
......@@ -62,6 +62,8 @@ public:
int dumpStateAllNodes(int * _args, int _num_args);
int getMasterNodeId();
int getNextMasterNodeId(int nodeId);
int getNodeGroup(int nodeId);
int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
int getRandomNotMasterNodeId(int randomNumber);
......
......@@ -1045,6 +1045,45 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK;
}
int
runBug26457(NDBT_Context* ctx, NDBT_Step* step)
{
NdbRestarter res;
if (res.getNumDbNodes() < 4)
return NDBT_OK;
int loops = ctx->getNumLoops();
while (loops --)
{
retry:
int master = res.getMasterNodeId();
int next = res.getNextMasterNodeId(master);
ndbout_c("master: %d next: %d", master, next);
if (res.getNodeGroup(master) == res.getNodeGroup(next))
{
res.restartOneDbNode(next, false, false, true);
if (res.waitClusterStarted())
return NDBT_FAILED;
goto retry;
}
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 2 };
if (res.dumpStateOneNode(next, val2, 2))
return NDBT_FAILED;
if (res.insertErrorInNode(next, 7180))
return NDBT_FAILED;
res.restartOneDbNode(master, false, false, true);
if (res.waitClusterStarted())
return NDBT_FAILED;
}
return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -1367,6 +1406,9 @@ TESTCASE("Bug25364", ""){
TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554);
}
TESTCASE("Bug26457", ""){
INITIALIZER(runBug26457);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -477,6 +477,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug25554 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug26457 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
......@@ -128,6 +128,68 @@ NdbRestarter::getMasterNodeId(){
return node;
}
int
NdbRestarter::getNodeGroup(int nodeId){
if (!isConnected())
return -1;
if (getStatus() != 0)
return -1;
for(size_t i = 0; i < ndbNodes.size(); i++)
{
if(ndbNodes[i].node_id == nodeId)
{
return ndbNodes[i].node_group;
}
}
return -1;
}
int
NdbRestarter::getNextMasterNodeId(int nodeId){
if (!isConnected())
return -1;
if (getStatus() != 0)
return -1;
size_t i;
for(i = 0; i < ndbNodes.size(); i++)
{
if(ndbNodes[i].node_id == nodeId)
{
break;
}
}
assert(i < ndbNodes.size());
if (i == ndbNodes.size())
return -1;
int dynid = ndbNodes[i].dynamic_id;
int minid = dynid;
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id > minid)
minid = ndbNodes[i].dynamic_id;
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id > dynid &&
ndbNodes[i].dynamic_id < minid)
{
minid = ndbNodes[i].dynamic_id;
}
if (minid != ~0)
{
for (i = 0; i<ndbNodes.size(); i++)
if (ndbNodes[i].dynamic_id == minid)
return ndbNodes[i].node_id;
}
return getMasterNodeId();
}
int
NdbRestarter::getRandomNotMasterNodeId(int rand){
int master = getMasterNodeId();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment