Commit a29d7c5b authored by unknown's avatar unknown

ndb - bug#25984 - more than 7 failed node restarts can cause cluster failure

new behaviour is as follows:
1) the node is refused permission to start, and should fail with a message in the error log stating that it must be restarted with --initial
2) if the cluster fails in this situation, the node must also be restarted with --initial;
   if it is not, SR will fail with this same message


storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Prevent a node from starting _at all_ if it has performed more than 6 failed
    node restarts.
storage/ndb/test/ndbapi/testNodeRestart.cpp:
  test prg 25984
storage/ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 4767f391
......@@ -1525,10 +1525,26 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
*/
SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
warningEvent("Making filesystem for node %d unusable",
warningEvent("Making filesystem for node %d unusable (need --initial)",
nodePtr.i);
}
else if (nodePtr.p->nodeStatus == NodeRecord::ALIVE &&
SYSFILE->lastCompletedGCI[nodePtr.i] == 0)
{
jam();
CRASH_INSERTION(7170);
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
"Cluster requires this node to be started "
" with --initial as partial start has been performed"
" and this filesystem is unusable");
progError(__LINE__,
NDBD_EXIT_SR_RESTARTCONFLICT,
buf);
ndbrequire(false);
}
}
/**
* This set which GCI we will try to restart to
*/
......@@ -12515,14 +12531,23 @@ void Dbdih::newCrashedReplica(Uint32 nodeId, ReplicaRecordPtr ncrReplicaPtr)
/* THAT THE NEW REPLICA IS NOT STARTED YET AND REPLICA_LAST_GCI IS*/
/* SET TO -1 TO INDICATE THAT IT IS NOT DEAD YET. */
/*----------------------------------------------------------------------*/
Uint32 lastGCI = SYSFILE->lastCompletedGCI[nodeId];
arrGuardErr(ncrReplicaPtr.p->noCrashedReplicas + 1, 8,
NDBD_EXIT_MAX_CRASHED_REPLICAS);
ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
SYSFILE->lastCompletedGCI[nodeId];
lastGCI;
ncrReplicaPtr.p->noCrashedReplicas = ncrReplicaPtr.p->noCrashedReplicas + 1;
ncrReplicaPtr.p->createGci[ncrReplicaPtr.p->noCrashedReplicas] = 0;
ncrReplicaPtr.p->replicaLastGci[ncrReplicaPtr.p->noCrashedReplicas] =
(Uint32)-1;
if (ncrReplicaPtr.p->noCrashedReplicas == 7 && lastGCI)
{
jam();
SYSFILE->lastCompletedGCI[nodeId] = 0;
warningEvent("Making filesystem for node %d unusable (need --initial)",
nodeId);
}
}//Dbdih::newCrashedReplica()
/*************************************************************************/
......
......@@ -1178,6 +1178,101 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
return NDBT_OK;
}
/**
 * Regression test for bug#25984: a data node that fails node restart
 * repeatedly must be refused on its next start until it is restarted
 * with --initial.
 *
 * Outline:
 *  1. Restart the whole cluster and wait until it is started.
 *  2. Pick a victim node (preferring one outside the master's node group)
 *     and make it fail its restart six times using error insert 7016.
 *  3. Start the victim once more with error insert 7170 armed and expect it
 *     to end up in "no start" state again.
 *  4. Restart all nodes, arm error inserts, attempt to start the cluster and
 *     expect every node to fall back to "no start".
 *  5. Restart the victim, start all nodes and require the cluster to come up.
 *
 * @param ctx   test context (not used by this test)
 * @param step  test step (not used by this test)
 * @return NDBT_OK on success, or when fewer than two data nodes exist;
 *         NDBT_FAILED as soon as any restarter operation reports an error.
 */
int runBug25984(NDBT_Context* ctx, NDBT_Step* step){
  // Neither argument is needed; silence unused-parameter warnings.
  (void)ctx;
  (void)step;

  NdbRestarter restarter;

  // The scenario needs a surviving node besides the victim.
  if (restarter.getNumDbNodes() < 2)
    return NDBT_OK;

  // NOTE(review): flags are presumably (initial, nostart, abort) - confirm
  // against NdbRestarter.
  if (restarter.restartAll(true, true, true))
    return NDBT_FAILED;

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  // Dump command applied before each error insert below.
  // NOTE(review): presumably makes a node restart (instead of exiting) when
  // an error insert fires - confirm.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };

  int master = restarter.getMasterNodeId();

  // Prefer a victim in another node group; fall back to the master's group.
  int victim = restarter.getRandomNodeOtherNodeGroup(master, rand());
  if (victim == -1)
    victim = restarter.getRandomNodeSameNodeGroup(master, rand());

  // No candidate at all: fail explicitly instead of passing -1 onwards.
  if (victim == -1)
    return NDBT_FAILED;

  // Same check as for the restartOneDbNode() call near the end of the test.
  if (restarter.restartOneDbNode(victim, false, true, true))
    return NDBT_FAILED;

  // Six failed node restarts of the victim.
  for (Uint32 i = 0; i<6; i++)
  {
    ndbout_c("Loop: %u", i);
    if (restarter.waitNodesNoStart(&victim, 1))
      return NDBT_FAILED;

    if (restarter.dumpStateOneNode(victim, val2, 2))
      return NDBT_FAILED;

    // NOTE(review): 7016 is assumed to abort the node restart part-way -
    // confirm against the kernel's error-insert list.
    if (restarter.insertErrorInNode(victim, 7016))
      return NDBT_FAILED;

    if (restarter.startNodes(&victim, 1))
      return NDBT_FAILED;

    if (restarter.waitNodesStartPhase(&victim, 1, 2))
      return NDBT_FAILED;
  }

  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  if (restarter.dumpStateOneNode(victim, val2, 2))
    return NDBT_FAILED;

  // 7170 is the CRASH_INSERTION placed in Dbdih::ndbStartReqLab on the
  // "requires --initial" path added by this change.
  if (restarter.insertErrorInNode(victim, 7170))
    return NDBT_FAILED;

  if (restarter.startNodes(&victim, 1))
    return NDBT_FAILED;

  // The victim is expected to fall back to "no start".
  if (restarter.waitNodesNoStart(&victim, 1))
    return NDBT_FAILED;

  // Cluster-failure part of the scenario.
  // NOTE(review): unlike the first restartAll() above, there is no
  // waitClusterNoStart() before the error inserts here - confirm that this
  // is intentional.
  if (restarter.restartAll(false, true, true))
    return NDBT_FAILED;

  if (restarter.insertErrorInAllNodes(932))
    return NDBT_FAILED;

  if (restarter.insertErrorInNode(master, 7170))
    return NDBT_FAILED;

  if (restarter.dumpStateAllNodes(val2, 2))
    return NDBT_FAILED;

  // Results deliberately ignored, as in the original: the start attempt is
  // expected to end with all nodes back in "no start" state.
  restarter.startNodes(&master, 1);
  NdbSleep_MilliSleep(3000);
  restarter.startAll();

  if (restarter.waitClusterNoStart())
    return NDBT_FAILED;

  // Restart the victim (first flag set this time), after which the whole
  // cluster must be able to start.
  if (restarter.restartOneDbNode(victim, true, true, true))
    return NDBT_FAILED;

  if (restarter.startAll())
    return NDBT_FAILED;

  if (restarter.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
......@@ -1514,6 +1609,9 @@ TESTCASE("Bug25468", ""){
/* Test case "Bug25554" (empty description); runBug25554 is its initializer. */
TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554);
}
/* Test case "Bug25984" (empty description); runBug25984 is its initializer.
 * Selected on the command line as "testNodeRestart -n Bug25984". */
TESTCASE("Bug25984", ""){
INITIALIZER(runBug25984);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -525,6 +525,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug25554 T1
max-time: 1000
cmd: testNodeRestart
args: -n Bug25984
#
# DICT TESTS
max-time: 1500
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment