Commit 70da85ee authored by unknown's avatar unknown

bug#28717, make sure only master updates activeStatus

  so that othernodes dont get confused after having recevied status from master
  and then tries to update it self


ndb/src/kernel/blocks/ERROR_codes.txt:
  error 1001, delay node_failrep
ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  error 1001, delay node_failrep
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 99691ef3
Next QMGR 1
Next NDBCNTR 1000
Next NDBCNTR 1002
Next NDBFS 2000
Next DBACC 3002
Next DBTUP 4014
......@@ -487,3 +487,8 @@ Dbdict:
6003 Crash in participant @ CreateTabReq::Prepare
6004 Crash in participant @ CreateTabReq::Commit
6005 Crash in participant @ CreateTabReq::CreateDrop
Ndbcntr:
--------
1001: Delay sending NODE_FAILREP (to own node), until error is cleared
......@@ -4448,12 +4448,18 @@ void Dbdih::failedNodeLcpHandling(Signal* signal, NodeRecordPtr failedNodePtr)
jam();
const Uint32 nodeId = failedNodePtr.i;
if (c_lcpState.m_participatingLQH.get(failedNodePtr.i)){
if (isMaster() && c_lcpState.m_participatingLQH.get(failedNodePtr.i))
{
/*----------------------------------------------------*/
/* THE NODE WAS INVOLVED IN A LOCAL CHECKPOINT. WE */
/* MUST UPDATE THE ACTIVE STATUS TO INDICATE THAT */
/* THE NODE HAVE MISSED A LOCAL CHECKPOINT. */
/*----------------------------------------------------*/
/**
* Bug#28717, Only master should do this, as this status is copied
* to other nodes
*/
switch (failedNodePtr.p->activeStatus) {
case Sysfile::NS_Active:
jam();
......
......@@ -1375,6 +1375,13 @@ void Ndbcntr::execNODE_FAILREP(Signal* signal)
{
jamEntry();
if (ERROR_INSERTED(1001))
{
sendSignalWithDelay(reference(), GSN_NODE_FAILREP, signal, 100,
signal->getLength());
return;
}
const NodeFailRep * nodeFail = (NodeFailRep *)&signal->theData[0];
NdbNodeBitmask allFailed;
allFailed.assign(NdbNodeBitmask::Size, nodeFail->theNodes);
......
......@@ -1045,6 +1045,84 @@ int runBug25554(NDBT_Context* ctx, NDBT_Step* step){
}
int
runBug28717(NDBT_Context* ctx, NDBT_Step* step)
{
int result = NDBT_OK;
int loops = ctx->getNumLoops();
int records = ctx->getNumRecords();
Ndb* pNdb = GETNDB(step);
NdbRestarter res;
if (res.getNumDbNodes() < 4)
{
return NDBT_OK;
}
int master = res.getMasterNodeId();
int node0 = res.getRandomNodeOtherNodeGroup(master, rand());
int node1 = res.getRandomNodeSameNodeGroup(node0, rand());
ndbout_c("master: %d node0: %d node1: %d", master, node0, node1);
if (res.restartOneDbNode(node0, false, true, true))
{
return NDBT_FAILED;
}
{
int filter[] = { 15, NDB_MGM_EVENT_CATEGORY_CHECKPOINT, 0 };
NdbLogEventHandle handle =
ndb_mgm_create_logevent_handle(res.handle, filter);
int dump[] = { DumpStateOrd::DihStartLcpImmediately };
struct ndb_logevent event;
for (Uint32 i = 0; i<3; i++)
{
res.dumpStateOneNode(master, dump, 1);
while(ndb_logevent_get_next(handle, &event, 0) >= 0 &&
event.type != NDB_LE_LocalCheckpointStarted);
while(ndb_logevent_get_next(handle, &event, 0) >= 0 &&
event.type != NDB_LE_LocalCheckpointCompleted);
}
}
if (res.waitNodesNoStart(&node0, 1))
return NDBT_FAILED;
int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
if (res.dumpStateOneNode(node0, val2, 2))
return NDBT_FAILED;
if (res.insertErrorInNode(node0, 5010))
return NDBT_FAILED;
if (res.insertErrorInNode(node1, 1001))
return NDBT_FAILED;
if (res.startNodes(&node0, 1))
return NDBT_FAILED;
NdbSleep_SecSleep(3);
if (res.insertErrorInNode(node1, 0))
return NDBT_FAILED;
if (res.waitNodesNoStart(&node0, 1))
return NDBT_FAILED;
if (res.startNodes(&node0, 1))
return NDBT_FAILED;
if (res.waitClusterStarted())
return NDBT_FAILED;
return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
......@@ -1366,6 +1444,9 @@ TESTCASE("Bug25364", ""){
TESTCASE("Bug25554", ""){
INITIALIZER(runBug25554);
}
TESTCASE("Bug28717", ""){
INITIALIZER(runBug28717);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -492,6 +492,10 @@ max-time: 1500
cmd: testDict
args: -n CreateAndDrop
max-time: 1000
cmd: testNodeRestart
args: -n Bug28717 T1
max-time: 1500
cmd: testDict
args: -n CreateAndDropAtRandom -l 200 T1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment