Commit 327cd42a authored by unknown

ndb - bug#32160 (recommit to 5.0)

  Fix LCP master take-over bug


ndb/src/kernel/blocks/ERROR_codes.txt:
  new error codes
ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
  add debug code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  fix master lcp bug
  add 2 new error codes
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 81b5dbe2
...@@ -5,7 +5,7 @@ Next DBACC 3002 ...@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014 Next DBTUP 4014
Next DBLQH 5043 Next DBLQH 5043
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7183 Next DBDIH 7195
Next DBTC 8052 Next DBTC 8052
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10022 Next BACKUP 10022
...@@ -73,6 +73,11 @@ Delay GCP_SAVEREQ by 10 secs ...@@ -73,6 +73,11 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF 7180: Crash master during master-take-over in execMASTER_LCPCONF
7193: Don't send LCP_FRAG_ORD to self, and crash when sending the first
LCP_FRAG_ORD(last)
7194: Force removeNodeFromStored to complete in the middle of MASTER_LCPCONF
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -1291,7 +1291,17 @@ private: ...@@ -1291,7 +1291,17 @@ private:
LcpStatus lcpStatus; LcpStatus lcpStatus;
Uint32 lcpStatusUpdatedPlace; Uint32 lcpStatusUpdatedPlace;
/**
 * One entry of the LCP status history kept for debugging: an earlier
 * lcpStatus value together with the lcpStatusUpdatedPlace that went with it.
 */
struct Save {
  LcpStatus m_status;  // earlier value of lcpStatus
  Uint32 m_place;      // lcpStatusUpdatedPlace recorded with that status
};
// History of the ten most recent earlier states; setLcpStatus() stores the
// state being replaced at index 0 and shifts older entries towards index 9.
Save m_saveState[10];
/**
 * Switch to a new LCP status and remember where the switch was made.
 *
 * Before the switch, the current (status, place) pair is pushed onto the
 * front of m_saveState; the entry at index 9 is discarded.
 *
 * @param status  new value for lcpStatus
 * @param line    value stored in lcpStatusUpdatedPlace (callers pass __LINE__)
 */
void setLcpStatus(LcpStatus status, Uint32 line){
  // Age the history by one slot, starting from the oldest entry.
  Uint32 slot = 9;
  while (slot != 0)
  {
    m_saveState[slot] = m_saveState[slot - 1];
    slot--;
  }

  // Slot 0 keeps the state that is about to be overwritten.
  m_saveState[0].m_place = lcpStatusUpdatedPlace;
  m_saveState[0].m_status = lcpStatus;

  lcpStatusUpdatedPlace = line;
  lcpStatus = status;
}
......
...@@ -4764,11 +4764,19 @@ void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr) ...@@ -4764,11 +4764,19 @@ void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr)
} }
jam(); jam();
signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
signal->theData[1] = failedNodePtr.i; if (!ERROR_INSERTED(7194))
signal->theData[2] = 0; // Tab id {
sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB); signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
signal->theData[1] = failedNodePtr.i;
signal->theData[2] = 0; // Tab id
sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}
else
{
ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE");
}
setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE); setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE);
}//Dbdih::startRemoveFailedNode() }//Dbdih::startRemoveFailedNode()
...@@ -5676,12 +5684,22 @@ Dbdih::checkEmptyLcpComplete(Signal *signal){ ...@@ -5676,12 +5684,22 @@ Dbdih::checkEmptyLcpComplete(Signal *signal){
signal->theData[0] = 7012; signal->theData[0] = 7012;
execDUMP_STATE_ORD(signal); execDUMP_STATE_ORD(signal);
if (ERROR_INSERTED(7194))
{
ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE");
signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId;
signal->theData[2] = 0; // Tab id
sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}
c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__); c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__);
MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0]; MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
req->masterRef = reference(); req->masterRef = reference();
req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId; req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId;
sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ); sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ);
} else { } else {
sendMASTER_LCPCONF(signal); sendMASTER_LCPCONF(signal);
} }
...@@ -5998,6 +6016,15 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal) ...@@ -5998,6 +6016,15 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
{ {
const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0]; const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
jamEntry(); jamEntry();
if (ERROR_INSERTED(7194))
{
ndbout_c("delaying MASTER_LCPCONF due to error 7194");
sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
300, signal->getLength());
return;
}
Uint32 senderNodeId = conf->senderNodeId; Uint32 senderNodeId = conf->senderNodeId;
MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState; MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState;
const Uint32 failedNodeId = conf->failedNodeId; const Uint32 failedNodeId = conf->failedNodeId;
...@@ -6132,7 +6159,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId) ...@@ -6132,7 +6159,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
#endif #endif
c_lcpState.keepGci = SYSFILE->keepGCI; c_lcpState.keepGci = SYSFILE->keepGCI;
c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__);
startLcpRoundLoopLab(signal, 0, 0); startLcpRoundLoopLab(signal, 0, 0);
break; break;
} }
...@@ -9924,6 +9950,8 @@ void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal) ...@@ -9924,6 +9950,8 @@ void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal)
if(ERROR_INSERTED(7075)){ if(ERROR_INSERTED(7075)){
continue; continue;
} }
CRASH_INSERTION(7193);
BlockReference ref = calcLqhBlockRef(nodePtr.i); BlockReference ref = calcLqhBlockRef(nodePtr.i);
sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB); sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB);
} }
...@@ -10121,6 +10149,13 @@ Dbdih::checkLcpAllTablesDoneInLqh(){ ...@@ -10121,6 +10149,13 @@ Dbdih::checkLcpAllTablesDoneInLqh(){
CRASH_INSERTION2(7017, !isMaster()); CRASH_INSERTION2(7017, !isMaster());
c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__); c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__);
if (ERROR_INSERTED(7194))
{
ndbout_c("CLEARING 7194");
CLEAR_ERROR_INSERT_VALUE;
}
return true; return true;
} }
...@@ -10276,6 +10311,11 @@ Dbdih::sendLCP_FRAG_ORD(Signal* signal, ...@@ -10276,6 +10311,11 @@ Dbdih::sendLCP_FRAG_ORD(Signal* signal,
BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode); BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode);
if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId())
{
return;
}
LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0]; LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
lcpFragOrd->tableId = info.tableId; lcpFragOrd->tableId = info.tableId;
lcpFragOrd->fragmentId = info.fragId; lcpFragOrd->fragmentId = info.fragId;
...@@ -13686,6 +13726,14 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) ...@@ -13686,6 +13726,14 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
("immediateLcpStart = %d masterLcpNodeId = %d", ("immediateLcpStart = %d masterLcpNodeId = %d",
c_lcpState.immediateLcpStart, c_lcpState.immediateLcpStart,
refToNode(c_lcpState.m_masterLcpDihRef)); refToNode(c_lcpState.m_masterLcpDihRef));
for (Uint32 i = 0; i<10; i++)
{
infoEvent("%u : status: %u place: %u", i,
c_lcpState.m_saveState[i].m_status,
c_lcpState.m_saveState[i].m_place);
}
infoEvent("-- Node %d LCP STATE --", getOwnNodeId()); infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
} }
......
...@@ -1347,6 +1347,51 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step) ...@@ -1347,6 +1347,51 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step)
return NDBT_OK; return NDBT_OK;
} }
/**
 * Test case for bug#32160 (LCP master take-over).
 *
 * Scenario:
 *  1. Insert error 7194 in the node that is next in line to become master
 *     (forces removeNodeFromStored to complete in the middle of
 *     MASTER_LCPCONF handling).
 *  2. Insert error 7193 in the current master (it does not send LCP_FRAG_ORD
 *     to itself and crashes when sending the first LCP_FRAG_ORD(last)).
 *  3. Send dump code 7099 to the master.
 *     NOTE(review): presumably this requests an LCP so that 7193 fires --
 *     confirm against the DUMP code list.
 *  4. Wait for the master to reach the no-start state, start it again and
 *     wait until the whole cluster is started.
 *
 * @return NDBT_OK on success, or when the cluster has fewer than two data
 *         nodes; NDBT_FAILED as soon as any restarter call reports an error.
 */
int
runBug32160(NDBT_Context* ctx, NDBT_Step* step)
{
  NdbRestarter res;

  // The scenario needs both a master and a successor node.
  if (res.getNumDbNodes() < 2)
  {
    return NDBT_OK;
  }

  int master = res.getMasterNodeId();
  int next = res.getNextMasterNodeId(master);

  if (res.insertErrorInNode(next, 7194))
  {
    return NDBT_FAILED;
  }

  // NOTE(review): the dump code name suggests the master restarts on an
  // error insert instead of only dying; the value 1 presumably selects
  // no-start, matching the waitNodesNoStart() call below -- confirm.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  if (res.dumpStateOneNode(master, val2, 2))
  {
    return NDBT_FAILED;
  }

  if (res.insertErrorInNode(master, 7193))
  {
    return NDBT_FAILED;
  }

  int val3[] = { 7099 };
  if (res.dumpStateOneNode(master, val3, 1))
  {
    return NDBT_FAILED;
  }

  if (res.waitNodesNoStart(&master, 1))
  {
    return NDBT_FAILED;
  }

  if (res.startNodes(&master, 1))
  {
    return NDBT_FAILED;
  }

  if (res.waitClusterStarted())
  {
    return NDBT_FAILED;
  }

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart); NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad", TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\ "Test that one node at a time can be stopped and then restarted "\
...@@ -1686,6 +1731,9 @@ TESTCASE("Bug28717", ""){ ...@@ -1686,6 +1731,9 @@ TESTCASE("Bug28717", ""){
TESTCASE("Bug29364", ""){ TESTCASE("Bug29364", ""){
INITIALIZER(runBug29364); INITIALIZER(runBug29364);
} }
// Regression test for bug#32160; runBug32160 is registered as its only
// initializer step.
TESTCASE("Bug32160", ""){
INITIALIZER(runBug32160);
}
NDBT_TESTSUITE_END(testNodeRestart); NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){ int main(int argc, const char** argv){
......
...@@ -497,6 +497,10 @@ max-time: 1000 ...@@ -497,6 +497,10 @@ max-time: 1000
cmd: testNodeRestart cmd: testNodeRestart
args: -n Bug26481 T1 args: -n Bug26481 T1
max-time: 300
cmd: testNodeRestart
args: -n Bug32160 T1
# OLD FLEX # OLD FLEX
max-time: 500 max-time: 500
cmd: flexBench cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment