Commit 327cd42a authored by unknown's avatar unknown

ndb - bug#32160 (recommit to 5.0)

  fix lcp master take over bug


ndb/src/kernel/blocks/ERROR_codes.txt:
  new error codes
ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
  add debug code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  fix master lcp bug
  add 2 new error codes
ndb/test/ndbapi/testNodeRestart.cpp:
  testcase
ndb/test/run-test/daily-basic-tests.txt:
  testcase
parent 81b5dbe2
......@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4014
Next DBLQH 5043
Next DBDICT 6007
Next DBDIH 7183
Next DBDIH 7195
Next DBTC 8052
Next CMVMI 9000
Next BACKUP 10022
......@@ -73,6 +73,11 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF
7193: Don't send LCP_FRAG_ORD to self, and crash when sending first
LCP_FRAG_ORD(last)
7194: Force removeNodeFromStored to complete in the middle of MASTER_LCPCONF
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
......
......@@ -1291,7 +1291,17 @@ private:
LcpStatus lcpStatus;
Uint32 lcpStatusUpdatedPlace;
// Debug history of earlier (status, place) pairs. setLcpStatus() stores the
// pair it is about to overwrite in slot 0 and shifts older entries towards
// slot 9, so slot 0 is always the most recent previous state.
struct Save {
LcpStatus m_status; // an earlier value of lcpStatus
Uint32 m_place; // the lcpStatusUpdatedPlace value that accompanied m_status
} m_saveState[10];
/**
 * Switch to a new LCP status and remember where the switch happened.
 *
 * Before the switch, the current (lcpStatus, lcpStatusUpdatedPlace) pair is
 * pushed onto the front of m_saveState; the entry in the last slot is
 * discarded.
 *
 * @param status  the status to switch to
 * @param line    identifies the call site (callers pass __LINE__)
 */
void setLcpStatus(LcpStatus status, Uint32 line){
  // Slot count taken from the array itself rather than repeated as a literal.
  const Uint32 slots = sizeof(m_saveState) / sizeof(m_saveState[0]);

  // Age the history: walk from the back so each entry is copied before it
  // is overwritten; the oldest entry falls off the end.
  Uint32 dst = slots - 1;
  while (dst != 0)
  {
    m_saveState[dst] = m_saveState[dst - 1];
    --dst;
  }

  // The front slot receives the state that is about to be replaced.
  m_saveState[0].m_place = lcpStatusUpdatedPlace;
  m_saveState[0].m_status = lcpStatus;

  lcpStatusUpdatedPlace = line;
  lcpStatus = status;
}
......
......@@ -4764,10 +4764,18 @@ void Dbdih::startRemoveFailedNode(Signal* signal, NodeRecordPtr failedNodePtr)
}
jam();
if (!ERROR_INSERTED(7194))
{
signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
signal->theData[1] = failedNodePtr.i;
signal->theData[2] = 0; // Tab id
sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}
else
{
ndbout_c("7194 Not starting ZREMOVE_NODE_FROM_TABLE");
}
setLocalNodefailHandling(signal, failedNodePtr.i, NF_REMOVE_NODE_FROM_TABLE);
}//Dbdih::startRemoveFailedNode()
......@@ -5677,11 +5685,21 @@ Dbdih::checkEmptyLcpComplete(Signal *signal){
signal->theData[0] = 7012;
execDUMP_STATE_ORD(signal);
if (ERROR_INSERTED(7194))
{
ndbout_c("7194 starting ZREMOVE_NODE_FROM_TABLE");
signal->theData[0] = DihContinueB::ZREMOVE_NODE_FROM_TABLE;
signal->theData[1] = c_lcpMasterTakeOverState.failedNodeId;
signal->theData[2] = 0; // Tab id
sendSignal(reference(), GSN_CONTINUEB, signal, 3, JBB);
}
c_lcpMasterTakeOverState.set(LMTOS_INITIAL, __LINE__);
MasterLCPReq * const req = (MasterLCPReq *)&signal->theData[0];
req->masterRef = reference();
req->failedNodeId = c_lcpMasterTakeOverState.failedNodeId;
sendLoopMacro(MASTER_LCPREQ, sendMASTER_LCPREQ);
} else {
sendMASTER_LCPCONF(signal);
}
......@@ -5998,6 +6016,15 @@ void Dbdih::execMASTER_LCPCONF(Signal* signal)
{
const MasterLCPConf * const conf = (MasterLCPConf *)&signal->theData[0];
jamEntry();
if (ERROR_INSERTED(7194))
{
ndbout_c("delaying MASTER_LCPCONF due to error 7194");
sendSignalWithDelay(reference(), GSN_MASTER_LCPCONF, signal,
300, signal->getLength());
return;
}
Uint32 senderNodeId = conf->senderNodeId;
MasterLCPConf::State lcpState = (MasterLCPConf::State)conf->lcpState;
const Uint32 failedNodeId = conf->failedNodeId;
......@@ -6132,7 +6159,6 @@ void Dbdih::MASTER_LCPhandling(Signal* signal, Uint32 failedNodeId)
#endif
c_lcpState.keepGci = SYSFILE->keepGCI;
c_lcpState.setLcpStatus(LCP_START_LCP_ROUND, __LINE__);
startLcpRoundLoopLab(signal, 0, 0);
break;
}
......@@ -9924,6 +9950,8 @@ void Dbdih::sendLastLCP_FRAG_ORD(Signal* signal)
if(ERROR_INSERTED(7075)){
continue;
}
CRASH_INSERTION(7193);
BlockReference ref = calcLqhBlockRef(nodePtr.i);
sendSignal(ref, GSN_LCP_FRAG_ORD, signal,LcpFragOrd::SignalLength, JBB);
}
......@@ -10121,6 +10149,13 @@ Dbdih::checkLcpAllTablesDoneInLqh(){
CRASH_INSERTION2(7017, !isMaster());
c_lcpState.setLcpStatus(LCP_TAB_COMPLETED, __LINE__);
if (ERROR_INSERTED(7194))
{
ndbout_c("CLEARING 7194");
CLEAR_ERROR_INSERT_VALUE;
}
return true;
}
......@@ -10276,6 +10311,11 @@ Dbdih::sendLCP_FRAG_ORD(Signal* signal,
BlockReference ref = calcLqhBlockRef(replicaPtr.p->procNode);
if (ERROR_INSERTED(7193) && replicaPtr.p->procNode == getOwnNodeId())
{
return;
}
LcpFragOrd * const lcpFragOrd = (LcpFragOrd *)&signal->theData[0];
lcpFragOrd->tableId = info.tableId;
lcpFragOrd->fragmentId = info.fragId;
......@@ -13686,6 +13726,14 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
("immediateLcpStart = %d masterLcpNodeId = %d",
c_lcpState.immediateLcpStart,
refToNode(c_lcpState.m_masterLcpDihRef));
for (Uint32 i = 0; i<10; i++)
{
infoEvent("%u : status: %u place: %u", i,
c_lcpState.m_saveState[i].m_status,
c_lcpState.m_saveState[i].m_place);
}
infoEvent("-- Node %d LCP STATE --", getOwnNodeId());
}
......
......@@ -1347,6 +1347,51 @@ runBug28717(NDBT_Context* ctx, NDBT_Step* step)
return NDBT_OK;
}
/**
 * Regression test for bug#32160 (LCP master take-over).
 *
 * Scenario:
 *  1. Insert error 7194 in the node that is next in line to become master
 *     ("Force removeNodeFromStored to complete in the middle of
 *     MASTER_LCPCONF").
 *  2. Insert error 7193 in the current master ("Don't send LCP_FRAG_ORD to
 *     self, and crash when sending first LCP_FRAG_ORD(last)").
 *  3. Send DUMP 7099 to the master and wait for it to go down.
 *  4. Restart the old master and wait for the whole cluster to come back.
 *
 * The test is skipped (reported as OK) on clusters with fewer than two data
 * nodes.
 *
 * @param ctx   test context (not used by this test)
 * @param step  test step (not used by this test)
 * @return NDBT_OK on success or skip, NDBT_FAILED if any restarter
 *         operation reports an error.
 */
int
runBug32160(NDBT_Context* ctx, NDBT_Step* step)
{
  // Neither argument is needed: the test only drives nodes via NdbRestarter.
  (void)ctx;
  (void)step;

  NdbRestarter res;

  if (res.getNumDbNodes() < 2)
  {
    return NDBT_OK;
  }

  int master = res.getMasterNodeId();
  int next = res.getNextMasterNodeId(master);

  // Arm the future master first, so the error is in place when it takes over.
  if (res.insertErrorInNode(next, 7194))
  {
    return NDBT_FAILED;
  }

  // NOTE(review): presumably makes the master stop in "no start" state
  // instead of exiting when the error insert crashes it; the
  // waitNodesNoStart() call below relies on that -- confirm.
  int val2[] = { DumpStateOrd::CmvmiSetRestartOnErrorInsert, 1 };
  if (res.dumpStateOneNode(master, val2, 2))
    return NDBT_FAILED;

  if (res.insertErrorInNode(master, 7193))
    return NDBT_FAILED;

  // NOTE(review): DUMP 7099 presumably forces an LCP to start right away,
  // which is what makes error 7193 fire -- confirm against the DUMP codes.
  int val3[] = { 7099 };
  if (res.dumpStateOneNode(master, val3, 1))
    return NDBT_FAILED;

  if (res.waitNodesNoStart(&master, 1))
    return NDBT_FAILED;

  if (res.startNodes(&master, 1))
    return NDBT_FAILED;

  if (res.waitClusterStarted())
    return NDBT_FAILED;

  return NDBT_OK;
}
NDBT_TESTSUITE(testNodeRestart);
TESTCASE("NoLoad",
"Test that one node at a time can be stopped and then restarted "\
......@@ -1686,6 +1731,9 @@ TESTCASE("Bug28717", ""){
TESTCASE("Bug29364", ""){
INITIALIZER(runBug29364);
}
// Bug32160: LCP master take-over regression. Only an INITIALIZER is
// registered, so the whole scenario lives in runBug32160.
TESTCASE("Bug32160", ""){
INITIALIZER(runBug32160);
}
NDBT_TESTSUITE_END(testNodeRestart);
int main(int argc, const char** argv){
......
......@@ -497,6 +497,10 @@ max-time: 1000
cmd: testNodeRestart
args: -n Bug26481 T1
max-time: 300
cmd: testNodeRestart
args: -n Bug32160 T1
# OLD FLEX
max-time: 500
cmd: flexBench
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment