Commit b4199e91 authored by unknown

ndb - bug#29331 (51)

    Add better handling of GCP stop:
    only kill the "offending" node(s) that have not replied, instead of all alive nodes


storage/ndb/src/kernel/blocks/ERROR_codes.txt:
  add new error codes
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
  add better GCP stop handling
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  add better GCP stop handling
parent ea88a770
...@@ -5,7 +5,7 @@ Next DBACC 3002 ...@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029 Next DBTUP 4029
Next DBLQH 5045 Next DBLQH 5045
Next DBDICT 6007 Next DBDICT 6007
Next DBDIH 7183 Next DBDIH 7186
Next DBTC 8040 Next DBTC 8040
Next CMVMI 9000 Next CMVMI 9000
Next BACKUP 10038 Next BACKUP 10038
...@@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs ...@@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF 7180: Crash master during master-take-over in execMASTER_LCPCONF
7184: Crash before starting next GCP after a node failure
7185: Don't reply to COPY_GCIREQ where reason == GLOBAL_CHECKPOINT
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
----------------------------------------------------------------- -----------------------------------------------------------------
......
...@@ -899,7 +899,7 @@ private: ...@@ -899,7 +899,7 @@ private:
void ndbsttorry10Lab(Signal *, Uint32 _line); void ndbsttorry10Lab(Signal *, Uint32 _line);
void createMutexes(Signal* signal, Uint32 no); void createMutexes(Signal* signal, Uint32 no);
void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal); void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
void crashSystemAtGcpStop(Signal *); void crashSystemAtGcpStop(Signal *, bool);
void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr); void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode); void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
void GCP_SAVEhandling(Signal *, Uint32 nodeId); void GCP_SAVEhandling(Signal *, Uint32 nodeId);
......
...@@ -747,6 +747,13 @@ void Dbdih::execCOPY_GCIREQ(Signal* signal) ...@@ -747,6 +747,13 @@ void Dbdih::execCOPY_GCIREQ(Signal* signal)
} }
ndbrequire(ok); ndbrequire(ok);
if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
{
jam();
return;
}
/* ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */ /* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */ /* ----------------------------------------------------------------------- */
...@@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal) ...@@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
CLEAR_ERROR_INSERT_VALUE; CLEAR_ERROR_INSERT_VALUE;
} }
if (ERROR_INSERTED(7184))
{
SET_ERROR_INSERT_VALUE(7000);
}
/*-------------------------------------------------------------------------*/ /*-------------------------------------------------------------------------*/
// The first step is to convert from a bit mask to an array of failed nodes. // The first step is to convert from a bit mask to an array of failed nodes.
/*-------------------------------------------------------------------------*/ /*-------------------------------------------------------------------------*/
...@@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal) ...@@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger.error("System crash due to GCP Stop in state = %u", g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus); (Uint32) cgcpStatus);
#endif #endif
crashSystemAtGcpStop(signal); crashSystemAtGcpStop(signal, false);
return; return;
}//if }//if
} else { } else {
...@@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal) ...@@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger.error("System crash due to GCP Stop in state = %u", g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus); (Uint32) cgcpStatus);
#endif #endif
crashSystemAtGcpStop(signal); crashSystemAtGcpStop(signal, false);
return; return;
}//if }//if
} else { } else {
...@@ -11117,41 +11129,132 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr) ...@@ -11117,41 +11129,132 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
* GCP stop detected, * GCP stop detected,
* send SYSTEM_ERROR to all other alive nodes * send SYSTEM_ERROR to all other alive nodes
*/ */
void Dbdih::crashSystemAtGcpStop(Signal* signal) void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
{ {
if (local)
goto dolocal;
switch(cgcpStatus){ switch(cgcpStatus){
case GCP_PREPARE_SENT:
{
jam();
/**
* We're waiting for a GCP PREPARE CONF
*/
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_PREPARE_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_PREPARE_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_GCP_PREPARE_Counter.done());
return;
}
case GCP_COMMIT_SENT:
{
jam();
/**
* We're waiting for a GCP_NODEFINISH
*/
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_COMMIT_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_COMMIT_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_GCP_COMMIT_Counter.done());
return;
}
case GCP_NODE_FINISHED: case GCP_NODE_FINISHED:
{ {
jam();
/** /**
* We're waiting for a GCP save conf * We're waiting for a GCP save conf
*/ */
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter); NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
signal->theData[0] = 2305; signal->theData[0] = 2305;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
infoEvent("Detected GCP stop...sending kill to %s", infoEvent("Detected GCP stop(%d)...sending kill to %s",
c_GCP_SAVEREQ_Counter.getText()); cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
g_eventLogger.error("Detected GCP stop...sending kill to %s", ndbout_c("Detected GCP stop(%d)...sending kill to %s",
c_GCP_SAVEREQ_Counter.getText()); cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
return; return;
} }
case GCP_SAVE_LQH_FINISHED: case GCP_SAVE_LQH_FINISHED:
g_eventLogger.error("m_copyReason: %d m_waiting: %d", {
c_copyGCIMaster.m_copyReason, jam();
c_copyGCIMaster.m_waiting); /**
break; * We're waiting for a COPY_GCICONF
case GCP_READY: // shut up lint */
case GCP_PREPARE_SENT: infoEvent("Detected GCP stop(%d)...sending kill to %s",
case GCP_COMMIT_SENT: cgcpStatus, c_COPY_GCIREQ_Counter.getText());
break; ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_COPY_GCIREQ_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_COPY_GCIREQ_Counter.done());
return;
}
case GCP_READY: (void)1;
} }
dolocal:
ndbout_c("m_copyReason: %d m_waiting: %d",
c_copyGCIMaster.m_copyReason,
c_copyGCIMaster.m_waiting);
g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d", ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
c_copyGCISlave.m_senderData, c_copyGCISlave.m_senderData,
c_copyGCISlave.m_senderRef, c_copyGCISlave.m_senderRef,
c_copyGCISlave.m_copyReason, c_copyGCISlave.m_copyReason,
c_copyGCISlave.m_expectedNextWord); c_copyGCISlave.m_expectedNextWord);
FileRecordPtr file0Ptr; FileRecordPtr file0Ptr;
file0Ptr.i = crestartInfoFile[0]; file0Ptr.i = crestartInfoFile[0];
...@@ -11202,23 +11305,39 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal) ...@@ -11202,23 +11305,39 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
c_TCGETOPSIZEREQ_Counter.getText()); c_TCGETOPSIZEREQ_Counter.getText());
ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText()); ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
NodeRecordPtr nodePtr; if (local == false)
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { {
jam(); jam();
ptrAss(nodePtr, nodeRecord); NodeRecordPtr nodePtr;
if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) { for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
jam(); jam();
const BlockReference ref = ptrAss(nodePtr, nodeRecord);
numberToRef(refToBlock(cntrlblockref), nodePtr.i); if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) {
SystemError * const sysErr = (SystemError*)&signal->theData[0]; jam();
sysErr->errorCode = SystemError::GCPStopDetected; const BlockReference ref =
sysErr->errorRef = reference(); numberToRef(refToBlock(cntrlblockref), nodePtr.i);
sysErr->data1 = cgcpStatus; SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->data2 = cgcpOrderBlocked; sysErr->errorCode = SystemError::GCPStopDetected;
sendSignal(ref, GSN_SYSTEM_ERROR, signal, sysErr->errorRef = reference();
SystemError::SignalLength, JBA); sysErr->data1 = cgcpStatus;
}//if sysErr->data2 = cgcpOrderBlocked;
}//for sendSignal(ref, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}//if
}//for
}
else
{
jam();
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
signal, SystemError::SignalLength);
ndbrequire(false);
}
return; return;
}//Dbdih::crashSystemAtGcpStop() }//Dbdih::crashSystemAtGcpStop()
...@@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) ...@@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
infoEvent(buf); infoEvent(buf);
} }
} }
if (arg == 7022)
{
jam();
crashSystemAtGcpStop(signal, true);
}
}//Dbdih::execDUMP_STATE_ORD() }//Dbdih::execDUMP_STATE_ORD()
void void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment