Commit 3a4078c6 authored by unknown

ndb - bug#29331 (51)

    Add better handling of GCP Stop
    Only kill "offending" node


storage/ndb/src/kernel/blocks/ERROR_codes.txt:
  add new error codes
storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
  add better GCP stop handling
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  add better GCP stop handling
parent 5119ddfc
......@@ -5,7 +5,7 @@ Next DBACC 3002
Next DBTUP 4029
Next DBLQH 5045
Next DBDICT 6007
Next DBDIH 7183
Next DBDIH 7186
Next DBTC 8040
Next CMVMI 9000
Next BACKUP 10038
......@@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs
7180: Crash master during master-take-over in execMASTER_LCPCONF
7184: Crash before starting next GCP after a node failure
7185: Don't reply to COPY_GCI_REQ where reason == GCP
ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING:
-----------------------------------------------------------------
......
......@@ -899,7 +899,7 @@ private:
void ndbsttorry10Lab(Signal *, Uint32 _line);
void createMutexes(Signal* signal, Uint32 no);
void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal);
void crashSystemAtGcpStop(Signal *);
void crashSystemAtGcpStop(Signal *, bool);
void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr);
void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode);
void GCP_SAVEhandling(Signal *, Uint32 nodeId);
......
......@@ -747,6 +747,13 @@ void Dbdih::execCOPY_GCIREQ(Signal* signal)
}
ndbrequire(ok);
if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT)
{
jam();
return;
}
/* ----------------------------------------------------------------------- */
/* WE START BY TRYING TO OPEN THE FIRST RESTORABLE GCI FILE. */
/* ----------------------------------------------------------------------- */
......@@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal)
CLEAR_ERROR_INSERT_VALUE;
}
if (ERROR_INSERTED(7184))
{
SET_ERROR_INSERT_VALUE(7000);
}
/*-------------------------------------------------------------------------*/
// The first step is to convert from a bit mask to an array of failed nodes.
/*-------------------------------------------------------------------------*/
......@@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus);
#endif
crashSystemAtGcpStop(signal);
crashSystemAtGcpStop(signal, false);
return;
}//if
} else {
......@@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal)
g_eventLogger.error("System crash due to GCP Stop in state = %u",
(Uint32) cgcpStatus);
#endif
crashSystemAtGcpStop(signal);
crashSystemAtGcpStop(signal, false);
return;
}//if
} else {
......@@ -11117,37 +11129,128 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr)
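* GCP stop detected.
* If local is false and the current GCP state identifies the nodes still
* being waited for, send DUMP_STATE_ORD (and, in most states, SYSTEM_ERROR)
* to those nodes only; otherwise print the local state and
* send SYSTEM_ERROR to all other alive nodes.
* If local is true, print the local state and crash this node only.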
*/
void Dbdih::crashSystemAtGcpStop(Signal* signal)
void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local)
{
if (local)
goto dolocal;
switch(cgcpStatus){
case GCP_PREPARE_SENT:
{
jam();
/**
* We're waiting for a GCP PREPARE CONF
*/
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_PREPARE_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_PREPARE_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_GCP_PREPARE_Counter.done());
return;
}
case GCP_COMMIT_SENT:
{
jam();
/**
* We're waiting for a GCP_NODEFINISH
*/
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_COMMIT_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_COMMIT_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_GCP_COMMIT_Counter.done());
return;
}
case GCP_NODE_FINISHED:
{
jam();
/**
* We're waiting for a GCP save conf
*/
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter);
signal->theData[0] = 2305;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB);
infoEvent("Detected GCP stop...sending kill to %s",
c_GCP_SAVEREQ_Counter.getText());
g_eventLogger.error("Detected GCP stop...sending kill to %s",
c_GCP_SAVEREQ_Counter.getText());
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_GCP_SAVEREQ_Counter.getText());
ndbrequire(!c_GCP_SAVEREQ_Counter.done());
return;
}
case GCP_SAVE_LQH_FINISHED:
g_eventLogger.error("m_copyReason: %d m_waiting: %d",
{
jam();
/**
* We're waiting for a COPY_GCICONF
*/
infoEvent("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_COPY_GCIREQ_Counter.getText());
ndbout_c("Detected GCP stop(%d)...sending kill to %s",
cgcpStatus, c_COPY_GCIREQ_Counter.getText());
{
NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter);
signal->theData[0] = 7022;
sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA);
}
{
NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter);
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
sendSignal(rg, GSN_SYSTEM_ERROR, signal,
SystemError::SignalLength, JBA);
}
ndbrequire(!c_COPY_GCIREQ_Counter.done());
return;
}
case GCP_READY: (void)1;
}
dolocal:
ndbout_c("m_copyReason: %d m_waiting: %d",
c_copyGCIMaster.m_copyReason,
c_copyGCIMaster.m_waiting);
break;
case GCP_READY: // shut up lint
case GCP_PREPARE_SENT:
case GCP_COMMIT_SENT:
break;
}
g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d",
c_copyGCISlave.m_senderData,
c_copyGCISlave.m_senderRef,
c_copyGCISlave.m_copyReason,
......@@ -11202,6 +11305,9 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
c_TCGETOPSIZEREQ_Counter.getText());
ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText());
if (local == false)
{
jam();
NodeRecordPtr nodePtr;
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
jam();
......@@ -11219,6 +11325,19 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal)
SystemError::SignalLength, JBA);
}//if
}//for
}
else
{
jam();
SystemError * const sysErr = (SystemError*)&signal->theData[0];
sysErr->errorCode = SystemError::GCPStopDetected;
sysErr->errorRef = reference();
sysErr->data1 = cgcpStatus;
sysErr->data2 = cgcpOrderBlocked;
EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR,
signal, SystemError::SignalLength);
ndbrequire(false);
}
return;
}//Dbdih::crashSystemAtGcpStop()
......@@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
infoEvent(buf);
}
}
if (arg == 7022)
{
jam();
crashSystemAtGcpStop(signal, true);
}
}//Dbdih::execDUMP_STATE_ORD()
void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment