Commit 050ae050 authored by unknown

ndb - bug#25801

  - improve error message if starting without enough REDO log
  - decrease likelihood of trying to start too early


storage/ndb/include/mgmapi/ndbd_exit_codes.h:
  Add new error code (that arguably should have been there a long time ago)
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Add new check (during system restart, SR) that sufficient REDO is present
    before continuing the SR
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  Add list of per-node GCIs so that we can check for sufficient REDO during an SR
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  Add check for REDO during SR
    so that
  1) the cluster does not try to start too soon
  2) a better error message (than "internal error") is provided if not enough REDO is present
storage/ndb/src/kernel/error/ndbd_exit_codes.c:
  Add new error code (that arguably should have been there a long time ago)
parent a29d7c5b
...@@ -146,6 +146,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification; ...@@ -146,6 +146,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification;
#define NDBD_EXIT_AFS_READ_UNDERFLOW 2816 #define NDBD_EXIT_AFS_READ_UNDERFLOW 2816
#define NDBD_EXIT_INVALID_LCP_FILE 2352 #define NDBD_EXIT_INVALID_LCP_FILE 2352
#define NDBD_EXIT_INSUFFICENT_NODES 2353
const char * const char *
ndbd_exit_message(int faultId, ndbd_exit_classification *cl); ndbd_exit_message(int faultId, ndbd_exit_classification *cl);
......
...@@ -1194,12 +1194,59 @@ void Dbdih::execTAB_COMMITREQ(Signal* signal) ...@@ -1194,12 +1194,59 @@ void Dbdih::execTAB_COMMITREQ(Signal* signal)
void Dbdih::execDIH_RESTARTREQ(Signal* signal) void Dbdih::execDIH_RESTARTREQ(Signal* signal)
{ {
jamEntry(); jamEntry();
if (signal->theData[0])
{
jam();
cntrlblockref = signal->theData[0]; cntrlblockref = signal->theData[0];
if(m_ctx.m_config.getInitialStart()){ if(m_ctx.m_config.getInitialStart()){
sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB); sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
} else { } else {
readGciFileLab(signal); readGciFileLab(signal);
} }
}
else
{
/**
* Precondition, (not checked)
* atleast 1 node in each node group
*/
Uint32 i;
NdbNodeBitmask mask;
mask.assign(NdbNodeBitmask::Size, signal->theData + 1);
Uint32 *node_gcis = signal->theData+1+NdbNodeBitmask::Size;
Uint32 node_group_gcis[MAX_NDB_NODES+1];
bzero(node_group_gcis, sizeof(node_group_gcis));
for (i = 0; i<MAX_NDB_NODES; i++)
{
if (mask.get(i))
{
jam();
Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
ndbrequire(ng < MAX_NDB_NODES);
Uint32 gci = node_gcis[i];
if (gci > node_group_gcis[ng])
{
jam();
node_group_gcis[ng] = gci;
}
}
}
for (i = 0; i<MAX_NDB_NODES && node_group_gcis[i] == 0; i++);
Uint32 gci = node_group_gcis[i];
for (i++ ; i<MAX_NDB_NODES; i++)
{
jam();
if (node_group_gcis[i] && node_group_gcis[i] != gci)
{
jam();
signal->theData[0] = i;
return;
}
}
signal->theData[0] = MAX_NDB_NODES;
return;
}
return; return;
}//Dbdih::execDIH_RESTARTREQ() }//Dbdih::execDIH_RESTARTREQ()
...@@ -12391,7 +12438,7 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[]) ...@@ -12391,7 +12438,7 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[])
(buf, sizeof(buf), (buf, sizeof(buf),
"Illegal initial start, no alive node in nodegroup %u", i); "Illegal initial start, no alive node in nodegroup %u", i);
progError(__LINE__, progError(__LINE__,
NDBD_EXIT_SR_RESTARTCONFLICT, NDBD_EXIT_INSUFFICENT_NODES,
buf); buf);
} }
......
...@@ -128,6 +128,7 @@ public: ...@@ -128,6 +128,7 @@ public:
Uint32 m_president_candidate_gci; Uint32 m_president_candidate_gci;
Uint16 m_regReqReqSent; Uint16 m_regReqReqSent;
Uint16 m_regReqReqRecv; Uint16 m_regReqReqRecv;
Uint32 m_node_gci[MAX_NDB_NODES];
} c_start; } c_start;
NdbNodeBitmask c_definedNodes; // DB nodes in config NdbNodeBitmask c_definedNodes; // DB nodes in config
......
...@@ -1093,6 +1093,7 @@ void Qmgr::execCM_REGREF(Signal* signal) ...@@ -1093,6 +1093,7 @@ void Qmgr::execCM_REGREF(Signal* signal)
jam(); jam();
c_start.m_starting_nodes_w_log.set(TaddNodeno); c_start.m_starting_nodes_w_log.set(TaddNodeno);
} }
c_start.m_node_gci[TaddNodeno] = node_gci;
skip_nodes.bitAND(c_definedNodes); skip_nodes.bitAND(c_definedNodes);
c_start.m_skip_nodes.bitOR(skip_nodes); c_start.m_skip_nodes.bitOR(skip_nodes);
...@@ -1242,6 +1243,7 @@ Qmgr::check_startup(Signal* signal) ...@@ -1242,6 +1243,7 @@ Qmgr::check_startup(Signal* signal)
wait.bitANDC(tmp); wait.bitANDC(tmp);
Uint32 retVal = 0; Uint32 retVal = 0;
Uint32 incompleteng = MAX_NDB_NODES; // Illegal value
NdbNodeBitmask report_mask; NdbNodeBitmask report_mask;
if ((c_start.m_latest_gci == 0) || if ((c_start.m_latest_gci == 0) ||
...@@ -1327,7 +1329,7 @@ Qmgr::check_startup(Signal* signal) ...@@ -1327,7 +1329,7 @@ Qmgr::check_startup(Signal* signal)
report_mask.assign(c_definedNodes); report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes); report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1; retVal = 1;
goto start_report; goto check_log;
case CheckNodeGroups::Partitioning: case CheckNodeGroups::Partitioning:
ndbrequire(result != CheckNodeGroups::Lose); ndbrequire(result != CheckNodeGroups::Lose);
signal->theData[1] = signal->theData[1] =
...@@ -1335,7 +1337,7 @@ Qmgr::check_startup(Signal* signal) ...@@ -1335,7 +1337,7 @@ Qmgr::check_startup(Signal* signal)
report_mask.assign(c_definedNodes); report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes); report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1; retVal = 1;
goto start_report; goto check_log;
} }
} }
...@@ -1359,12 +1361,7 @@ Qmgr::check_startup(Signal* signal) ...@@ -1359,12 +1361,7 @@ Qmgr::check_startup(Signal* signal)
case CheckNodeGroups::Partitioning: case CheckNodeGroups::Partitioning:
if (now < partitioned_timeout && result != CheckNodeGroups::Win) if (now < partitioned_timeout && result != CheckNodeGroups::Win)
{ {
signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5; goto missinglog;
signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 0;
goto start_report;
} }
// Fall through... // Fall through...
case CheckNodeGroups::Win: case CheckNodeGroups::Win:
...@@ -1372,12 +1369,61 @@ Qmgr::check_startup(Signal* signal) ...@@ -1372,12 +1369,61 @@ Qmgr::check_startup(Signal* signal)
all ? 0x8001 : (result == CheckNodeGroups::Win ? 0x8002 : 0x8003); all ? 0x8001 : (result == CheckNodeGroups::Win ? 0x8002 : 0x8003);
report_mask.assign(c_definedNodes); report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes); report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1; retVal = 2;
goto start_report; goto check_log;
} }
} }
ndbrequire(false); ndbrequire(false);
check_log:
jam();
{
Uint32 save[4+4*NdbNodeBitmask::Size];
memcpy(save, signal->theData, sizeof(save));
signal->theData[0] = 0;
c_start.m_starting_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
memcpy(signal->theData+1+NdbNodeBitmask::Size, c_start.m_node_gci,
4*MAX_NDB_NODES);
EXECUTE_DIRECT(DBDIH, GSN_DIH_RESTARTREQ, signal,
1+NdbNodeBitmask::Size+MAX_NDB_NODES);
incompleteng = signal->theData[0];
memcpy(signal->theData, save, sizeof(save));
if (incompleteng != MAX_NDB_NODES)
{
jam();
if (retVal == 1)
{
jam();
goto incomplete_log;
}
else if (retVal == 2)
{
if (now <= partitioned_timeout)
{
jam();
goto missinglog;
}
else
{
goto incomplete_log;
}
}
ndbrequire(false);
}
}
goto start_report;
missinglog:
signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 0;
goto start_report;
start_report: start_report:
jam(); jam();
{ {
...@@ -1396,6 +1442,7 @@ Qmgr::check_startup(Signal* signal) ...@@ -1396,6 +1442,7 @@ Qmgr::check_startup(Signal* signal)
missing_nodegroup: missing_nodegroup:
jam(); jam();
{
char buf[100], mask1[100], mask2[100]; char buf[100], mask1[100], mask2[100];
c_start.m_starting_nodes.getText(mask1); c_start.m_starting_nodes.getText(mask1);
tmp.assign(c_start.m_starting_nodes); tmp.assign(c_start.m_starting_nodes);
...@@ -1405,8 +1452,22 @@ Qmgr::check_startup(Signal* signal) ...@@ -1405,8 +1452,22 @@ Qmgr::check_startup(Signal* signal)
"Unable to start missing node group! " "Unable to start missing node group! "
" starting: %s (missing fs for: %s)", " starting: %s (missing fs for: %s)",
mask1, mask2); mask1, mask2);
progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT, buf); progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
return 0; // Deadcode
}
incomplete_log:
jam();
{
char buf[100], mask1[100];
c_start.m_starting_nodes.getText(mask1);
BaseString::snprintf(buf, sizeof(buf),
"Incomplete log for node group: %d! "
" starting nodes: %s",
incompleteng, mask1);
progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
return 0; // Deadcode return 0; // Deadcode
}
} }
void void
......
...@@ -160,6 +160,7 @@ static const ErrStruct errArray[] = ...@@ -160,6 +160,7 @@ static const ErrStruct errArray[] =
{NDBD_EXIT_AFS_READ_UNDERFLOW , XFI, "Read underflow"}, {NDBD_EXIT_AFS_READ_UNDERFLOW , XFI, "Read underflow"},
{NDBD_EXIT_INVALID_LCP_FILE, XFI, "Invalid LCP" }, {NDBD_EXIT_INVALID_LCP_FILE, XFI, "Invalid LCP" },
{NDBD_EXIT_INSUFFICENT_NODES, XRE, "Insufficent nodes for system restart" },
/* Sentinel */ /* Sentinel */
{0, XUE, {0, XUE,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment