Commit 050ae050 authored by unknown

ndb - bug#25801

  - improve error message if starting without enough REDO
  - decrease likelihood of trying to start too early


storage/ndb/include/mgmapi/ndbd_exit_codes.h:
  Add new error code (that arguably should have been there a long time ago)
storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Add new check (during SR) that sufficient REDO is present
    before continuing the SR
storage/ndb/src/kernel/blocks/qmgr/Qmgr.hpp:
  Add list of per-node GCIs so that we can check for sufficient REDO during an SR
storage/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp:
  Add check for REDO during SR
    so that
  1) the cluster does not try to start too soon
  2) a better error message (than an internal error) is provided if not enough REDO is present
storage/ndb/src/kernel/error/ndbd_exit_codes.c:
  Add new error code (that arguably should have been there a long time ago)
parent a29d7c5b
......@@ -146,6 +146,7 @@ typedef ndbd_exit_classification_enum ndbd_exit_classification;
#define NDBD_EXIT_AFS_READ_UNDERFLOW 2816
#define NDBD_EXIT_INVALID_LCP_FILE 2352
/* Exit code raised when a restart cannot proceed because a node group is
 * missing or the starting nodes have an incomplete log. The identifier's
 * spelling ("INSUFFICENT") is what the rest of the code references. */
#define NDBD_EXIT_INSUFFICENT_NODES 2353
/* NOTE(review): presumably returns the message text for faultId and reports
 * its classification through cl -- confirm against ndbd_exit_codes.c. */
const char *
ndbd_exit_message(int faultId, ndbd_exit_classification *cl);
......
......@@ -1194,11 +1194,58 @@ void Dbdih::execTAB_COMMITREQ(Signal* signal)
/**
 * Handler for DIH_RESTARTREQ.
 *
 * The first signal word selects what is done:
 *  - theData[0] != 0: the word is stored in cntrlblockref. On an initial
 *    start DIH_RESTARTREF is sent to that reference, otherwise
 *    readGciFileLab() is called.
 *  - theData[0] == 0: words 1.. hold a node bitmask (NdbNodeBitmask::Size
 *    words) followed by one GCI per node. The highest GCI per node group is
 *    computed and the answer is written back into theData[0]: the index of
 *    the first node group whose non-zero GCI differs from the first
 *    non-zero one found, or MAX_NDB_NODES when no such group exists.
 */
void Dbdih::execDIH_RESTARTREQ(Signal* signal)
{
jamEntry();
// NOTE(review): the next five lines repeat the non-zero branch further down
// and leave an opening brace unmatched; they look like the pre-change body
// shown inline by this diff view -- confirm against the real file.
cntrlblockref = signal->theData[0];
if(m_ctx.m_config.getInitialStart()){
sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
} else {
readGciFileLab(signal);
if (signal->theData[0])
{
jam();
// Non-zero first word: remember it; it is the destination of the
// DIH_RESTARTREF sent on an initial start.
cntrlblockref = signal->theData[0];
if(m_ctx.m_config.getInitialStart()){
sendSignal(cntrlblockref, GSN_DIH_RESTARTREF, signal, 1, JBB);
} else {
readGciFileLab(signal);
}
}
else
{
/**
* Precondition (not checked):
* at least 1 node in each node group
*/
Uint32 i;
NdbNodeBitmask mask;
// Words 1..NdbNodeBitmask::Size carry the bitmask of nodes to consider.
mask.assign(NdbNodeBitmask::Size, signal->theData + 1);
// The per-node GCI array follows directly after the bitmask.
Uint32 *node_gcis = signal->theData+1+NdbNodeBitmask::Size;
// One extra slot so that node_group_gcis[i] stays in bounds when the scan
// below ends with i == MAX_NDB_NODES.
Uint32 node_group_gcis[MAX_NDB_NODES+1];
bzero(node_group_gcis, sizeof(node_group_gcis));
// Record, per node group, the highest GCI among the nodes set in mask.
for (i = 0; i<MAX_NDB_NODES; i++)
{
if (mask.get(i))
{
jam();
Uint32 ng = Sysfile::getNodeGroup(i, SYSFILE->nodeGroups);
ndbrequire(ng < MAX_NDB_NODES);
Uint32 gci = node_gcis[i];
if (gci > node_group_gcis[ng])
{
jam();
node_group_gcis[ng] = gci;
}
}
}
// Skip leading node groups with no recorded GCI (empty loop body).
for (i = 0; i<MAX_NDB_NODES && node_group_gcis[i] == 0; i++);
// Reference value: GCI of the first node group that has one.
Uint32 gci = node_group_gcis[i];
// Report the first later node group whose non-zero GCI differs from it.
for (i++ ; i<MAX_NDB_NODES; i++)
{
jam();
if (node_group_gcis[i] && node_group_gcis[i] != gci)
{
jam();
signal->theData[0] = i;
return;
}
}
// No mismatch found: MAX_NDB_NODES is the "all consistent" answer.
signal->theData[0] = MAX_NDB_NODES;
return;
}
return;
}//Dbdih::execDIH_RESTARTREQ()
......@@ -12391,7 +12438,7 @@ void Dbdih::makeNodeGroups(Uint32 nodeArray[])
(buf, sizeof(buf),
"Illegal initial start, no alive node in nodegroup %u", i);
progError(__LINE__,
NDBD_EXIT_SR_RESTARTCONFLICT,
NDBD_EXIT_INSUFFICENT_NODES,
buf);
}
......
......@@ -128,6 +128,7 @@ public:
Uint32 m_president_candidate_gci;
Uint16 m_regReqReqSent;
Uint16 m_regReqReqRecv;
Uint32 m_node_gci[MAX_NDB_NODES];
} c_start;
NdbNodeBitmask c_definedNodes; // DB nodes in config
......
......@@ -1093,7 +1093,8 @@ void Qmgr::execCM_REGREF(Signal* signal)
jam();
c_start.m_starting_nodes_w_log.set(TaddNodeno);
}
c_start.m_node_gci[TaddNodeno] = node_gci;
skip_nodes.bitAND(c_definedNodes);
c_start.m_skip_nodes.bitOR(skip_nodes);
......@@ -1242,6 +1243,7 @@ Qmgr::check_startup(Signal* signal)
wait.bitANDC(tmp);
Uint32 retVal = 0;
Uint32 incompleteng = MAX_NDB_NODES; // Illegal value
NdbNodeBitmask report_mask;
if ((c_start.m_latest_gci == 0) ||
......@@ -1327,7 +1329,7 @@ Qmgr::check_startup(Signal* signal)
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1;
goto start_report;
goto check_log;
case CheckNodeGroups::Partitioning:
ndbrequire(result != CheckNodeGroups::Lose);
signal->theData[1] =
......@@ -1335,7 +1337,7 @@ Qmgr::check_startup(Signal* signal)
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1;
goto start_report;
goto check_log;
}
}
......@@ -1359,12 +1361,7 @@ Qmgr::check_startup(Signal* signal)
case CheckNodeGroups::Partitioning:
if (now < partitioned_timeout && result != CheckNodeGroups::Win)
{
signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 0;
goto start_report;
goto missinglog;
}
// Fall through...
case CheckNodeGroups::Win:
......@@ -1372,12 +1369,61 @@ Qmgr::check_startup(Signal* signal)
all ? 0x8001 : (result == CheckNodeGroups::Win ? 0x8002 : 0x8003);
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 1;
goto start_report;
retVal = 2;
goto check_log;
}
}
ndbrequire(false);
check_log:
jam();
{
Uint32 save[4+4*NdbNodeBitmask::Size];
memcpy(save, signal->theData, sizeof(save));
signal->theData[0] = 0;
c_start.m_starting_nodes.copyto(NdbNodeBitmask::Size, signal->theData+1);
memcpy(signal->theData+1+NdbNodeBitmask::Size, c_start.m_node_gci,
4*MAX_NDB_NODES);
EXECUTE_DIRECT(DBDIH, GSN_DIH_RESTARTREQ, signal,
1+NdbNodeBitmask::Size+MAX_NDB_NODES);
incompleteng = signal->theData[0];
memcpy(signal->theData, save, sizeof(save));
if (incompleteng != MAX_NDB_NODES)
{
jam();
if (retVal == 1)
{
jam();
goto incomplete_log;
}
else if (retVal == 2)
{
if (now <= partitioned_timeout)
{
jam();
goto missinglog;
}
else
{
goto incomplete_log;
}
}
ndbrequire(false);
}
}
goto start_report;
missinglog:
signal->theData[1] = c_restartPartionedTimeout == (Uint32) ~0 ? 4 : 5;
signal->theData[2] = Uint32((partitioned_timeout - now + 500) / 1000);
report_mask.assign(c_definedNodes);
report_mask.bitANDC(c_start.m_starting_nodes);
retVal = 0;
goto start_report;
start_report:
jam();
{
......@@ -1396,17 +1442,32 @@ Qmgr::check_startup(Signal* signal)
missing_nodegroup:
jam();
char buf[100], mask1[100], mask2[100];
c_start.m_starting_nodes.getText(mask1);
tmp.assign(c_start.m_starting_nodes);
tmp.bitANDC(c_start.m_starting_nodes_w_log);
tmp.getText(mask2);
BaseString::snprintf(buf, sizeof(buf),
"Unable to start missing node group! "
" starting: %s (missing fs for: %s)",
mask1, mask2);
progError(__LINE__, NDBD_EXIT_SR_RESTARTCONFLICT, buf);
return 0; // Deadcode
{
char buf[100], mask1[100], mask2[100];
c_start.m_starting_nodes.getText(mask1);
tmp.assign(c_start.m_starting_nodes);
tmp.bitANDC(c_start.m_starting_nodes_w_log);
tmp.getText(mask2);
BaseString::snprintf(buf, sizeof(buf),
"Unable to start missing node group! "
" starting: %s (missing fs for: %s)",
mask1, mask2);
progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
return 0; // Deadcode
}
incomplete_log:
jam();
{
char buf[100], mask1[100];
c_start.m_starting_nodes.getText(mask1);
BaseString::snprintf(buf, sizeof(buf),
"Incomplete log for node group: %d! "
" starting nodes: %s",
incompleteng, mask1);
progError(__LINE__, NDBD_EXIT_INSUFFICENT_NODES, buf);
return 0; // Deadcode
}
}
void
......
......@@ -160,6 +160,7 @@ static const ErrStruct errArray[] =
{NDBD_EXIT_AFS_READ_UNDERFLOW , XFI, "Read underflow"},
{NDBD_EXIT_INVALID_LCP_FILE, XFI, "Invalid LCP" },
{NDBD_EXIT_INSUFFICENT_NODES, XRE, "Insufficent nodes for system restart" },
/* Sentinel */
{0, XUE,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment