Commit ea16e525 authored by unknown

BUG#18966 Change in stop/shutdown behaviour

fix behaviour of ALL STOP and SHUTDOWN in relation to MGM nodes


ndb/src/mgmclient/CommandInterpreter.cpp:
  Shut down management servers properly (the one we're connected to is shut down last).
  
  Fix potential problems when disconnecting from one mgmd and connecting to another (one that belongs to a different cluster).
ndb/src/mgmsrv/MgmtSrvr.cpp:
  fix regression in how nodes are stopped
  ALL STOP - db nodes only
  SHUTDOWN - db and mgm nodes
  
  Fix a race where mgmd could stop itself before sending the response to the stop request.
  In ~MgmApiSession() we now do the disconnect.
  
  Several functions now take a stopSelf return parameter, which tells the caller
  whether it needs to stop or restart this daemon.
ndb/src/mgmsrv/MgmtSrvr.hpp:
  add stopSelf return parameter to stopNodes and restartNodes.
  
  Rename stop to shutdownDB as this name better reflects what it does
  
  Rename restart to restartDB as this name better reflects what it does
ndb/src/mgmsrv/Services.cpp:
  Stop or restart the server only when the mgm session that requested it disconnects.
ndb/src/mgmsrv/Services.hpp:
  add m_stopSelf member for tracking what we should do
parent c172fe2c
......@@ -162,6 +162,7 @@ class CommandInterpreter {
NdbMgmHandle m_mgmsrv;
NdbMgmHandle m_mgmsrv2;
const char *m_constr;
bool m_connected;
int m_verbose;
int try_reconnect;
......@@ -390,22 +391,7 @@ convert(const char* s, int& val) {
CommandInterpreter::CommandInterpreter(const char *_host,int verbose)
: m_verbose(verbose)
{
m_mgmsrv = ndb_mgm_create_handle();
if(m_mgmsrv == NULL) {
ndbout_c("Cannot create handle to management server.");
exit(-1);
}
m_mgmsrv2 = ndb_mgm_create_handle();
if(m_mgmsrv2 == NULL) {
ndbout_c("Cannot create 2:nd handle to management server.");
exit(-1);
}
if (ndb_mgm_set_connectstring(m_mgmsrv, _host))
{
printError();
exit(-1);
}
m_constr= _host;
m_connected= false;
m_event_thread= 0;
try_reconnect = 0;
......@@ -422,8 +408,6 @@ CommandInterpreter::CommandInterpreter(const char *_host,int verbose)
CommandInterpreter::~CommandInterpreter()
{
disconnect();
ndb_mgm_destroy_handle(&m_mgmsrv);
ndb_mgm_destroy_handle(&m_mgmsrv2);
}
static bool
......@@ -447,7 +431,6 @@ CommandInterpreter::printError()
{
if (ndb_mgm_check_connection(m_mgmsrv))
{
m_connected= false;
disconnect();
}
ndbout_c("* %5d: %s",
......@@ -500,10 +483,30 @@ bool
CommandInterpreter::connect()
{
DBUG_ENTER("CommandInterpreter::connect");
if(!m_connected)
{
if(!ndb_mgm_connect(m_mgmsrv, try_reconnect-1, 5, 1))
if(m_connected)
DBUG_RETURN(m_connected);
m_mgmsrv = ndb_mgm_create_handle();
if(m_mgmsrv == NULL) {
ndbout_c("Cannot create handle to management server.");
exit(-1);
}
m_mgmsrv2 = ndb_mgm_create_handle();
if(m_mgmsrv2 == NULL) {
ndbout_c("Cannot create 2:nd handle to management server.");
exit(-1);
}
if (ndb_mgm_set_connectstring(m_mgmsrv, m_constr))
{
printError();
exit(-1);
}
if(ndb_mgm_connect(m_mgmsrv, try_reconnect-1, 5, 1))
DBUG_RETURN(m_connected); // couldn't connect, always false
const char *host= ndb_mgm_get_connected_host(m_mgmsrv);
unsigned port= ndb_mgm_get_connected_port(m_mgmsrv);
BaseString constr;
......@@ -567,8 +570,7 @@ CommandInterpreter::connect()
printf("Connected to Management Server at: %s:%d\n",
host, port);
}
}
}
DBUG_RETURN(m_connected);
}
......@@ -576,20 +578,18 @@ bool
CommandInterpreter::disconnect()
{
DBUG_ENTER("CommandInterpreter::disconnect");
if (m_event_thread) {
void *res;
do_event_thread= 0;
NdbThread_WaitFor(m_event_thread, &res);
NdbThread_Destroy(&m_event_thread);
m_event_thread= 0;
ndb_mgm_disconnect(m_mgmsrv2);
ndb_mgm_destroy_handle(&m_mgmsrv2);
}
if (m_connected)
{
if (ndb_mgm_disconnect(m_mgmsrv) == -1) {
ndbout_c("Could not disconnect from management server");
printError();
}
ndb_mgm_destroy_handle(&m_mgmsrv);
m_connected= false;
}
DBUG_RETURN(true);
......@@ -1066,28 +1066,39 @@ CommandInterpreter::executeShutdown(char* parameters)
ndbout << result << " NDB Cluster node(s) have shutdown." << endl;
int mgm_id= 0;
mgm_id= ndb_mgm_get_mgmd_nodeid(m_mgmsrv);
if (mgm_id == 0)
int nodeId= 0;
int this_mgmd= 0;
this_mgmd= ndb_mgm_get_mgmd_nodeid(m_mgmsrv);
while(get_next_nodeid(state, &nodeId, NDB_MGM_NODE_TYPE_MGM))
{
ndbout << "Unable to locate management server, "
<< "shutdown manually with <id> STOP"
<< endl;
return 1;
if(nodeId==this_mgmd)
continue;
ndbout << "Shutting down NDB Cluster management server nodeId="
<< nodeId << "...";
result = ndb_mgm_stop(m_mgmsrv, 1, &nodeId);
if (result <= 0) {
ndbout << " failed." << endl;
printError();
}
else
ndbout << "Done." << endl;
}
result = ndb_mgm_stop(m_mgmsrv, 1, &mgm_id);
ndbout << "Shutting down NDB Cluster management server nodeId="
<< this_mgmd << "...";
result= ndb_mgm_stop(m_mgmsrv, 1, &this_mgmd);
if (result <= 0) {
ndbout << "Shutdown of NDB Cluster management server failed." << endl;
ndbout << " failed." << endl;
printError();
if (result == 0)
return 1;
return result;
}
m_connected= false;
else
{
ndbout << "Done." << endl;
ndbout << "Disconnecting to allow management server to shutdown."
<< endl;
disconnect();
ndbout << "NDB Cluster management server shutdown." << endl;
}
ndbout << "NDB Cluster management servers shutdown." << endl;
return 0;
}
......@@ -1311,12 +1322,7 @@ CommandInterpreter::executeConnect(char* parameters)
{
disconnect();
if (!emptyString(parameters)) {
if (ndb_mgm_set_connectstring(m_mgmsrv,
BaseString(parameters).trim().c_str()))
{
printError();
return;
}
m_constr= BaseString(parameters).trim().c_str();
}
connect();
}
......@@ -1507,10 +1513,25 @@ CommandInterpreter::executeStop(Vector<BaseString> &command_list,
ndbout_c("NDB Cluster has shutdown.");
else
{
int mgm_id= 0;
int need_reconnect= 0;
mgm_id= ndb_mgm_get_mgmd_nodeid(m_mgmsrv);
ndbout << "Node";
for (int i= 0; i < no_of_nodes; i++)
{
if(node_ids[i] == mgm_id)
need_reconnect= 1;
else
ndbout << " " << node_ids[i];
}
ndbout_c(" has shutdown.");
if(need_reconnect)
{
ndbout << "You are connected to node " << mgm_id
<< ", disconnecting to allow it to shutdown"
<< endl;
disconnect();
}
}
}
}
......@@ -1640,9 +1661,16 @@ CommandInterpreter::executeRestart(Vector<BaseString> &command_list,
ndbout_c("NDB Cluster is being restarted.");
else
{
int mgm_id= 0;
mgm_id= ndb_mgm_get_mgmd_nodeid(m_mgmsrv);
ndbout << "Node";
for (int i= 0; i < no_of_nodes; i++)
{
if(node_ids[i] == mgm_id)
disconnect();
ndbout << " " << node_ids[i];
}
ndbout_c(" is being restarted");
}
}
......
......@@ -60,9 +60,6 @@
#include <SignalSender.hpp>
extern bool g_StopServer;
extern bool g_RestartServer;
//#define MGM_SRV_DEBUG
#ifdef MGM_SRV_DEBUG
#define DEBUG(x) do ndbout << x << endl; while(0)
......@@ -932,6 +929,13 @@ int MgmtSrvr::sendStopMgmd(NodeId nodeId,
* client connection to that mgmd and stop it that way.
* This allows us to stop mgm servers when there isn't any real
* distributed communication up.
*
* node_ids.size()==0 means to stop all DB nodes.
* MGM nodes will *NOT* be stopped.
*
* If we work out we should be stopping or restarting ourselves,
* we return <0 in stopSelf for restart, >0 for stop
* and 0 for do nothing.
*/
int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
......@@ -941,7 +945,8 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
bool stop,
bool restart,
bool nostart,
bool initialStart)
bool initialStart,
int* stopSelf)
{
int error = 0;
DBUG_ENTER("MgmtSrvr::sendSTOP_REQ");
......@@ -990,12 +995,13 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
NodeId nodeId= 0;
int use_master_node= 0;
int do_send= 0;
int do_stop_self= 0;
*stopSelf= 0;
NdbNodeBitmask nodes_to_stop;
{
for (unsigned i= 0; i < node_ids.size(); i++)
{
nodeId= node_ids[i];
ndbout << "asked to stop " << nodeId << endl;
if (getNodeType(nodeId) != NDB_MGM_NODE_TYPE_MGM)
nodes_to_stop.set(nodeId);
else if (nodeId != getOwnNodeId())
......@@ -1006,7 +1012,11 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
stoppedNodes.set(nodeId);
}
else
do_stop_self= 1;;
{
ndbout << "which is me" << endl;
*stopSelf= (restart)? -1 : 1;
stoppedNodes.set(nodeId);
}
}
}
int no_of_nodes_to_stop= nodes_to_stop.count();
......@@ -1039,14 +1049,6 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
nodes.set(nodeId);
}
}
nodeId= 0;
while(getNextNodeId(&nodeId, NDB_MGM_NODE_TYPE_MGM))
{
if(nodeId==getOwnNodeId())
continue;
if(sendStopMgmd(nodeId, abort, stop, restart, nostart, initialStart)==0)
stoppedNodes.set(nodeId);
}
}
// now wait for the replies
......@@ -1153,11 +1155,9 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
DBUG_RETURN(SEND_OR_RECEIVE_FAILED);
}
}
if (!error && do_stop_self)
if (error && *stopSelf)
{
if (restart)
g_RestartServer= true;
g_StopServer= true;
*stopSelf= 0;
}
DBUG_RETURN(error);
}
......@@ -1167,7 +1167,7 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
*/
int MgmtSrvr::stopNodes(const Vector<NodeId> &node_ids,
int *stopCount, bool abort)
int *stopCount, bool abort, int* stopSelf)
{
if (!abort)
{
......@@ -1189,20 +1189,25 @@ int MgmtSrvr::stopNodes(const Vector<NodeId> &node_ids,
false,
false,
false,
false);
false,
stopSelf);
if (stopCount)
*stopCount= nodes.count();
return ret;
}
/*
* Perform system shutdown
* Perform DB nodes shutdown.
* MGM servers are left in their current state
*/
int MgmtSrvr::stop(int * stopCount, bool abort)
int MgmtSrvr::shutdownDB(int * stopCount, bool abort)
{
NodeBitmask nodes;
Vector<NodeId> node_ids;
int tmp;
int ret = sendSTOP_REQ(node_ids,
nodes,
0,
......@@ -1210,7 +1215,8 @@ int MgmtSrvr::stop(int * stopCount, bool abort)
true,
false,
false,
false);
false,
&tmp);
if (stopCount)
*stopCount = nodes.count();
return ret;
......@@ -1235,6 +1241,7 @@ int MgmtSrvr::enterSingleUser(int * stopCount, Uint32 singleUserNodeId)
}
NodeBitmask nodes;
Vector<NodeId> node_ids;
int stopSelf;
int ret = sendSTOP_REQ(node_ids,
nodes,
singleUserNodeId,
......@@ -1242,7 +1249,8 @@ int MgmtSrvr::enterSingleUser(int * stopCount, Uint32 singleUserNodeId)
false,
false,
false,
false);
false,
&stopSelf);
if (stopCount)
*stopCount = nodes.count();
return ret;
......@@ -1254,7 +1262,8 @@ int MgmtSrvr::enterSingleUser(int * stopCount, Uint32 singleUserNodeId)
int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids,
int * stopCount, bool nostart,
bool initialStart, bool abort)
bool initialStart, bool abort,
int *stopSelf)
{
NodeBitmask nodes;
int ret= sendSTOP_REQ(node_ids,
......@@ -1264,21 +1273,24 @@ int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids,
false,
true,
nostart,
initialStart);
initialStart,
stopSelf);
if (stopCount)
*stopCount = nodes.count();
return ret;
}
/*
* Perform system restart
* Perform restart of all DB nodes
*/
int MgmtSrvr::restart(bool nostart, bool initialStart,
bool abort, int * stopCount )
int MgmtSrvr::restartDB(bool nostart, bool initialStart,
bool abort, int * stopCount)
{
NodeBitmask nodes;
Vector<NodeId> node_ids;
int tmp;
int ret = sendSTOP_REQ(node_ids,
nodes,
0,
......@@ -1286,7 +1298,8 @@ int MgmtSrvr::restart(bool nostart, bool initialStart,
true,
true,
true,
initialStart);
initialStart,
&tmp);
if (ret)
return ret;
......
......@@ -253,12 +253,13 @@ public:
* @param processId: Id of the DB process to stop
* @return 0 if succeeded, otherwise: as stated above, plus:
*/
int stopNodes(const Vector<NodeId> &node_ids, int *stopCount, bool abort);
int stopNodes(const Vector<NodeId> &node_ids, int *stopCount, bool abort,
int *stopSelf);
/**
* Stop the system
* shutdown the DB nodes
*/
int stop(int * cnt = 0, bool abort = false);
int shutdownDB(int * cnt = 0, bool abort = false);
/**
* print version info about a node
......@@ -292,12 +293,12 @@ public:
*/
int restartNodes(const Vector<NodeId> &node_ids,
int *stopCount, bool nostart,
bool initialStart, bool abort);
bool initialStart, bool abort, int *stopSelf);
/**
* Restart the system
* Restart all DB nodes
*/
int restart(bool nostart, bool initialStart,
int restartDB(bool nostart, bool initialStart,
bool abort = false,
int * stopCount = 0);
......@@ -503,7 +504,8 @@ private:
bool stop,
bool restart,
bool nostart,
bool initialStart);
bool initialStart,
int *stopSelf);
/**
* Check if it is possible to send a signal to a (DB) process
......
......@@ -35,6 +35,7 @@
#include <base64.h>
extern bool g_StopServer;
extern bool g_RestartServer;
extern EventLogger g_eventLogger;
static const unsigned int MAX_READ_TIMEOUT = 1000 ;
......@@ -267,6 +268,7 @@ MgmApiSession::MgmApiSession(class MgmtSrvr & mgm, NDB_SOCKET_TYPE sock)
m_output = new SocketOutputStream(sock);
m_parser = new Parser_t(commands, *m_input, true, true, true);
m_allocated_resources= new MgmtSrvr::Allocated_resources(m_mgmsrv);
m_stopSelf= 0;
DBUG_VOID_RETURN;
}
......@@ -286,6 +288,10 @@ MgmApiSession::~MgmApiSession()
NDB_CLOSE_SOCKET(m_socket);
m_socket= NDB_INVALID_SOCKET;
}
if(m_stopSelf < 0)
g_RestartServer= true;
if(m_stopSelf)
g_StopServer= true;
DBUG_VOID_RETURN;
}
......@@ -870,7 +876,8 @@ MgmApiSession::restart(Parser<MgmApiSession>::Context &,
&restarted,
nostart != 0,
initialstart != 0,
abort != 0);
abort != 0,
&m_stopSelf);
m_output->println("restart reply");
if(result != 0){
......@@ -894,7 +901,7 @@ MgmApiSession::restartAll(Parser<MgmApiSession>::Context &,
args.get("nostart", &nostart);
int count = 0;
int result = m_mgmsrv.restart(nostart, initialstart, abort, &count);
int result = m_mgmsrv.restartDB(nostart, initialstart, abort, &count);
m_output->println("restart reply");
if(result != 0)
......@@ -1013,7 +1020,7 @@ MgmApiSession::stop(Parser<MgmApiSession>::Context &,
int stopped= 0;
int result= 0;
if (nodes.size())
result= m_mgmsrv.stopNodes(nodes, &stopped, abort != 0);
result= m_mgmsrv.stopNodes(nodes, &stopped, abort != 0, &m_stopSelf);
m_output->println("stop reply");
if(result != 0)
......@@ -1032,7 +1039,7 @@ MgmApiSession::stopAll(Parser<MgmApiSession>::Context &,
Uint32 abort;
args.get("abort", &abort);
int result = m_mgmsrv.stop(&stopped, abort != 0);
int result = m_mgmsrv.shutdownDB(&stopped, abort != 0);
m_output->println("stop reply");
if(result != 0)
......
......@@ -40,6 +40,7 @@ private:
Parser_t *m_parser;
MgmtSrvr::Allocated_resources *m_allocated_resources;
char m_err_str[1024];
int m_stopSelf; // -1 is restart, 0 do nothing, 1 stop
void getConfig_common(Parser_t::Context &ctx,
const class Properties &args,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment