Commit 4c314f71 authored by unknown

Merge tulin@bk-internal.mysql.com:/home/bk/mysql-5.0

into  poseidon.mysql.com:/home/tomas/mysql-5.0-ndb
parents f72e6447 46d676a4
......@@ -107,6 +107,10 @@ public:
CmvmiDumpLongSignalMemory = 2601,
CmvmiSetRestartOnErrorInsert = 2602,
CmvmiTestLongSigWithDelay = 2603,
CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
to be able to debug if events
for some reason do not end up
in clusterlog */
// 7000 DIH
// 7001 DIH
// 7002 DIH
......
......@@ -16,6 +16,7 @@
#include <ndb_global.h>
#include "EventLogger.hpp"
#include <TransporterCallback.hpp>
#include <NdbConfig.h>
#include <kernel/BlockNumbers.h>
......@@ -528,10 +529,100 @@ void getTextUndoLogBlocked(QQQQ) {
theData[2]);
}
/**
 * Build the event-log text for a transporter error report.
 *
 * Reads the node id from theData[1] and the transporter error code from
 * theData[2], looks the code up in a static table of known TE_* errors and
 * writes "Transporter to node <id> reported error 0x<code>: <description>"
 * into m_text (at most m_text_len bytes, via BaseString::snprintf).
 * Codes that are not in the table are reported as "unknown error".
 *
 * NOTE(review): the parameters (m_text, m_text_len, theData) are presumably
 * declared by the QQQQ macro; its definition is not visible here.
 */
void getTextTransporterError(QQQQ) {
  struct myTransporterError {
    int errorNum;
    // Points at a string literal; no need to reserve a fixed-size buffer
    // per table entry.
    const char* errorString;
  };
  static const struct myTransporterError TransporterErrorString[] =
  {
    //TE_NO_ERROR = 0
    {TE_NO_ERROR,"No error"},
    //TE_ERROR_CLOSING_SOCKET = 0x1
    {TE_ERROR_CLOSING_SOCKET,"Error found during closing of socket"},
    //TE_ERROR_IN_SELECT_BEFORE_ACCEPT = 0x2
    {TE_ERROR_IN_SELECT_BEFORE_ACCEPT,"Error found before accept. The transporter will retry"},
    //TE_INVALID_MESSAGE_LENGTH = 0x3 | TE_DO_DISCONNECT
    {TE_INVALID_MESSAGE_LENGTH,"Error found in message (invalid message length)"},
    //TE_INVALID_CHECKSUM = 0x4 | TE_DO_DISCONNECT
    {TE_INVALID_CHECKSUM,"Error found in message (checksum)"},
    //TE_COULD_NOT_CREATE_SOCKET = 0x5
    {TE_COULD_NOT_CREATE_SOCKET,"Error found while creating socket(can't create socket)"},
    //TE_COULD_NOT_BIND_SOCKET = 0x6
    {TE_COULD_NOT_BIND_SOCKET,"Error found while binding server socket"},
    //TE_LISTEN_FAILED = 0x7
    {TE_LISTEN_FAILED,"Error found while listening to server socket"},
    //TE_ACCEPT_RETURN_ERROR = 0x8
    {TE_ACCEPT_RETURN_ERROR,"Error found during accept(accept return error)"},
    //TE_SHM_DISCONNECT = 0xb | TE_DO_DISCONNECT
    {TE_SHM_DISCONNECT,"The remote node has disconnected"},
    //TE_SHM_IPC_STAT = 0xc | TE_DO_DISCONNECT
    {TE_SHM_IPC_STAT,"Unable to check shm segment"},
    //TE_SHM_UNABLE_TO_CREATE_SEGMENT = 0xd
    {TE_SHM_UNABLE_TO_CREATE_SEGMENT,"Unable to create shm segment"},
    //TE_SHM_UNABLE_TO_ATTACH_SEGMENT = 0xe
    {TE_SHM_UNABLE_TO_ATTACH_SEGMENT,"Unable to attach shm segment"},
    //TE_SHM_UNABLE_TO_REMOVE_SEGMENT = 0xf
    {TE_SHM_UNABLE_TO_REMOVE_SEGMENT,"Unable to remove shm segment"},
    //TE_TOO_SMALL_SIGID = 0x10
    {TE_TOO_SMALL_SIGID,"Sig ID too small"},
    //TE_TOO_LARGE_SIGID = 0x11
    {TE_TOO_LARGE_SIGID,"Sig ID too large"},
    //TE_WAIT_STACK_FULL = 0x12 | TE_DO_DISCONNECT
    {TE_WAIT_STACK_FULL,"Wait stack was full"},
    //TE_RECEIVE_BUFFER_FULL = 0x13 | TE_DO_DISCONNECT
    {TE_RECEIVE_BUFFER_FULL,"Receive buffer was full"},
    //TE_SIGNAL_LOST_SEND_BUFFER_FULL = 0x14 | TE_DO_DISCONNECT
    {TE_SIGNAL_LOST_SEND_BUFFER_FULL,"Send buffer was full,and trying to force send fails"},
    //TE_SIGNAL_LOST = 0x15
    {TE_SIGNAL_LOST,"Send failed for unknown reason(signal lost)"},
    //TE_SEND_BUFFER_FULL = 0x16
    {TE_SEND_BUFFER_FULL,"The send buffer was full, but sleeping for a while solved"},
    //TE_SCI_LINK_ERROR = 0x0017
    {TE_SCI_LINK_ERROR,"There is no link from this node to the switch"},
    //TE_SCI_UNABLE_TO_START_SEQUENCE = 0x18 | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_START_SEQUENCE,"Could not start a sequence, because system resources are exhausted or no sequence has been created"},
    //TE_SCI_UNABLE_TO_REMOVE_SEQUENCE = 0x19 | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_REMOVE_SEQUENCE,"Could not remove a sequence"},
    //TE_SCI_UNABLE_TO_CREATE_SEQUENCE = 0x1a | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_CREATE_SEQUENCE,"Could not create a sequence, because system resources are exhausted. Must reboot"},
    //TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR = 0x1b | TE_DO_DISCONNECT
    {TE_SCI_UNRECOVERABLE_DATA_TFX_ERROR,"Tried to send data on redundant link but failed"},
    //TE_SCI_CANNOT_INIT_LOCALSEGMENT = 0x1c | TE_DO_DISCONNECT
    {TE_SCI_CANNOT_INIT_LOCALSEGMENT,"Cannot initialize local segment"},
    //TE_SCI_CANNOT_MAP_REMOTESEGMENT = 0x1d | TE_DO_DISCONNECT
    {TE_SCI_CANNOT_MAP_REMOTESEGMENT,"Cannot map remote segment"},
    //TE_SCI_UNABLE_TO_UNMAP_SEGMENT = 0x1e | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_UNMAP_SEGMENT,"Cannot free the resources used by this segment (step 1)"},
    //TE_SCI_UNABLE_TO_REMOVE_SEGMENT = 0x1f | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_REMOVE_SEGMENT,"Cannot free the resources used by this segment (step 2)"},
    //TE_SCI_UNABLE_TO_DISCONNECT_SEGMENT = 0x20 | TE_DO_DISCONNECT
    {TE_SCI_UNABLE_TO_DISCONNECT_SEGMENT,"Cannot disconnect from a remote segment"},
    //TE_SHM_IPC_PERMANENT = 0x21
    {TE_SHM_IPC_PERMANENT,"Shm ipc Permanent error"},
    //TE_SCI_UNABLE_TO_CLOSE_CHANNEL = 0x22
    {TE_SCI_UNABLE_TO_CLOSE_CHANNEL,"Unable to close the sci channel and the resources allocated"}
  };
  const int length =
    sizeof(TransporterErrorString)/sizeof(TransporterErrorString[0]);

  // Default text for codes that have no entry in the table above.
  const char* description = "unknown error";
  for(int i = 0; i < length; i++)
  {
    if(theData[2] == TransporterErrorString[i].errorNum)
    {
      description = TransporterErrorString[i].errorString;
      break;
    }
  }

  // m_text is written exactly once, so no earlier formatting of the buffer
  // is needed before the lookup.
  BaseString::snprintf(m_text, m_text_len,
                       "Transporter to node %d reported error 0x%x: %s",
                       theData[1],
                       theData[2],
                       description);
}
void getTextTransporterWarning(QQQQ) {
getTextTransporterError(m_text, m_text_len, theData);
......
......@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
case TimeToWaitAlive:
// QMGR
case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt ocks
case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
case HeartbeatIntervalDbApi:
case ArbitTimeout:
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
......@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
}
}
// Dump request CmvmiDumpSubscriptions: write every entry of the
// `subscribers` list, together with its log level for each event category,
// to g_eventLogger at info level.
if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
{
SubscriberPtr ptr;
subscribers.first(ptr);
g_eventLogger.info("List subscriptions:");
// Walk the list until the iterator index becomes RNIL.
while(ptr.i != RNIL)
{
// One header line per subscriber: list index, node id derived from the
// subscriber's block reference, and the raw block reference.
g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
// Then one line per category index in [0, LOGLEVEL_CATEGORIES).
for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
{
Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
g_eventLogger.info("Category %u Level %u", i, level);
}
subscribers.next(ptr);
}
}
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
g_sectionSegmentPool.getSize(),
......
This diff is collapsed.
......@@ -22,7 +22,10 @@
#include <NdbOut.hpp>
#include <NdbSleep.h>
#include <ErrorHandlingMacros.hpp>
#include <EventLogger.hpp>
extern EventLogger g_eventLogger;
extern "C"
void*
runWatchDog(void* w){
......@@ -125,7 +128,7 @@ WatchDog::run(){
last_stuck_action = "Unknown place";
break;
}//switch
ndbout << "Ndb kernel is stuck in: " << last_stuck_action << endl;
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
if(alerts == 3){
shutdownSystem(last_stuck_action);
}
......
......@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
return WRONG_PROCESS_TYPE;
// Check if we have contact with it
if(unCond){
if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
return 0;
}
else if (theFacade->get_node_alive(nodeId) == true)
......@@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId,
}
int
MgmtSrvr::setEventReportingLevelImpl(int nodeId,
MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
const EventSubscribeReq& ll)
{
SignalSender ss(theFacade);
ss.lock();
SimpleSignal ssig;
EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
EventSubscribeReq::SignalLength);
*dst = ll;
NodeBitmask nodes;
NdbNodeBitmask nodes;
int retries = 30;
nodes.clear();
Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
for(; (Uint32) nodeId <= max; nodeId++)
while (1)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (okToSendTo(nodeId, true))
continue;
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
Uint32 nodeId, max;
ss.lock();
SimpleSignal ssig;
EventSubscribeReq * dst =
CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
EventSubscribeReq::SignalLength);
*dst = ll;
if (nodeId_arg == 0)
{
nodes.set(nodeId);
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// first make sure nodes are sendable
for(; nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (okToSendTo(nodeId, true))
{
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
{
// node not connected we can safely skip this one
continue;
}
// api_reg_conf not received yet, need to retry
break;
}
}
if (nodeId <= max)
{
if (--retries)
{
ss.unlock();
NdbSleep_MilliSleep(100);
continue;
}
return SEND_OR_RECEIVE_FAILED;
}
if (nodeId_arg == 0)
{
// all nodes
nodeId = 1;
max = MAX_NDB_NODES;
}
else
{
// only one node
max = nodeId = nodeId_arg;
}
// now send to all sendable nodes
// note, lock is held, so states have not changed
for(; (Uint32) nodeId <= max; nodeId++)
{
if (nodeTypes[nodeId] != NODE_TYPE_DB)
continue;
if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
continue; // node is not connected, skip
if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
nodes.set(nodeId);
}
break;
}
if (nodes.isclear())
......@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
int error = 0;
while (!nodes.isclear())
{
Uint32 nodeId;
SimpleSignal *signal = ss.waitFor();
int gsn = signal->readSignalNumber();
nodeId = refToNode(signal->header.theSendersBlockRef);
......
......@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
ClusterMgr::Node::Node()
: m_state(NodeState::SL_NOTHING) {
compatible = nfCompleteRep = true;
connected = defined = m_alive = false;
connected = defined = m_alive = m_api_reg_conf = false;
m_state.m_connected_nodes.clear();
}
......@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
node.m_info.m_version);
}
node.m_api_reg_conf = true;
node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
......@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
noOfConnectedNodes--;
theNodes[nodeId].connected = false;
theNodes[nodeId].m_api_reg_conf = false;
theNodes[nodeId].m_state.m_connected_nodes.clear();
reportNodeFailed(nodeId, true);
......
......@@ -65,6 +65,7 @@ public:
bool compatible; // Version is compatible
bool nfCompleteRep; // NF Complete Rep has arrived
bool m_alive; // Node is alive
bool m_api_reg_conf;// API_REGCONF has arrived
NodeInfo m_info;
NodeState m_state;
......
......@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
SendStatus
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
s->readSignalNumber() == GSN_API_REGREQ);
return theFacade->theTransporterRegistry->prepareSend(&s->header,
1, // JBB
&s->theData[0],
......
......@@ -32,7 +32,7 @@ public:
Uint32 theData[25];
LinearSectionPtr ptr[3];
int readSignalNumber() {return header.theVerId_signalNumber; }
int readSignalNumber() const {return header.theVerId_signalNumber; }
Uint32 *getDataPtrSend() { return theData; }
const Uint32 *getDataPtr() const { return theData; }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment