Commit 1182b801, authored by unknown

Bug #28899 not possible to set separate watchdog timeout at startup


storage/ndb/include/mgmapi/mgmapi_config_parameters.h:
  add new configuration parameter TimeBetweenWatchDogCheckInitial
storage/ndb/include/portlib/NdbTick.h:
  enable timing code
storage/ndb/src/common/portlib/NdbTick.c:
  enable timing code
storage/ndb/src/kernel/blocks/ndbcntr/NdbcntrMain.cpp:
  read watchdog timeout to set it after malloc
storage/ndb/src/kernel/vm/Configuration.cpp:
  read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/vm/Configuration.hpp:
  read initial watchdog timeout and set it in the beginning
storage/ndb/src/kernel/vm/SimulatedBlock.cpp:
  introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/SimulatedBlock.hpp:
  introduce new state for "action" malloc of memory
storage/ndb/src/kernel/vm/WatchDog.cpp:
  rewrite watchdog to check every 100 ms for being stuck, but keep shutdown after 3 * interval;
  for "action" == 9 (malloc), keep the old behavior and only output once per interval
storage/ndb/src/mgmsrv/ConfigInfo.cpp:
  add new configuration parameter TimeBetweenWatchDogCheckInitial
parent 1a166bc4
...@@ -81,6 +81,8 @@ ...@@ -81,6 +81,8 @@
#define CFG_DB_BACKUP_WRITE_SIZE 136 #define CFG_DB_BACKUP_WRITE_SIZE 136
#define CFG_DB_BACKUP_MAX_WRITE_SIZE 139 #define CFG_DB_BACKUP_MAX_WRITE_SIZE 139
#define CFG_DB_WATCHDOG_INTERVAL_INITIAL 141
#define CFG_LOG_DESTINATION 147 #define CFG_LOG_DESTINATION 147
#define CFG_DB_DISCLESS 148 #define CFG_DB_DISCLESS 148
......
...@@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(void); ...@@ -37,9 +37,6 @@ NDB_TICKS NdbTick_CurrentMillisecond(void);
*/ */
int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros); int NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros);
/*#define TIME_MEASUREMENT*/
#ifdef TIME_MEASUREMENT
struct MicroSecondTimer { struct MicroSecondTimer {
NDB_TICKS seconds; NDB_TICKS seconds;
NDB_TICKS micro_seconds; NDB_TICKS micro_seconds;
...@@ -54,7 +51,6 @@ struct MicroSecondTimer { ...@@ -54,7 +51,6 @@ struct MicroSecondTimer {
NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start, NDB_TICKS NdbTick_getMicrosPassed(struct MicroSecondTimer start,
struct MicroSecondTimer stop); struct MicroSecondTimer stop);
int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now); int NdbTick_getMicroTimer(struct MicroSecondTimer* time_now);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#include <ndb_global.h> #include <ndb_global.h>
#include "NdbTick.h" #include <NdbTick.h>
#define NANOSEC_PER_SEC 1000000000 #define NANOSEC_PER_SEC 1000000000
#define MICROSEC_PER_SEC 1000000 #define MICROSEC_PER_SEC 1000000
...@@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros){ ...@@ -71,7 +71,6 @@ NdbTick_CurrentMicrosecond(NDB_TICKS * secs, Uint32 * micros){
} }
#endif #endif
#ifdef TIME_MEASUREMENT
int int
NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer) NdbTick_getMicroTimer(struct MicroSecondTimer* input_timer)
{ {
...@@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSecondTimer start, ...@@ -102,4 +101,3 @@ NdbTick_getMicrosPassed(struct MicroSecondTimer start,
} }
return ret_value; return ret_value;
} }
#endif
...@@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal) ...@@ -277,6 +277,14 @@ void Ndbcntr::execSTTOR(Signal* signal)
break; break;
case ZSTART_PHASE_1: case ZSTART_PHASE_1:
jam(); jam();
{
Uint32 db_watchdog_interval = 0;
const ndb_mgm_configuration_iterator * p =
m_ctx.m_config.getOwnConfigIterator();
ndb_mgm_get_int_parameter(p, CFG_DB_WATCHDOG_INTERVAL, &db_watchdog_interval);
ndbrequire(db_watchdog_interval);
update_watch_dog_timer(db_watchdog_interval);
}
startPhase1Lab(signal); startPhase1Lab(signal);
break; break;
case ZSTART_PHASE_2: case ZSTART_PHASE_2:
......
...@@ -443,6 +443,11 @@ Configuration::setupConfiguration(){ ...@@ -443,6 +443,11 @@ Configuration::setupConfiguration(){
"TimeBetweenWatchDogCheck missing"); "TimeBetweenWatchDogCheck missing");
} }
if(iter.get(CFG_DB_WATCHDOG_INTERVAL_INITIAL, &_timeBetweenWatchDogCheckInitial)){
ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, "Invalid configuration fetched",
"TimeBetweenWatchDogCheckInitial missing");
}
/** /**
* Get paths * Get paths
*/ */
...@@ -462,9 +467,12 @@ Configuration::setupConfiguration(){ ...@@ -462,9 +467,12 @@ Configuration::setupConfiguration(){
* Create the watch dog thread * Create the watch dog thread
*/ */
{ {
Uint32 t = _timeBetweenWatchDogCheck; if (_timeBetweenWatchDogCheckInitial < _timeBetweenWatchDogCheck)
_timeBetweenWatchDogCheckInitial = _timeBetweenWatchDogCheck;
Uint32 t = _timeBetweenWatchDogCheckInitial;
t = globalEmulatorData.theWatchDog ->setCheckInterval(t); t = globalEmulatorData.theWatchDog ->setCheckInterval(t);
_timeBetweenWatchDogCheck = t; _timeBetweenWatchDogCheckInitial = t;
} }
ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config); ConfigValues* cf = ConfigValuesFactory::extractCurrentSection(iter.m_config);
......
...@@ -84,6 +84,7 @@ private: ...@@ -84,6 +84,7 @@ private:
Uint32 _maxErrorLogs; Uint32 _maxErrorLogs;
Uint32 _lockPagesInMainMemory; Uint32 _lockPagesInMainMemory;
Uint32 _timeBetweenWatchDogCheck; Uint32 _timeBetweenWatchDogCheck;
Uint32 _timeBetweenWatchDogCheckInitial;
ndb_mgm_configuration * m_ownConfig; ndb_mgm_configuration * m_ownConfig;
ndb_mgm_configuration * m_clusterConfig; ndb_mgm_configuration * m_clusterConfig;
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <NdbOut.hpp> #include <NdbOut.hpp>
#include <GlobalData.hpp> #include <GlobalData.hpp>
#include <Emulator.hpp> #include <Emulator.hpp>
#include <WatchDog.hpp>
#include <ErrorHandlingMacros.hpp> #include <ErrorHandlingMacros.hpp>
#include <TimeQueue.hpp> #include <TimeQueue.hpp>
#include <TransporterRegistry.hpp> #include <TransporterRegistry.hpp>
...@@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U ...@@ -662,7 +663,7 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
void * p = NULL; void * p = NULL;
size_t size = n*s; size_t size = n*s;
Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s); Uint64 real_size = (Uint64)((Uint64)n)*((Uint64)s);
refresh_watch_dog(); refresh_watch_dog(9);
if (real_size > 0){ if (real_size > 0){
#ifdef VM_TRACE_MEM #ifdef VM_TRACE_MEM
ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes", ndbout_c("%s::allocRecord(%s, %u, %u) = %llu bytes",
...@@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U ...@@ -696,12 +697,12 @@ SimulatedBlock::allocRecord(const char * type, size_t s, size_t n, bool clear, U
char * ptr = (char*)p; char * ptr = (char*)p;
const Uint32 chunk = 128 * 1024; const Uint32 chunk = 128 * 1024;
while(size > chunk){ while(size > chunk){
refresh_watch_dog(); refresh_watch_dog(9);
memset(ptr, 0, chunk); memset(ptr, 0, chunk);
ptr += chunk; ptr += chunk;
size -= chunk; size -= chunk;
} }
refresh_watch_dog(); refresh_watch_dog(9);
memset(ptr, 0, size); memset(ptr, 0, size);
} }
} }
...@@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** ptr, ...@@ -720,9 +721,16 @@ SimulatedBlock::deallocRecord(void ** ptr,
} }
void void
SimulatedBlock::refresh_watch_dog() SimulatedBlock::refresh_watch_dog(Uint32 place)
{ {
globalData.incrementWatchDogCounter(1); globalData.incrementWatchDogCounter(place);
}
void
SimulatedBlock::update_watch_dog_timer(Uint32 interval)
{
extern EmulatorData globalEmulatorData;
globalEmulatorData.theWatchDog->setCheckInterval(interval);
} }
void void
......
...@@ -334,7 +334,8 @@ protected: ...@@ -334,7 +334,8 @@ protected:
* Refresh Watch Dog in initialising code * Refresh Watch Dog in initialising code
* *
*/ */
void refresh_watch_dog(); void refresh_watch_dog(Uint32 place = 1);
void update_watch_dog_timer(Uint32 interval);
/** /**
* Prog error * Prog error
......
...@@ -25,6 +25,8 @@ ...@@ -25,6 +25,8 @@
#include <ErrorHandlingMacros.hpp> #include <ErrorHandlingMacros.hpp>
#include <EventLogger.hpp> #include <EventLogger.hpp>
#include <NdbTick.h>
extern EventLogger g_eventLogger; extern EventLogger g_eventLogger;
extern "C" extern "C"
...@@ -72,76 +74,118 @@ WatchDog::doStop(){ ...@@ -72,76 +74,118 @@ WatchDog::doStop(){
} }
} }
void const char *get_action(Uint32 IPValue)
WatchDog::run(){ {
unsigned int anIPValue; const char *action;
unsigned int alerts = 0; switch (IPValue) {
unsigned int oldIPValue = 0;
// WatchDog for the single threaded NDB
while(!theStop){
Uint32 tmp = theInterval / 500;
tmp= (tmp ? tmp : 1);
while(!theStop && tmp > 0){
NdbSleep_MilliSleep(500);
tmp--;
}
if(theStop)
break;
// Verify that the IP thread is not stuck in a loop
anIPValue = *theIPValue;
if(anIPValue != 0) {
oldIPValue = anIPValue;
globalData.incrementWatchDogCounter(0);
alerts = 0;
} else {
const char *last_stuck_action;
alerts++;
switch (oldIPValue) {
case 1: case 1:
last_stuck_action = "Job Handling"; action = "Job Handling";
break; break;
case 2: case 2:
last_stuck_action = "Scanning Timers"; action = "Scanning Timers";
break; break;
case 3: case 3:
last_stuck_action = "External I/O"; action = "External I/O";
break; break;
case 4: case 4:
last_stuck_action = "Print Job Buffers at crash"; action = "Print Job Buffers at crash";
break; break;
case 5: case 5:
last_stuck_action = "Checking connections"; action = "Checking connections";
break; break;
case 6: case 6:
last_stuck_action = "Performing Send"; action = "Performing Send";
break; break;
case 7: case 7:
last_stuck_action = "Polling for Receive"; action = "Polling for Receive";
break; break;
case 8: case 8:
last_stuck_action = "Performing Receive"; action = "Performing Receive";
break;
case 9:
action = "Allocating memory";
break; break;
default: default:
last_stuck_action = "Unknown place"; action = "Unknown place";
break; break;
}//switch }//switch
return action;
}
void
WatchDog::run()
{
unsigned int anIPValue, sleep_time;
unsigned int oldIPValue = 0;
unsigned int theIntervalCheck = theInterval;
struct MicroSecondTimer start_time, last_time, now;
NdbTick_getMicroTimer(&start_time);
last_time = start_time;
// WatchDog for the single threaded NDB
while (!theStop)
{
sleep_time= 100;
NdbSleep_MilliSleep(sleep_time);
if(theStop)
break;
NdbTick_getMicroTimer(&now);
if (NdbTick_getMicrosPassed(last_time, now)/1000 > sleep_time*2)
{
struct tms my_tms;
times(&my_tms);
g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
(Uint64)my_tms.tms_utime,
(Uint64)my_tms.tms_stime);
g_eventLogger.warning("Watchdog: Warning overslept %u ms, expected %u ms.",
NdbTick_getMicrosPassed(last_time, now)/1000,
sleep_time);
}
last_time = now;
// Verify that the IP thread is not stuck in a loop
anIPValue = *theIPValue;
if (anIPValue != 0)
{
oldIPValue = anIPValue;
globalData.incrementWatchDogCounter(0);
NdbTick_getMicroTimer(&start_time);
theIntervalCheck = theInterval;
}
else
{
int warn = 1;
Uint32 elapsed = NdbTick_getMicrosPassed(start_time, now)/1000;
/*
oldIPValue == 9 indicates malloc going on, this can take some time
so only warn if we pass the watchdog interval
*/
if (oldIPValue == 9)
if (elapsed < theIntervalCheck)
warn = 0;
else
theIntervalCheck += theInterval;
if (warn)
{
const char *last_stuck_action = get_action(oldIPValue);
g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action); g_eventLogger.warning("Ndb kernel is stuck in: %s", last_stuck_action);
{ {
struct tms my_tms; struct tms my_tms;
times(&my_tms); times(&my_tms);
g_eventLogger.info("User time: %llu System time: %llu", g_eventLogger.info("Watchdog: User time: %llu System time: %llu",
(Uint64)my_tms.tms_utime, (Uint64)my_tms.tms_utime,
(Uint64)my_tms.tms_stime); (Uint64)my_tms.tms_stime);
} }
if(alerts == 3){ if (elapsed > 3 * theInterval)
{
shutdownSystem(last_stuck_action); shutdownSystem(last_stuck_action);
} }
} }
} }
}
return; return;
} }
......
...@@ -571,6 +571,18 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = { ...@@ -571,6 +571,18 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
"70", "70",
STR_VALUE(MAX_INT_RNIL) }, STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_WATCHDOG_INTERVAL_INITIAL,
"TimeBetweenWatchDogCheckInitial",
DB_TOKEN,
"Time between execution checks inside a database node in the early start phases when memory is allocated",
ConfigInfo::CI_USED,
true,
ConfigInfo::CI_INT,
"6000",
"70",
STR_VALUE(MAX_INT_RNIL) },
{ {
CFG_DB_STOP_ON_ERROR, CFG_DB_STOP_ON_ERROR,
"StopOnError", "StopOnError",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment