guardian.cc 11.1 KB
Newer Older
1 2 3 4
/* Copyright (C) 2004 MySQL AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
unknown's avatar
unknown committed
5
   the Free Software Foundation; version 2 of the License.
6 7 8 9 10 11 12 13 14 15 16

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */


17
#if defined(__GNUC__) && defined(USE_PRAGMA_IMPLEMENTATION)
18 19 20 21 22
#pragma implementation
#endif

#include "guardian.h"
#include <string.h>
23 24 25
#include <sys/types.h>
#include <signal.h>

26 27 28 29
#include "instance.h"
#include "instance_map.h"
#include "log.h"
#include "mysql_manager_error.h"
unknown's avatar
unknown committed
30
#include "options.h"
31

32

unknown's avatar
unknown committed
33 34 35
/*************************************************************************
 {{{ Constructor & destructor.
*************************************************************************/
36

unknown's avatar
unknown committed
37 38
/**
  Guardian constructor.
39

unknown's avatar
unknown committed
40 41 42 43
  SYNOPSIS
    Guardian()
    thread_registry_arg
    instance_map_arg
44

unknown's avatar
unknown committed
45 46 47 48
  DESCRIPTION
    Nominal contructor intended for assigning references and initialize
    trivial objects. Real initialization is made by init() method.
*/
49

50
Guardian::Guardian(Thread_registry *thread_registry_arg,
unknown's avatar
unknown committed
51 52 53
                   Instance_map *instance_map_arg)
  :shutdown_requested(FALSE),
  stopped(FALSE),
54
  thread_registry(thread_registry_arg),
unknown's avatar
unknown committed
55
  instance_map(instance_map_arg)
56 57
{
  pthread_mutex_init(&LOCK_guardian, 0);
58
  pthread_cond_init(&COND_guardian, 0);
59 60 61
}


62
/**
  Guardian destructor.

  DESCRIPTION
    Destroys the Guardian mutex and condition variable, after making sure
    nobody holds the mutex any more.
*/

Guardian::~Guardian()
{
  /*
    NOTE: the Guardian thread is detached, so we have no control over it
    and it may still be alive and holding the mutex. Acquire and release
    the mutex once to wait for it before the mutex is destroyed.
  */
  pthread_mutex_lock(&LOCK_guardian);
  pthread_mutex_unlock(&LOCK_guardian);

  pthread_mutex_destroy(&LOCK_guardian);
  pthread_cond_destroy(&COND_guardian);
}

unknown's avatar
unknown committed
77 78 79
/*************************************************************************
  }}}
*************************************************************************/
80

81

unknown's avatar
unknown committed
82 83 84 85 86 87 88
/**
  Send request to stop Guardian.

  SYNOPSIS
    request_shutdown()
*/

89
void Guardian::request_shutdown()
90
{
unknown's avatar
unknown committed
91
  stop_instances();
unknown's avatar
unknown committed
92 93

  lock();
94
  shutdown_requested= TRUE;
unknown's avatar
unknown committed
95 96 97
  unlock();

  ping();
98 99 100
}


unknown's avatar
unknown committed
101 102 103 104 105 106 107 108 109 110 111 112 113
/**
  Process an instance.

  SYNOPSIS
    process_instance()
    instance  a pointer to the instance for processing

  DESCRIPTION
    Performs one monitoring step for the instance, depending on its state:
      - STOPPING: mark the instance STOPPED once it has stopped; if the
        shutdown delay has expired, send SIGKILL and mark it STOPPED;
        otherwise just report the remaining time;
      - mysqld is running: move the instance to STARTED (while STARTING,
        only after the pid file can be loaded);
      - mysqld is not running: start / restart it according to the state
        (NOT_STARTED, STARTED, STARTING, JUST_CRASHED, CRASHED,
        CRASHED_AND_ABANDONED).

  MT-NOTE:
    - the given instance must be locked before calling this operation;
    - Guardian must be locked before calling this operation.
*/

void Guardian::process_instance(Instance *instance)
{
  const int max_restart_attempts= 100;
  const time_t now= time(NULL);

  if (instance->get_state() == Instance::STOPPING)
  {
    /* This branch is executed during shutdown. */

    /* is_crashed() is TRUE if and only if the instance stopped for sure. */
    if (instance->is_crashed())
    {
      log_info("Guardian: '%s' stopped.",
               (const char *) instance->get_name()->str);

      instance->set_state(Instance::STOPPED);
      return;
    }

    if ((uint) (now - instance->last_checked) <
        instance->options.get_shutdown_delay())
    {
      /* Still within the shutdown delay: keep waiting. */
      log_info("Guardian: waiting for '%s' to stop (%d secs left).",
               (const char *) instance->get_name()->str,
               (int) (instance->options.get_shutdown_delay() -
                      now + instance->last_checked));
      return;
    }

    /* The shutdown delay has expired: kill the server. */
    log_info("Guardian: '%s' hasn't stopped within %d secs.",
             (const char *) instance->get_name()->str,
             (int) instance->options.get_shutdown_delay());

    instance->kill_mysqld(SIGKILL);

    /* The result of the kill is not checked, hence "pretend". */
    log_info("Guardian: pretend that '%s' is killed.",
             (const char *) instance->get_name()->str);

    instance->set_state(Instance::STOPPED);
    return;
  }

  if (instance->is_mysqld_running())
  {
    /* The instance can be contacted on its port. */

    /* If STARTING, also check that the pid file has been created. */
    if (instance->get_state() == Instance::STARTING &&
        instance->options.load_pid() == 0)
      return; /* No pid file yet, don't go to STARTED state yet. */

    if (instance->get_state() == Instance::STARTED)
      return; /* Already known to be running, nothing to update. */

    /* Clear status fields and record that the instance is up. */
    log_info("Guardian: '%s' is running, set state to STARTED.",
             (const char *) instance->options.instance_name.str);
    instance->reset_stat();
    instance->set_state(Instance::STARTED);
    return;
  }

  /* mysqld is not running: decide what to do by the current state. */

  switch (instance->get_state()) {
  case Instance::NOT_STARTED:
    log_info("Guardian: starting '%s'...",
             (const char *) instance->options.instance_name.str);

    /* NOTE: set state to STARTING _before_ start() is called. */
    instance->set_state(Instance::STARTING);
    instance->last_checked= now;

    instance->start_mysqld();
    return;

  case Instance::STARTED:     /* fallthrough */
  case Instance::STARTING:    /* let the instance start or crash */
    if (!instance->is_crashed())
      return;

    instance->crash_moment= now;
    instance->last_checked= now;
    instance->set_state(Instance::JUST_CRASHED);
    /* fallthrough -- restart an instance immediately */

  case Instance::JUST_CRASHED:
    if (now - instance->crash_moment > 2)
    {
      /* More than 2 seconds since the crash: switch to regular restarts. */
      instance->set_state(Instance::CRASHED);
      return;
    }

    if (instance->is_crashed())
    {
      instance->start_mysqld();
      log_info("Guardian: starting '%s'...",
               (const char *) instance->options.instance_name.str);
    }
    return;

  case Instance::CRASHED:    /* just regular restarts */
    /* At most one restart attempt per monitoring interval. */
    if ((ulong) (now - instance->last_checked) <=
        (ulong) Options::Main::monitoring_interval)
      return;

    if (instance->restart_counter >= max_restart_attempts)
    {
      log_info("Guardian: can not start '%s'. "
               "Abandoning attempts to (re)start it",
               (const char *) instance->options.instance_name.str);

      instance->set_state(Instance::CRASHED_AND_ABANDONED);
      return;
    }

    if (instance->is_crashed())
    {
      instance->start_mysqld();
      instance->last_checked= now;

      log_info("Guardian: restarting '%s'...",
               (const char *) instance->options.instance_name.str);
    }
    return;

  case Instance::CRASHED_AND_ABANDONED:
    return; /* do nothing */

  default:
    /* No other state is expected here. */
    DBUG_ASSERT(0);
  }
}


unknown's avatar
unknown committed
251
/**
252
  Main function of Guardian thread.
253

254
  SYNOPSIS
255 256 257
    run()

  DESCRIPTION
unknown's avatar
unknown committed
258
    Check for all guarded instances and restart them if needed.
259 260
*/

261
void Guardian::run()
262
{
263
  struct timespec timeout;
264

265 266
  log_info("Guardian: started.");

267
  thread_registry->register_thread(&thread_info);
268

unknown's avatar
unknown committed
269
  /* Loop, until all instances were shut down at the end. */
270

unknown's avatar
unknown committed
271
  while (true)
272
  {
unknown's avatar
unknown committed
273 274 275 276 277
    Instance_map::Iterator instances_it(instance_map);
    Instance *instance;
    bool all_instances_stopped= TRUE;

    instance_map->lock();
278

unknown's avatar
unknown committed
279
    while ((instance= instances_it.next()))
280
    {
unknown's avatar
unknown committed
281
      instance->lock();
282

unknown's avatar
unknown committed
283 284 285 286 287 288 289 290 291 292 293 294 295
      if (!instance->is_guarded() ||
          instance->get_state() == Instance::STOPPED)
      {
        instance->unlock();
        continue;
      }

      process_instance(instance);

      if (instance->get_state() != Instance::STOPPED)
        all_instances_stopped= FALSE;

      instance->unlock();
296
    }
unknown's avatar
unknown committed
297 298 299 300 301 302 303 304 305 306 307 308 309 310

    instance_map->unlock();

    lock();

    if (shutdown_requested && all_instances_stopped)
    {
      log_info("Guardian: all guarded mysqlds stopped.");

      stopped= TRUE;
      unlock();
      break;
    }

311
    set_timespec(timeout, Options::Main::monitoring_interval);
312

unknown's avatar
unknown committed
313 314 315
    thread_registry->cond_timedwait(&thread_info, &COND_guardian,
                                    &LOCK_guardian, &timeout);
    unlock();
316 317
  }

318 319
  log_info("Guardian: stopped.");

unknown's avatar
unknown committed
320 321
  /* Now, when the Guardian is stopped we can stop the IM. */

322 323
  thread_registry->unregister_thread(&thread_info);
  thread_registry->request_shutdown();
324 325

  log_info("Guardian: finished.");
326 327 328
}


unknown's avatar
unknown committed
329 330 331 332 333
/**
  Return the value of stopped flag.
*/

bool Guardian::is_stopped()
334 335
{
  int var;
unknown's avatar
unknown committed
336 337

  lock();
338
  var= stopped;
unknown's avatar
unknown committed
339 340
  unlock();

341 342 343 344
  return var;
}


unknown's avatar
unknown committed
345 346
/**
  Wake up Guardian thread.
347

unknown's avatar
unknown committed
348 349 350 351 352
  MT-NOTE: though usually the mutex associated with condition variable should
  be acquired before signalling the variable, here this is not needed.
  Signalling under locked mutex is used to avoid lost signals. In the current
  logic however locking mutex does not guarantee that the signal will not be
  lost.
353 354
*/

unknown's avatar
unknown committed
355
void Guardian::ping()
unknown's avatar
unknown committed
356
{
unknown's avatar
unknown committed
357
  pthread_cond_signal(&COND_guardian);
unknown's avatar
unknown committed
358 359 360
}


unknown's avatar
unknown committed
361 362
/**
  Prepare list of instances.
363

364
  SYNOPSIS
unknown's avatar
unknown committed
365
    init()
366

unknown's avatar
unknown committed
367
  MT-NOTE: Instance Map must be locked before calling the operation.
368 369
*/

unknown's avatar
unknown committed
370
void Guardian::init()
371
{
unknown's avatar
unknown committed
372 373
  Instance *instance;
  Instance_map::Iterator iterator(instance_map);
374

unknown's avatar
unknown committed
375 376 377
  while ((instance= iterator.next()))
  {
    instance->lock();
378

unknown's avatar
unknown committed
379 380
    instance->reset_stat();
    instance->set_state(Instance::NOT_STARTED);
381

unknown's avatar
unknown committed
382 383
    instance->unlock();
  }
384 385
}

unknown's avatar
unknown committed
386 387

/**
unknown's avatar
unknown committed
388 389
  An internal method which is called at shutdown to unregister instances and
  attempt to stop them if requested.
390

391
  SYNOPSIS
392 393 394 395
    stop_instances()

  DESCRIPTION
    Loops through the guarded_instances list and prepares them for shutdown.
unknown's avatar
unknown committed
396 397
    For each instance we issue a stop command and change the state
    accordingly.
unknown's avatar
unknown committed
398 399

  NOTE
unknown's avatar
unknown committed
400
    Guardian object should be locked by the caller.
401 402 403

*/

unknown's avatar
unknown committed
404
void Guardian::stop_instances()
405
{
406 407
  static const int NUM_STOP_ATTEMPTS = 100;

unknown's avatar
unknown committed
408 409 410 411 412 413
  Instance_map::Iterator instances_it(instance_map);
  Instance *instance;

  instance_map->lock();

  while ((instance= instances_it.next()))
414
  {
unknown's avatar
unknown committed
415 416 417 418 419 420 421 422 423
    instance->lock();

    if (!instance->is_guarded() ||
        instance->get_state() == Instance::STOPPED)
    {
      instance->unlock();
      continue;
    }

unknown's avatar
unknown committed
424 425 426 427
    /*
      If instance is running or was running (and now probably hanging),
      request stop.
    */
unknown's avatar
unknown committed
428 429 430

    if (instance->is_mysqld_running() ||
        instance->get_state() == Instance::STARTED)
431
    {
unknown's avatar
unknown committed
432 433
      instance->set_state(Instance::STOPPING);
      instance->last_checked= time(NULL);
434 435
    }
    else
unknown's avatar
unknown committed
436 437 438 439 440 441 442
    {
      /* Otherwise mark it as STOPPED. */
      instance->set_state(Instance::STOPPED);
    }

    /* Request mysqld to stop. */

443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470
    bool instance_stopped= FALSE;

    for (int cur_attempt= 0; cur_attempt < NUM_STOP_ATTEMPTS; ++cur_attempt)
    {
      if (!instance->kill_mysqld(SIGTERM))
      {
        instance_stopped= TRUE;
        break;
      }

      if (!instance->is_active())
      {
        instance_stopped= TRUE;
        break;
      }

      /* Sleep for 0.3 sec and check again. */

      my_sleep(300000);
    }

    /*
      Abort if we failed to stop mysqld instance. That should not happen,
      but if it happened, we don't know what to do and prefer to have clear
      failure with coredump.
    */

    DBUG_ASSERT(instance_stopped);
unknown's avatar
unknown committed
471 472

    instance->unlock();
473
  }
unknown's avatar
unknown committed
474 475

  instance_map->unlock();
476
}
477 478


unknown's avatar
unknown committed
479 480 481 482
/**
  Lock Guardian.
*/

483
void Guardian::lock()
484
{
485
  pthread_mutex_lock(&LOCK_guardian);
486 487 488
}


unknown's avatar
unknown committed
489 490 491 492
/**
  Unlock Guardian.
*/

493
void Guardian::unlock()
494
{
495
  pthread_mutex_unlock(&LOCK_guardian);
496
}