/* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
   Copyright (c) 2009, 2015, MariaDB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA */
unknown's avatar
unknown committed
16 17


unknown's avatar
unknown committed
18 19 20 21 22 23 24 25 26
/**
  @file

  @brief
  logging of commands

  @todo
    Abort logging when we get an error in reading or writing log files
*/
unknown's avatar
unknown committed
27

28
#include <my_global.h>		/* NO_EMBEDDED_ACCESS_CHECKS */
29 30
#include "sql_priv.h"
#include "log.h"
31
#include "sql_base.h"                           // open_log_table
unknown's avatar
unknown committed
32
#include "sql_repl.h"
33 34 35 36 37 38
#include "sql_delete.h"                         // mysql_truncate
#include "sql_parse.h"                          // command_name
#include "sql_time.h"           // calc_time_from_sec, my_time_compare
#include "tztime.h"             // my_tz_OFFSET0, struct Time_zone
#include "sql_acl.h"            // SUPER_ACL
#include "log_event.h"          // Query_log_event
unknown's avatar
unknown committed
39
#include "rpl_filter.h"
40
#include "rpl_rli.h"
41
#include "sql_audit.h"
42
#include "log_slow.h"
43
#include "mysqld.h"
unknown's avatar
unknown committed
44 45 46 47 48

#include <my_dir.h>
#include <stdarg.h>
#include <m_ctype.h>				// For test_if_number

Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
49
#ifdef _WIN32
unknown's avatar
unknown committed
50 51 52
#include "message.h"
#endif

53
#include "sql_plugin.h"
He Zhenxing's avatar
He Zhenxing committed
54
#include "rpl_handler.h"
55
#include "debug_sync.h"
unknown's avatar
unknown committed
56
#include "sql_show.h"
57
#include "my_pthread.h"
58
#include "wsrep_mysqld.h"
unknown's avatar
unknown committed
59

60 61 62
/* Maximum size of a single log message. */
#define MAX_LOG_BUFFER_SIZE 1024
#define MAX_TIME_SIZE 32

/*
  All-ones my_off_t value. In this file it marks "no position recorded"
  (see binlog_cache_data::before_stmt_pos).
*/
#define MY_OFF_T_UNDEF (~(my_off_t)0UL)

/*
  Expands to the stringified flag name F followed by a space when any bit
  of F is set in V, and to the empty string otherwise.
*/
#define FLAGSTR(V,F) (((V) & (F)) ? #F " " : "")
66

67
handlerton *binlog_hton;
68 69
LOGGER logger;

70 71 72
const char *log_bin_index= 0;
const char *log_bin_basename= 0;

73
MYSQL_BIN_LOG mysql_bin_log(&sync_binlog_period);
unknown's avatar
unknown committed
74 75

static bool test_if_number(const char *str,
76
			   ulong *res, bool allow_wildcards);
77 78 79 80
static int binlog_init(void *p);
static int binlog_close_connection(handlerton *hton, THD *thd);
static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv);
static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv);
81 82
static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
                                                      THD *thd);
83 84 85
static int binlog_commit(handlerton *hton, THD *thd, bool all);
static int binlog_rollback(handlerton *hton, THD *thd, bool all);
static int binlog_prepare(handlerton *hton, THD *thd, bool all);
86
static int binlog_start_consistent_snapshot(handlerton *hton, THD *thd);
87

unknown's avatar
unknown committed
88 89 90
static LEX_STRING const write_error_msg=
    { C_STRING_WITH_LEN("error writing to the binary log") };

91
static my_bool opt_optimize_thread_scheduling= TRUE;
92
ulong binlog_checksum_options;
unknown's avatar
unknown committed
93
#ifndef DBUG_OFF
94
ulong opt_binlog_dbug_fsync_sleep= 0;
unknown's avatar
unknown committed
95 96
#endif

Sergei Golubchik's avatar
Sergei Golubchik committed
97
mysql_mutex_t LOCK_prepare_ordered;
98
mysql_cond_t COND_prepare_ordered;
99
mysql_mutex_t LOCK_after_binlog_sync;
Sergei Golubchik's avatar
Sergei Golubchik committed
100
mysql_mutex_t LOCK_commit_ordered;
unknown's avatar
unknown committed
101 102 103

static ulonglong binlog_status_var_num_commits;
static ulonglong binlog_status_var_num_group_commits;
104 105 106
static ulonglong binlog_status_group_commit_trigger_count;
static ulonglong binlog_status_group_commit_trigger_lock_wait;
static ulonglong binlog_status_group_commit_trigger_timeout;
unknown's avatar
unknown committed
107 108
static char binlog_snapshot_file[FN_REFLEN];
static ulonglong binlog_snapshot_position;
unknown's avatar
unknown committed
109 110 111 112 113 114 115

/*
  Table mapping status variable names to the file-local values that back
  them. The list is terminated by an entry whose name is NullS.
*/
static SHOW_VAR binlog_status_vars_detail[]=
{
  {"commits", (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
  {"group_commits", (char *)&binlog_status_var_num_group_commits,
   SHOW_LONGLONG},
  {"group_commit_trigger_count",
   (char *)&binlog_status_group_commit_trigger_count, SHOW_LONGLONG},
  {"group_commit_trigger_lock_wait",
   (char *)&binlog_status_group_commit_trigger_lock_wait, SHOW_LONGLONG},
  {"group_commit_trigger_timeout",
   (char *)&binlog_status_group_commit_trigger_timeout, SHOW_LONGLONG},
  {"snapshot_file", (char *)&binlog_snapshot_file, SHOW_CHAR},
  {"snapshot_position", (char *)&binlog_snapshot_position, SHOW_LONGLONG},
  {NullS, NullS, SHOW_LONG}
};

129 130 131 132 133 134 135 136 137 138 139
/*
  Variables for the binlog background thread.
  Protected by the MYSQL_BIN_LOG::LOCK_binlog_background_thread mutex.
 */
static bool binlog_background_thread_started= false;
static bool binlog_background_thread_stop= false;
static MYSQL_BIN_LOG::xid_count_per_binlog *
    binlog_background_thread_queue= NULL;

static bool start_binlog_background_thread();

unknown's avatar
unknown committed
140
static rpl_binlog_state rpl_global_gtid_binlog_state;
141

142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170
/**
   Convert an internal log purging status into a user level error code.

   Used for both the master and the slave side of log purging.
   Called from @c purge_error_message(), @c MYSQL_BIN_LOG::reset_logs()

   @param  res  error code internal to the purging routines (LOG_INFO_*);
                0 means no error

   @return the user level error code ER_*, 0 when @a res is 0, or
           ER_LOG_PURGE_UNKNOWN_ERR when @a res is not a known code
*/
uint purge_log_get_error_code(int res)
{
  switch (res) {
  case 0:
    return 0;
  case LOG_INFO_EOF:
    return ER_UNKNOWN_TARGET_BINLOG;
  case LOG_INFO_IO:
    return ER_IO_ERR_LOG_INDEX_READ;
  case LOG_INFO_INVALID:
    return ER_BINLOG_PURGE_PROHIBITED;
  case LOG_INFO_SEEK:
    return ER_FSEEK_FAIL;
  case LOG_INFO_MEM:
    return ER_OUT_OF_RESOURCES;
  case LOG_INFO_FATAL:
    return ER_BINLOG_PURGE_FATAL_ERR;
  case LOG_INFO_IN_USE:
    return ER_LOG_IN_USE;
  case LOG_INFO_EMFILE:
    return ER_BINLOG_PURGE_EMFILE;
  default:
    return ER_LOG_PURGE_UNKNOWN_ERR;
  }
}

171 172 173 174 175 176 177 178 179
/**
  Silence all errors and warnings reported when performing a write
  to a log table.
  Errors and warnings are not reported to the client or SQL exception
  handlers, so that the presence of logging does not interfere and affect
  the logic of an application.
*/
class Silence_log_table_errors : public Internal_error_handler
{
180
  char m_message[MYSQL_ERRMSG_SIZE];
181 182
public:
  Silence_log_table_errors()
183 184 185
  {
    m_message[0]= '\0';
  }
186 187 188

  virtual ~Silence_log_table_errors() {}

Marc Alff's avatar
Marc Alff committed
189 190 191
  virtual bool handle_condition(THD *thd,
                                uint sql_errno,
                                const char* sql_state,
192
                                Sql_condition::enum_warning_level level,
Marc Alff's avatar
Marc Alff committed
193
                                const char* msg,
194
                                Sql_condition ** cond_hdl);
195
  const char *message() const { return m_message; }
196 197 198
};

bool
Marc Alff's avatar
Marc Alff committed
199 200 201
Silence_log_table_errors::handle_condition(THD *,
                                           uint,
                                           const char*,
202
                                           Sql_condition::enum_warning_level,
Marc Alff's avatar
Marc Alff committed
203
                                           const char* msg,
204
                                           Sql_condition ** cond_hdl)
205
{
Marc Alff's avatar
Marc Alff committed
206
  *cond_hdl= NULL;
207
  strmake_buf(m_message, msg);
208 209 210
  return TRUE;
}

211 212 213 214 215 216 217
/*
  Message printing functions in the order information, warning, error.
  NOTE(review): the meaning of the index is defined by the callers, which
  are not visible here -- confirm before reordering.
*/
sql_print_message_func sql_print_message_handlers[3] =
{
  sql_print_information, sql_print_warning, sql_print_error
};

218

Sergei Golubchik's avatar
Sergei Golubchik committed
219 220 221 222 223 224
/**
  Create the name of the log file
  
  @param[OUT] out    a pointer to a new allocated name will go there
  @param[IN] log_ext The extension for the file (e.g .log)
  @param[IN] once    whether to use malloc_once or a normal malloc.
225
*/
Sergei Golubchik's avatar
Sergei Golubchik committed
226
void make_default_log_name(char **out, const char* log_ext, bool once)
227
{
Sergei Golubchik's avatar
Sergei Golubchik committed
228 229 230 231 232
  char buff[FN_REFLEN+10];
  fn_format(buff, opt_log_basename, "", log_ext, MYF(MY_REPLACE_EXT));
  if (once)
    *out= my_once_strdup(buff, MYF(MY_WME));
  else
233
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
234 235
    my_free(*out);
    *out= my_strdup(buff, MYF(MY_WME));
236
  }
237
}
238 239


240
/*
241 242
  Helper classes to store non-transactional and transactional data
  before copying it to the binary log.
243
*/
244 245
class binlog_cache_data
{
246
public:
247
  binlog_cache_data(): m_pending(0), before_stmt_pos(MY_OFF_T_UNDEF),
248 249 250 251 252
  incident(FALSE), changes_to_non_trans_temp_table_flag(FALSE),
  saved_max_binlog_cache_size(0), ptr_binlog_cache_use(0),
  ptr_binlog_cache_disk_use(0)
  { }
  
253
  ~binlog_cache_data()
254
  {
255 256
    DBUG_ASSERT(empty());
    close_cached_file(&cache_log);
257 258
  }

259 260 261
  bool empty() const
  {
    return pending() == NULL && my_b_tell(&cache_log) == 0;
262 263
  }

264
  Rows_log_event *pending() const
265
  {
266
    return m_pending;
267
  }
268

269
  void set_pending(Rows_log_event *const pending_arg)
270
  {
271
    m_pending= pending_arg;
272
  }
273

274 275 276 277 278 279 280 281
  void set_incident(void)
  {
    incident= TRUE;
  }
  
  bool has_incident(void)
  {
    return(incident);
282 283
  }

284 285 286 287 288 289 290 291 292 293
  void set_changes_to_non_trans_temp_table()
  {
    changes_to_non_trans_temp_table_flag= TRUE;    
  }

  bool changes_to_non_trans_temp_table()
  {
    return (changes_to_non_trans_temp_table_flag);    
  }

294 295
  void reset()
  {
296
    compute_statistics();
297
    truncate(0);
298
    changes_to_non_trans_temp_table_flag= FALSE;
299
    incident= FALSE;
300
    before_stmt_pos= MY_OFF_T_UNDEF;
301
    /*
302 303 304 305 306 307
      The truncate function calls reinit_io_cache that calls
      my_b_flush_io_cache which may increase disk_writes. This breaks
      the disk_writes use by the binary log which aims to compute the
      ratio between in-memory cache usage and disk cache usage. To
      avoid this undesirable behavior, we reset the variable after
      truncating the cache.
308 309
    */
    cache_log.disk_writes= 0;
310
    DBUG_ASSERT(empty());
311 312
  }

313
  my_off_t get_byte_position() const
314
  {
315
    return my_b_tell(&cache_log);
316 317
  }

318
  my_off_t get_prev_position()
319
  {
320
     return(before_stmt_pos);
321 322
  }

323
  void set_prev_position(my_off_t pos)
324
  {
325
     before_stmt_pos= pos;
326 327
  }
  
328
  void restore_prev_position()
329
  {
330
    truncate(before_stmt_pos);
331 332
  }

333 334 335 336 337
  void restore_savepoint(my_off_t pos)
  {
    truncate(pos);
    if (pos < before_stmt_pos)
      before_stmt_pos= MY_OFF_T_UNDEF;
338 339
  }

340
  void set_binlog_cache_info(my_off_t param_max_binlog_cache_size,
341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369
                             ulong *param_ptr_binlog_cache_use,
                             ulong *param_ptr_binlog_cache_disk_use)
  {
    /*
      The assertions guarantee that the set_binlog_cache_info is
      called just once and information passed as parameters are
      never zero.

      This is done while calling the constructor binlog_cache_mngr.
      We cannot set informaton in the constructor binlog_cache_data
      because the space for binlog_cache_mngr is allocated through
      a placement new.

      In the future, we can refactor this and change it to avoid
      the set_binlog_info. 
    */
    DBUG_ASSERT(saved_max_binlog_cache_size == 0 &&
                param_max_binlog_cache_size != 0 &&
                ptr_binlog_cache_use == 0 &&
                param_ptr_binlog_cache_use != 0 &&
                ptr_binlog_cache_disk_use == 0 &&
                param_ptr_binlog_cache_disk_use != 0);

    saved_max_binlog_cache_size= param_max_binlog_cache_size;
    ptr_binlog_cache_use= param_ptr_binlog_cache_use;
    ptr_binlog_cache_disk_use= param_ptr_binlog_cache_disk_use;
    cache_log.end_of_file= saved_max_binlog_cache_size;
  }

370 371
  /*
    Cache to store data before copying it to the binary log.
372
  */
373
  IO_CACHE cache_log;
374

375 376
private:
  /*
377 378
    Pending binrows event. This event is the event where the rows are currently
    written.
379 380 381 382 383 384 385
   */
  Rows_log_event *m_pending;

  /*
    Binlog position before the start of the current statement.
  */
  my_off_t before_stmt_pos;
386 387 388 389 390 391 392
 
  /*
    This indicates that some events did not get into the cache and most likely
    it is corrupted.
  */ 
  bool incident;

393 394 395 396 397 398
  /*
    This flag indicates if the cache has changes to temporary tables.
    @TODO This a temporary fix and should be removed after BUG#54562.
  */
  bool changes_to_non_trans_temp_table_flag;

399 400
  /**
    This function computes binlog cache and disk usage.
401
  */
402 403 404 405
  void compute_statistics()
  {
    if (!empty())
    {
406
      statistic_increment(*ptr_binlog_cache_use, &LOCK_status);
407
      if (cache_log.disk_writes != 0)
408
        statistic_increment(*ptr_binlog_cache_disk_use, &LOCK_status);
409 410
    }
  }
411

412 413 414 415 416
  /*
    Stores the values of maximum size of the cache allowed when this cache
    is configured. This corresponds to either
      . max_binlog_cache_size or max_binlog_stmt_cache_size.
  */
417
  my_off_t saved_max_binlog_cache_size;
418 419 420 421 422 423 424 425 426 427 428 429 430 431 432

  /*
    Stores a pointer to the status variable that keeps track of the in-memory 
    cache usage. This corresponds to either
      . binlog_cache_use or binlog_stmt_cache_use.
  */
  ulong *ptr_binlog_cache_use;

  /*
    Stores a pointer to the status variable that keeps track of the disk
    cache usage. This corresponds to either
      . binlog_cache_disk_use or binlog_stmt_cache_disk_use.
  */
  ulong *ptr_binlog_cache_disk_use;

433 434 435 436 437 438 439 440 441 442 443 444 445
  /*
    It truncates the cache to a certain position. This includes deleting the
    pending event.
   */
  void truncate(my_off_t pos)
  {
    DBUG_PRINT("info", ("truncating to position %lu", (ulong) pos));
    if (pending())
    {
      delete pending();
      set_pending(0);
    }
    reinit_io_cache(&cache_log, WRITE_CACHE, pos, 0, 0);
446
    cache_log.end_of_file= saved_max_binlog_cache_size;
447 448 449 450 451 452 453 454
  }
 
  binlog_cache_data& operator=(const binlog_cache_data& info);
  binlog_cache_data(const binlog_cache_data& info);
};

class binlog_cache_mngr {
public:
455 456
  binlog_cache_mngr(my_off_t param_max_binlog_stmt_cache_size,
                    my_off_t param_max_binlog_cache_size,
457 458 459 460
                    ulong *param_ptr_binlog_stmt_cache_use,
                    ulong *param_ptr_binlog_stmt_cache_disk_use,
                    ulong *param_ptr_binlog_cache_use,
                    ulong *param_ptr_binlog_cache_disk_use)
Sergei Golubchik's avatar
Sergei Golubchik committed
461
    : last_commit_pos_offset(0), using_xa(FALSE), xa_xid(0)
462 463 464 465 466 467 468
  {
     stmt_cache.set_binlog_cache_info(param_max_binlog_stmt_cache_size,
                                      param_ptr_binlog_stmt_cache_use,
                                      param_ptr_binlog_stmt_cache_disk_use);
     trx_cache.set_binlog_cache_info(param_max_binlog_cache_size,
                                     param_ptr_binlog_cache_use,
                                     param_ptr_binlog_cache_disk_use);
Sergei Golubchik's avatar
Sergei Golubchik committed
469
     last_commit_pos_file[0]= 0;
470
  }
471

Sergei Golubchik's avatar
Sergei Golubchik committed
472
  void reset(bool do_stmt, bool do_trx)
473
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
474 475 476 477 478 479 480 481 482
    if (do_stmt)
      stmt_cache.reset();
    if (do_trx)
    {
      trx_cache.reset();
      using_xa= FALSE;
      last_commit_pos_file[0]= 0;
      last_commit_pos_offset= 0;
    }
483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
  }

  binlog_cache_data* get_binlog_cache_data(bool is_transactional)
  {
    return (is_transactional ? &trx_cache : &stmt_cache);
  }

  IO_CACHE* get_binlog_cache_log(bool is_transactional)
  {
    return (is_transactional ? &trx_cache.cache_log : &stmt_cache.cache_log);
  }

  binlog_cache_data stmt_cache;

  binlog_cache_data trx_cache;

499
  /*
500 501 502 503 504
    Binlog position for current transaction.
    For START TRANSACTION WITH CONSISTENT SNAPSHOT, this is the binlog
    position corresponding to the snapshot taken. During (and after) commit,
    this is set to the binlog position corresponding to just after the
    commit (so storage engines can store it in their transaction log).
505
  */
506 507
  char last_commit_pos_file[FN_REFLEN];
  my_off_t last_commit_pos_offset;
508

509 510 511 512 513
  /*
    Flag set true if this transaction is committed with log_xid() as part of
    XA, false if not.
  */
  bool using_xa;
unknown's avatar
unknown committed
514
  my_xid xa_xid;
515 516 517 518 519 520 521 522
  bool need_unlog;
  /*
    Id of binlog that transaction was written to; only needed if need_unlog is
    true.
  */
  ulong binlog_id;
  /* Set if we get an error during commit that must be returned from unlog(). */
  bool delayed_error;
Sergei Golubchik's avatar
Sergei Golubchik committed
523

524 525 526 527
private:

  binlog_cache_mngr& operator=(const binlog_cache_mngr& info);
  binlog_cache_mngr(const binlog_cache_mngr& info);
528 529
};

530 531 532 533
/*
  Tell whether logging to the given log table is currently enabled.

  A log table counts as enabled when a table log handler exists and the
  corresponding log (slow or general) is switched on. Any other log type
  is a programming error (asserted in debug builds) and yields FALSE.
*/
bool LOGGER::is_log_table_enabled(uint log_table_type)
{
  bool log_switched_on;

  switch (log_table_type) {
  case QUERY_LOG_SLOW:
    log_switched_on= global_system_variables.sql_log_slow;
    break;
  case QUERY_LOG_GENERAL:
    log_switched_on= opt_log;
    break;
  default:
    DBUG_ASSERT(0);
    return FALSE;                             /* make compiler happy */
  }

  return table_log_handler != NULL && log_switched_on;
}

543 544 545 546 547 548 549 550 551 552 553 554 555 556
/**
   Check if a given table is opened log table

   @param table             Table to check
   @param check_if_opened   Only fail if it's a log table in use
   @param error_msg	    String to put in error message if not ok.
                            No error message if 0
   @return 0 ok
   @return # Type of log file
 */

int check_if_log_table(const TABLE_LIST *table,
                       bool check_if_opened,
                       const char *error_msg)
557
{
558 559
  int result= 0;
  if (table->db_length == 5 &&
560
      !my_strcasecmp(table_alias_charset, table->db, "mysql"))
561
  {
562 563 564
    const char *table_name= table->table_name;

    if (table->table_name_length == 11 &&
565
        !my_strcasecmp(table_alias_charset, table_name, "general_log"))
566
    {
567 568
      result= QUERY_LOG_GENERAL;
      goto end;
569 570
    }

571 572
    if (table->table_name_length == 8 &&
        !my_strcasecmp(table_alias_charset, table_name, "slow_log"))
573
    {
574 575
      result= QUERY_LOG_SLOW;
      goto end;
576
    }
577 578
  }
  return 0;
579 580 581 582 583 584 585 586 587

end:
  if (!check_if_opened || logger.is_log_table_enabled(result))
  {
    if (error_msg)
      my_error(ER_BAD_LOG_STATEMENT, MYF(0), error_msg);
    return result;
  }
  return 0;
588 589
}

590

591 592 593 594 595 596 597 598 599
/* Nothing to set up at construction time. */
Log_to_csv_event_handler::Log_to_csv_event_handler() {}


/* Nothing to release at destruction time. */
Log_to_csv_event_handler::~Log_to_csv_event_handler() {}

600

601 602 603 604 605 606 607
/* Mark the log tables as no longer initialized in the global logger. */
void Log_to_csv_event_handler::cleanup()
{
  logger.is_log_tables_initialized= FALSE;
}

/* log event handlers */

608
/**
609 610
  Log command to the general log table

611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635
  Log given command to the general log table.

  @param  event_time        command start timestamp
  @param  user_host         the pointer to the string with user@host info
  @param  user_host_len     length of the user_host string. this is computed
                            once and passed to all general log event handlers
  @param  thread_id         Id of the thread, issued a query
  @param  command_type      the type of the command being logged
  @param  command_type_len  the length of the string above
  @param  sql_text          the very text of the query being executed
  @param  sql_text_len      the length of sql_text string


  @return This function attempts to never call my_error(). This is
  necessary, because general logging happens already after a statement
  status has been sent to the client, so the client can not see the
  error anyway. Besides, the error is not related to the statement
  being executed and is internal, and thus should be handled
  internally (@todo: how?).
  If a write to the table has failed, the function attempts to
  write to a short error message to the file. The failure is also
  indicated in the return value. 

  @retval  FALSE   OK
  @retval  TRUE    error occured
636 637 638
*/

bool Log_to_csv_event_handler::
639
  log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
640
              uint user_host_len, int thread_id_arg,
641
              const char *command_type, uint command_type_len,
642 643
              const char *sql_text, uint sql_text_len,
              CHARSET_INFO *client_cs)
644
{
645 646 647 648 649 650
  TABLE_LIST table_list;
  TABLE *table;
  bool result= TRUE;
  bool need_close= FALSE;
  bool need_pop= FALSE;
  bool need_rnd_end= FALSE;
unknown's avatar
unknown committed
651
  uint field_index;
652
  Silence_log_table_errors error_handler;
653
  Open_tables_backup open_tables_backup;
654 655
  ulonglong save_thd_options;
  bool save_time_zone_used;
656
  DBUG_ENTER("log_general");
657

658 659 660 661 662
  /*
    CSV uses TIME_to_timestamp() internally if table needs to be repaired
    which will set thd->time_zone_used
  */
  save_time_zone_used= thd->time_zone_used;
663

664 665
  save_thd_options= thd->variables.option_bits;
  thd->variables.option_bits&= ~OPTION_BIN_LOG;
666

Konstantin Osipov's avatar
Konstantin Osipov committed
667 668 669 670
  table_list.init_one_table(MYSQL_SCHEMA_NAME.str, MYSQL_SCHEMA_NAME.length,
                            GENERAL_LOG_NAME.str, GENERAL_LOG_NAME.length,
                            GENERAL_LOG_NAME.str,
                            TL_WRITE_CONCURRENT_INSERT);
671

672
  /*
Marc Alff's avatar
Marc Alff committed
673
    1) open_log_table generates an error of the
674 675 676 677 678 679 680 681 682 683 684 685
    table can not be opened or is corrupted.
    2) "INSERT INTO general_log" can generate warning sometimes.

    Suppress these warnings and errors, they can't be dealt with
    properly anyway.

    QQ: this problem needs to be studied in more detail.
    Comment this 2 lines and run "cast.test" to see what's happening.
  */
  thd->push_internal_handler(& error_handler);
  need_pop= TRUE;

Marc Alff's avatar
Marc Alff committed
686
  if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
687 688
    goto err;

689 690
  need_close= TRUE;

691
  if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
692
      table->file->ha_rnd_init_with_error(0))
693 694 695 696 697 698
    goto err;

  need_rnd_end= TRUE;

  /* Honor next number columns if present */
  table->next_number_field= table->found_next_number_field;
699

700 701 702 703 704 705 706 707 708
  /*
    NOTE: we do not call restore_record() here, as all fields are
    filled by the Logger (=> no need to load default ones).
  */

  /*
    We do not set a value for table->field[0], as it will use
    default value (which is CURRENT_TIMESTAMP).
  */
709

710
  /* check that all columns exist */
711
  if (table->s->fields < 6)
712 713
    goto err;

714 715
  DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);

716 717
  ((Field_timestamp*) table->field[0])->store_TIME(
                  hrtime_to_my_time(event_time), hrtime_sec_part(event_time));
718

719 720
  /* do a write */
  if (table->field[1]->store(user_host, user_host_len, client_cs) ||
721
      table->field[2]->store((longlong) thread_id_arg, TRUE) ||
722 723
      table->field[3]->store((longlong) global_system_variables.server_id,
                             TRUE) ||
724
      table->field[4]->store(command_type, command_type_len, client_cs))
725 726
    goto err;

727 728 729 730
  /*
    A positive return value in store() means truncation.
    Still logging a message in the log in this case.
  */
731
  table->field[5]->flags|= FIELDFLAG_HEX_ESCAPE;
732 733
  if (table->field[5]->store(sql_text, sql_text_len, client_cs) < 0)
    goto err;
734 735

  /* mark all fields as not null */
736 737 738 739 740
  table->field[1]->set_notnull();
  table->field[2]->set_notnull();
  table->field[3]->set_notnull();
  table->field[4]->set_notnull();
  table->field[5]->set_notnull();
741

742 743 744 745 746 747
  /* Set any extra columns to their default values */
  for (field_index= 6 ; field_index < table->s->fields ; field_index++)
  {
    table->field[field_index]->set_default();
  }

748
  /* log table entries are not replicated */
749
  if (table->file->ha_write_row(table->record[0]))
750
    goto err;
751

752
  result= FALSE;
753

754
err:
755
  if (result && !thd->killed)
756 757
    sql_print_error("Failed to write to mysql.general_log: %s",
                    error_handler.message());
758

759 760 761 762 763 764 765 766
  if (need_rnd_end)
  {
    table->file->ha_rnd_end();
    table->file->ha_release_auto_increment();
  }
  if (need_pop)
    thd->pop_internal_handler();
  if (need_close)
Marc Alff's avatar
Marc Alff committed
767
    close_log_table(thd, &open_tables_backup);
768

769
  thd->variables.option_bits= save_thd_options;
770
  thd->time_zone_used= save_time_zone_used;
771
  DBUG_RETURN(result);
772 773 774 775 776 777 778
}


/*
  Log a query to the slow log table

  SYNOPSIS
779
    log_slow()
780 781 782 783 784
    thd               THD of the query
    current_time      current timestamp
    user_host         the pointer to the string with user@host info
    user_host_len     length of the user_host string. this is computed once
                      and passed to all general log event handlers
785 786
    query_time        Amount of time the query took to execute (in microseconds)
    lock_time         Amount of time the query was locked (in microseconds)
787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803
    is_command        The flag, which determines, whether the sql_text is a
                      query or an administrator command (these are treated
                      differently by the old logging routines)
    sql_text          the very text of the query or administrator command
                      processed
    sql_text_len      the length of sql_text string

  DESCRIPTION

   Log a query to the slow log table

  RETURN
    FALSE - OK
    TRUE - error occurred
*/

bool Log_to_csv_event_handler::
804
  log_slow(THD *thd, my_hrtime_t current_time,
805
           const char *user_host, uint user_host_len,
806
           ulonglong query_utime, ulonglong lock_utime, bool is_command,
807 808
           const char *sql_text, uint sql_text_len)
{
809 810 811 812 813
  TABLE_LIST table_list;
  TABLE *table;
  bool result= TRUE;
  bool need_close= FALSE;
  bool need_rnd_end= FALSE;
814
  Silence_log_table_errors error_handler;
815
  Open_tables_backup open_tables_backup;
816
  CHARSET_INFO *client_cs= thd->variables.character_set_client;
817
  bool save_time_zone_used;
818 819
  long query_time= (long) MY_MIN(query_utime/1000000, TIME_MAX_VALUE_SECONDS);
  long lock_time=  (long) MY_MIN(lock_utime/1000000, TIME_MAX_VALUE_SECONDS);
820 821 822
  long query_time_micro= (long) (query_utime % 1000000);
  long lock_time_micro=  (long) (lock_utime % 1000000);

823
  DBUG_ENTER("Log_to_csv_event_handler::log_slow");
824

825
  thd->push_internal_handler(& error_handler);
826 827 828 829 830 831
  /*
    CSV uses TIME_to_timestamp() internally if table needs to be repaired
    which will set thd->time_zone_used
  */
  save_time_zone_used= thd->time_zone_used;

Konstantin Osipov's avatar
Konstantin Osipov committed
832 833 834 835
  table_list.init_one_table(MYSQL_SCHEMA_NAME.str, MYSQL_SCHEMA_NAME.length,
                            SLOW_LOG_NAME.str, SLOW_LOG_NAME.length,
                            SLOW_LOG_NAME.str,
                            TL_WRITE_CONCURRENT_INSERT);
836

Marc Alff's avatar
Marc Alff committed
837
  if (!(table= open_log_table(thd, &table_list, &open_tables_backup)))
838
    goto err;
839 840 841

  need_close= TRUE;

842
  if (table->file->extra(HA_EXTRA_MARK_AS_LOG_TABLE) ||
843
      table->file->ha_rnd_init_with_error(0))
844 845 846 847 848 849
    goto err;

  need_rnd_end= TRUE;

  /* Honor next number columns if present */
  table->next_number_field= table->found_next_number_field;
850

851
  restore_record(table, s->default_values);    // Get empty record
852

853
  /* check that all columns exist */
854
  if (table->s->fields < 13)
855 856
    goto err;

857 858
  /* store the time and user values */
  DBUG_ASSERT(table->field[0]->type() == MYSQL_TYPE_TIMESTAMP);
859 860
  ((Field_timestamp*) table->field[0])->store_TIME(
             hrtime_to_my_time(current_time), hrtime_sec_part(current_time));
861 862
  if (table->field[1]->store(user_host, user_host_len, client_cs))
    goto err;
863

864 865 866 867 868 869 870 871 872 873
  /*
    A TIME field can not hold the full longlong range; query_time or
    lock_time may be truncated without warning here, if greater than
    839 hours (~35 days)
  */
  MYSQL_TIME t;
  t.neg= 0;

  /* fill in query_time field */
  calc_time_from_sec(&t, query_time, query_time_micro);
874
  if (table->field[2]->store_time(&t))
875 876 877
    goto err;
  /* lock_time */
  calc_time_from_sec(&t, lock_time, lock_time_micro);
878
  if (table->field[3]->store_time(&t))
879 880
    goto err;
  /* rows_sent */
Sergei Golubchik's avatar
Sergei Golubchik committed
881
  if (table->field[4]->store((longlong) thd->get_sent_row_count(), TRUE))
882 883
    goto err;
  /* rows_examined */
Sergei Golubchik's avatar
Sergei Golubchik committed
884
  if (table->field[5]->store((longlong) thd->get_examined_row_count(), TRUE))
885
    goto err;
886

887
  /* fill database field */
888
  if (thd->db)
889
  {
890 891
    if (table->field[6]->store(thd->db, thd->db_length, client_cs))
      goto err;
892 893
    table->field[6]->set_notnull();
  }
894

895
  if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
896
  {
897 898 899 900 901
    if (table->
        field[7]->store((longlong)
                        thd->first_successful_insert_id_in_prev_stmt_for_binlog,
                        TRUE))
      goto err;
902 903
    table->field[7]->set_notnull();
  }
904

905 906 907 908 909 910 911
  /*
    Set value if we do an insert on autoincrement column. Note that for
    some engines (those for which get_auto_increment() does not leave a
    table lock until the statement ends), this is just the first value and
    the next ones used may not be contiguous to it.
  */
  if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
912
  {
913 914 915 916
    if (table->
        field[8]->store((longlong)
          thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(), TRUE))
      goto err;
917 918
    table->field[8]->set_notnull();
  }
919

920
  if (table->field[9]->store((longlong)global_system_variables.server_id, TRUE))
921
    goto err;
922
  table->field[9]->set_notnull();
923

924 925 926 927 928 929
  /*
    Column sql_text.
    A positive return value in store() means truncation.
    Still logging a message in the log in this case.
  */
  if (table->field[10]->store(sql_text, sql_text_len, client_cs) < 0)
930 931
    goto err;

932 933 934
  if (table->field[11]->store((longlong) thd->thread_id, TRUE))
    goto err;

935 936 937 938 939 940
  /* Rows_affected */
  if (table->field[12]->store(thd->get_stmt_da()->is_ok() ?
                              (longlong) thd->get_stmt_da()->affected_rows() :
                              0, TRUE))
    goto err;

941
  /* log table entries are not replicated */
942
  if (table->file->ha_write_row(table->record[0]))
943
    goto err;
944

945
  result= FALSE;
946

947
err:
948 949
  thd->pop_internal_handler();

950
  if (result && !thd->killed)
951 952
    sql_print_error("Failed to write to mysql.slow_log: %s",
                    error_handler.message());
953

954 955 956 957 958 959
  if (need_rnd_end)
  {
    table->file->ha_rnd_end();
    table->file->ha_release_auto_increment();
  }
  if (need_close)
Marc Alff's avatar
Marc Alff committed
960
    close_log_table(thd, &open_tables_backup);
961 962 963 964 965 966 967 968 969
  thd->time_zone_used= save_time_zone_used;
  DBUG_RETURN(result);
}

/*
  Check that the requested log table can be opened.

  The table (mysql.general_log for QUERY_LOG_GENERAL, mysql.slow_log for
  QUERY_LOG_SLOW) is opened and, if that worked, closed again right away.

  RETURN
    0 - the table could be opened
    1 - opening the table failed
*/
int Log_to_csv_event_handler::
  activate_log(THD *thd, uint log_table_type)
{
  TABLE_LIST table_list;
  Open_tables_backup open_tables_backup;
  const bool is_general= (log_table_type == QUERY_LOG_GENERAL);
  LEX_STRING *log_name= is_general ? &GENERAL_LOG_NAME : &SLOW_LOG_NAME;

  DBUG_ENTER("Log_to_csv_event_handler::activate_log");

  /* Anything that is not the general log must be the slow log */
  DBUG_ASSERT(is_general || log_table_type == QUERY_LOG_SLOW);

  table_list.init_one_table(MYSQL_SCHEMA_NAME.str, MYSQL_SCHEMA_NAME.length,
                            log_name->str, log_name->length, log_name->str,
                            TL_WRITE_CONCURRENT_INSERT);

  if (!open_log_table(thd, &table_list, &open_tables_backup))
    DBUG_RETURN(1);

  close_log_table(thd, &open_tables_backup);
  DBUG_RETURN(0);
}

/*
  Error logging to a table is not implemented: reaching this function
  trips a debug assertion, and FALSE is returned otherwise.
*/
bool Log_to_csv_event_handler::log_error(enum loglevel level,
                                         const char *format, va_list args)
{
  /* No log table is implemented */
  DBUG_ASSERT(0);
  return FALSE;
}

/* Forward an error message to vprint_msg_to_log() and return its result. */
bool Log_to_file_event_handler::log_error(enum loglevel level,
                                          const char *format, va_list args)
{
  bool res= vprint_msg_to_log(level, format, args);
  return res;
}

/* Initialize the pthread objects of the general and the slow log file. */
void Log_to_file_event_handler::init_pthread_objects()
{
  /* general query log first, slow query log second */
  mysql_log.init_pthread_objects();
  mysql_slow_log.init_pthread_objects();
}


unknown's avatar
unknown committed
1024
/** Wrapper around MYSQL_LOG::write() for slow log. */
1025 1026

bool Log_to_file_event_handler::
1027
  log_slow(THD *thd, my_hrtime_t current_time,
1028
           const char *user_host, uint user_host_len,
1029
           ulonglong query_utime, ulonglong lock_utime, bool is_command,
1030 1031
           const char *sql_text, uint sql_text_len)
{
1032 1033
  Silence_log_table_errors error_handler;
  thd->push_internal_handler(&error_handler);
1034
  bool retval= mysql_slow_log.write(thd, hrtime_to_my_time(current_time),
1035 1036 1037 1038 1039
                                    user_host, user_host_len,
                                    query_utime, lock_utime, is_command,
                                    sql_text, sql_text_len);
  thd->pop_internal_handler();
  return retval;
1040 1041 1042
}


unknown's avatar
unknown committed
1043
/**
1044 1045 1046 1047 1048
   Wrapper around MYSQL_LOG::write() for general log. We need it since we
   want all log event handlers to have the same signature.
*/

bool Log_to_file_event_handler::
1049
  log_general(THD *thd, my_hrtime_t event_time, const char *user_host,
1050
              uint user_host_len, int thread_id_arg,
1051
              const char *command_type, uint command_type_len,
1052 1053
              const char *sql_text, uint sql_text_len,
              CHARSET_INFO *client_cs)
1054
{
1055 1056
  Silence_log_table_errors error_handler;
  thd->push_internal_handler(&error_handler);
1057 1058
  bool retval= mysql_log.write(hrtime_to_time(event_time), user_host,
                               user_host_len,
1059
                               thread_id_arg, command_type, command_type_len,
1060 1061 1062
                               sql_text, sql_text_len);
  thd->pop_internal_handler();
  return retval;
1063 1064 1065 1066 1067 1068 1069
}


bool Log_to_file_event_handler::init()
{
  if (!is_initialized)
  {
Monty's avatar
Monty committed
1070
    if (global_system_variables.sql_log_slow)
1071
      mysql_slow_log.open_slow_log(opt_slow_logname);
1072 1073

    if (opt_log)
1074
      mysql_log.open_query_log(opt_logname);
1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091

    is_initialized= TRUE;
  }

  return FALSE;
}


/* Run cleanup() on the general log file and then on the slow log file. */
void Log_to_file_event_handler::cleanup()
{
  /* general query log first, slow query log second */
  mysql_log.cleanup();
  mysql_slow_log.cleanup();
}

void Log_to_file_event_handler::flush()
{
  /* reopen log files */
1092
  if (opt_log)
1093
    mysql_log.reopen_file();
Monty's avatar
Monty committed
1094
  if (global_system_variables.sql_log_slow)
1095
    mysql_slow_log.reopen_file();
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117
}

/*
  Log error with all enabled log event handlers

  SYNOPSIS
    error_log_print()

    level             The level of the error significance: NOTE,
                      WARNING or ERROR.
    format            format string for the error message
    args              list of arguments for the format string

  RETURN
    FALSE - OK
    TRUE - error occured
*/

bool LOGGER::error_log_print(enum loglevel level, const char *format,
                             va_list args)
{
  bool error= FALSE;
1118
  Log_event_handler **current_handler;
1119 1120

  /* currently we don't need locking here as there is no error_log table */
1121
  for (current_handler= error_log_handler_list ; *current_handler ;)
1122 1123 1124 1125 1126 1127
    error= (*current_handler++)->log_error(level, format, args) || error;

  return error;
}


unknown's avatar
unknown committed
1128
void LOGGER::cleanup_base()
1129 1130
{
  DBUG_ASSERT(inited == 1);
Marc Alff's avatar
Marc Alff committed
1131
  mysql_rwlock_destroy(&LOCK_logger);
1132
  if (table_log_handler)
unknown's avatar
unknown committed
1133
  {
1134
    table_log_handler->cleanup();
unknown's avatar
unknown committed
1135
    delete table_log_handler;
1136
    table_log_handler= NULL;
unknown's avatar
unknown committed
1137
  }
1138 1139 1140 1141 1142
  if (file_log_handler)
    file_log_handler->cleanup();
}


unknown's avatar
unknown committed
1143 1144 1145 1146
void LOGGER::cleanup_end()
{
  DBUG_ASSERT(inited == 1);
  if (file_log_handler)
1147
  {
unknown's avatar
unknown committed
1148
    delete file_log_handler;
1149 1150 1151
    file_log_handler=NULL;
  }
  inited= 0;
unknown's avatar
unknown committed
1152 1153 1154
}


unknown's avatar
unknown committed
1155
/**
1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172
  Perform basic log initialization: create file-based log handler and
  init error log.
*/
void LOGGER::init_base()
{
  DBUG_ASSERT(inited == 0);
  inited= 1;

  /*
    Here we create file log handler. We don't do it for the table log handler
    here as it cannot be created so early. The reason is THD initialization,
    which depends on the system variables (parsed later).
  */
  if (!file_log_handler)
    file_log_handler= new Log_to_file_event_handler;

  /* by default we use traditional error log */
1173
  init_error_log(LOG_FILE);
1174 1175

  file_log_handler->init_pthread_objects();
Marc Alff's avatar
Marc Alff committed
1176
  mysql_rwlock_init(key_rwlock_LOCK_logger, &LOCK_logger);
1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
}


void LOGGER::init_log_tables()
{
  if (!table_log_handler)
    table_log_handler= new Log_to_csv_event_handler;

  if (!is_log_tables_initialized &&
      !table_log_handler->init() && !file_log_handler->init())
    is_log_tables_initialized= TRUE;
}


1191 1192
/*
  Reopen the enabled log files while holding the logger lock exclusively.

  thd is not used.

  RETURN
    0 - always
*/
bool LOGGER::flush_logs(THD *thd)
{
  /*
    Nobody should be able to use logging routines while the logs are
    being reopened, hence the exclusive lock.
  */
  logger.lock_exclusive();

  file_log_handler->flush();

  logger.unlock();

  return 0;
}


1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221
/**
  Close and reopen the slow log (with locks).
  
  @returns FALSE.
*/
bool LOGGER::flush_slow_log()
{
  /*
    Now we lock logger, as nobody should be able to use logging routines while
    log tables are closed
  */
  logger.lock_exclusive();

  /* Reopen slow log file */
Monty's avatar
Monty committed
1222
  if (global_system_variables.sql_log_slow)
1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255
    file_log_handler->get_mysql_slow_log()->reopen_file();

  /* End of log flush */
  logger.unlock();

  return 0;
}


/**
  Close and reopen the general log (with locks).

  The file is only reopened when opt_log is set.

  @returns FALSE.
*/
bool LOGGER::flush_general_log()
{
  /*
    Nobody should be able to use logging routines while the log is
    being reopened, hence the exclusive lock.
  */
  logger.lock_exclusive();

  const bool general_enabled= opt_log;
  if (general_enabled)
    file_log_handler->get_mysql_log()->reopen_file();

  logger.unlock();

  return 0;
}


1256 1257 1258 1259 1260 1261
/*
  Log slow query with all enabled log event handlers

  SYNOPSIS
    slow_log_print()

1262 1263 1264 1265
    thd                 THD of the query being logged
    query               The query being logged
    query_length        The length of the query string
    current_utime       Current time in microseconds (from undefined start)
1266 1267

  RETURN
1268 1269
    FALSE   OK
    TRUE    error occured
1270 1271 1272
*/

bool LOGGER::slow_log_print(THD *thd, const char *query, uint query_length,
1273 1274
                            ulonglong current_utime)

1275 1276
{
  bool error= FALSE;
1277
  Log_event_handler **current_handler;
1278
  bool is_command= FALSE;
1279
  char user_host_buff[MAX_USER_HOST_SIZE + 1];
1280
  Security_context *sctx= thd->security_ctx;
unknown's avatar
unknown committed
1281
  uint user_host_len= 0;
1282
  ulonglong query_utime, lock_utime;
1283

1284
  DBUG_ASSERT(thd->enable_slow_log);
1285 1286 1287 1288 1289 1290
  /*
    Print the message to the buffer if we have slow log enabled
  */

  if (*slow_log_handler_list)
  {
1291
    /* do not log slow queries from replication threads */
Monty's avatar
Monty committed
1292
    if (!thd->variables.sql_log_slow)
1293 1294
      return 0;

1295
    lock_shared();
Monty's avatar
Monty committed
1296
    if (!global_system_variables.sql_log_slow)
1297 1298 1299 1300
    {
      unlock();
      return 0;
    }
1301 1302

    /* fill in user_host value: the format is "%s[%s] @ %s [%s]" */
1303 1304
    user_host_len= (strxnmov(user_host_buff, MAX_USER_HOST_SIZE,
                             sctx->priv_user ? sctx->priv_user : "", "[",
1305
                             sctx->user ? sctx->user : (thd->slave_thread ? "SQL_SLAVE" : ""), "] @ ",
1306 1307 1308 1309
                             sctx->host ? sctx->host : "", " [",
                             sctx->ip ? sctx->ip : "", "]", NullS) -
                    user_host_buff);

1310 1311 1312 1313 1314 1315
    DBUG_ASSERT(thd->start_utime);
    DBUG_ASSERT(thd->start_time);
    query_utime= (current_utime - thd->start_utime);
    lock_utime=  (thd->utime_after_lock - thd->start_utime);
    my_hrtime_t current_time= { hrtime_from_time(thd->start_time) +
                                thd->start_time_sec_part + query_utime };
1316 1317 1318 1319

    if (!query)
    {
      is_command= TRUE;
Sergei Golubchik's avatar
Sergei Golubchik committed
1320 1321
      query= command_name[thd->get_command()].str;
      query_length= command_name[thd->get_command()].length;
1322 1323
    }

1324
    for (current_handler= slow_log_handler_list; *current_handler ;)
1325
      error= (*current_handler++)->log_slow(thd, current_time,
1326
                                            user_host_buff, user_host_len,
1327
                                            query_utime, lock_utime, is_command,
1328 1329 1330 1331 1332 1333 1334
                                            query, query_length) || error;

    unlock();
  }
  return error;
}

1335 1336
/*
  Write a command to the general log using all enabled handlers.

  The audit interface is notified unconditionally. The handlers are only
  called when opt_log is set and log_command() accepts the command.

  RETURN
    FALSE - OK (also when nothing was logged)
    TRUE  - at least one handler reported an error
*/
bool LOGGER::general_log_write(THD *thd, enum enum_server_command command,
                               const char *query, uint query_length)
{
  bool any_failed= FALSE;
  Log_event_handler **handler= general_log_handler_list;
  char user_host_buff[MAX_USER_HOST_SIZE + 1];
  uint user_host_len= 0;
  my_hrtime_t current_time;

  DBUG_ASSERT(thd);

  user_host_len= make_user_name(thd, user_host_buff);
  current_time= my_hrtime();

  mysql_audit_general_log(thd, hrtime_to_time(current_time),
                          user_host_buff, user_host_len,
                          command_name[(uint) command].str,
                          command_name[(uint) command].length,
                          query, query_length);

  if (!opt_log || !log_command(thd, command))
    return any_failed;

  lock_shared();
  for (; *handler; handler++)
  {
    if ((*handler)->log_general(thd, current_time, user_host_buff,
                                user_host_len, thd->thread_id,
                                command_name[(uint) command].str,
                                command_name[(uint) command].length,
                                query, query_length,
                                thd->variables.character_set_client))
      any_failed= TRUE;
  }
  unlock();

  return any_failed;
}

1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388
/*
  Format a message and write it to the general log.

  The message is formatted into a local buffer of MAX_LOG_BUFFER_SIZE
  bytes; a NULL format yields an empty message.

  RETURN
    the value returned by general_log_write()
*/
bool LOGGER::general_log_print(THD *thd, enum enum_server_command command,
                               const char *format, va_list args)
{
  char message_buff[MAX_LOG_BUFFER_SIZE];
  uint message_buff_len= 0;

  if (!format)
    message_buff[0]= '\0';
  else
    message_buff_len= my_vsnprintf(message_buff, sizeof(message_buff),
                                   format, args);

  return general_log_write(thd, command, message_buff, message_buff_len);
}

1389
/*
  Set up the NULL-terminated list of error log handlers.

  LOG_NONE empties the list, LOG_FILE installs the file handler. The
  table variants are not supported: they trip a debug assertion and
  leave the list untouched. Any other value leaves the list untouched.
*/
void LOGGER::init_error_log(ulonglong error_log_printer)
{
  if (error_log_printer & LOG_NONE)
  {
    error_log_handler_list[0]= 0;
    return;
  }

  if (error_log_printer == LOG_FILE)
  {
    error_log_handler_list[0]= file_log_handler;
    error_log_handler_list[1]= 0;
  }
  else if (error_log_printer == LOG_TABLE ||
           error_log_printer == (LOG_TABLE|LOG_FILE))
  {
    /* these two are disabled for now */
    DBUG_ASSERT(0);
  }
}

1412
/*
  Set up the NULL-terminated list of slow log handlers.

  LOG_NONE empties the list; LOG_FILE, LOG_TABLE and LOG_TABLE|LOG_FILE
  install the file handler, the table handler, or the file handler
  followed by the table handler. Any other value leaves the list untouched.
*/
void LOGGER::init_slow_log(ulonglong slow_log_printer)
{
  if (slow_log_printer & LOG_NONE)
  {
    slow_log_handler_list[0]= 0;
    return;
  }

  if (slow_log_printer == LOG_FILE)
  {
    slow_log_handler_list[0]= file_log_handler;
    slow_log_handler_list[1]= 0;
  }
  else if (slow_log_printer == LOG_TABLE)
  {
    slow_log_handler_list[0]= table_log_handler;
    slow_log_handler_list[1]= 0;
  }
  else if (slow_log_printer == (LOG_TABLE|LOG_FILE))
  {
    slow_log_handler_list[0]= file_log_handler;
    slow_log_handler_list[1]= table_log_handler;
    slow_log_handler_list[2]= 0;
  }
}

1437
/*
  Set up the NULL-terminated list of general log handlers.

  LOG_NONE empties the list; LOG_FILE, LOG_TABLE and LOG_TABLE|LOG_FILE
  install the file handler, the table handler, or the file handler
  followed by the table handler. Any other value leaves the list untouched.
*/
void LOGGER::init_general_log(ulonglong general_log_printer)
{
  if (general_log_printer & LOG_NONE)
  {
    general_log_handler_list[0]= 0;
    return;
  }

  if (general_log_printer == LOG_FILE)
  {
    general_log_handler_list[0]= file_log_handler;
    general_log_handler_list[1]= 0;
  }
  else if (general_log_printer == LOG_TABLE)
  {
    general_log_handler_list[0]= table_log_handler;
    general_log_handler_list[1]= 0;
  }
  else if (general_log_printer == (LOG_TABLE|LOG_FILE))
  {
    general_log_handler_list[0]= file_log_handler;
    general_log_handler_list[1]= table_log_handler;
    general_log_handler_list[2]= 0;
  }
}


1463 1464
/*
  Enable the slow or the general log at runtime.

  Nothing is done when the log is already enabled. Otherwise the log file
  is opened and the log table is checked through activate_log(); if the
  table check fails the file is closed again and the log stays disabled.
  The whole operation runs under the exclusive logger lock.

  RETURN
    FALSE - OK (or the log was already enabled)
    TRUE  - the log table could not be activated
*/
bool LOGGER::activate_log_handler(THD* thd, uint log_type)
{
  MYSQL_QUERY_LOG *file_log;
  bool res= FALSE;

  lock_exclusive();
  switch (log_type) {
  case QUERY_LOG_SLOW:
    if (global_system_variables.sql_log_slow)
      break;                                    /* already enabled */

    file_log= file_log_handler->get_mysql_slow_log();
    file_log->open_slow_log(opt_slow_logname);

    if (table_log_handler->activate_log(thd, QUERY_LOG_SLOW))
    {
      /* Error printed by open table in activate_log() */
      res= TRUE;
      file_log->close(0);
      break;
    }

    init_slow_log(log_output_options);
    global_system_variables.sql_log_slow= TRUE;
    break;

  case QUERY_LOG_GENERAL:
    if (opt_log)
      break;                                    /* already enabled */

    file_log= file_log_handler->get_mysql_log();
    file_log->open_query_log(opt_logname);

    if (table_log_handler->activate_log(thd, QUERY_LOG_GENERAL))
    {
      /* Error printed by open table in activate_log() */
      res= TRUE;
      file_log->close(0);
      break;
    }

    init_general_log(log_output_options);
    opt_log= TRUE;
    break;

  default:
    DBUG_ASSERT(0);
  }
  unlock();

  return res;
}


/*
  Disable the slow or the general log at runtime.

  Nothing is done when the log is already disabled. Otherwise the log file
  is closed and the corresponding enable flag is cleared, both under the
  exclusive logger lock. thd is not used.
*/
void LOGGER::deactivate_log_handler(THD *thd, uint log_type)
{
  my_bool *enabled_flag= 0;
  MYSQL_LOG *UNINIT_VAR(log_file);

  if (log_type == QUERY_LOG_SLOW)
  {
    enabled_flag= &global_system_variables.sql_log_slow;
    log_file= file_log_handler->get_mysql_slow_log();
  }
  else if (log_type == QUERY_LOG_GENERAL)
  {
    enabled_flag= &opt_log;
    log_file= file_log_handler->get_mysql_log();
  }
  else
  {
    MY_ASSERT_UNREACHABLE();
  }

  /* The flag is tested before the lock is taken */
  if (!(*enabled_flag))
    return;

  lock_exclusive();
  log_file->close(0);
  *enabled_flag= FALSE;
  unlock();
}


1543 1544 1545
/* the parameters are unused for the log tables */
bool Log_to_csv_event_handler::init()
{
1546
  return 0;
1547 1548
}

1549 1550 1551
/*
  Install the handler lists for the error, slow and general logs.

  When table logging is requested for the slow or the general log but the
  log tables are not initialized, LOG_TABLE is replaced by LOG_FILE in
  both of them and an error is printed. The lists are rebuilt under the
  exclusive logger lock.

  RETURN
    0 - always
*/
int LOGGER::set_handlers(ulonglong error_log_printer,
                         ulonglong slow_log_printer,
                         ulonglong general_log_printer)
{
  /* error log table is not supported yet */
  DBUG_ASSERT(error_log_printer < LOG_TABLE);

  const bool table_requested=
    ((slow_log_printer | general_log_printer) & LOG_TABLE) != 0;

  lock_exclusive();

  if (table_requested && !is_log_tables_initialized)
  {
    /* Fall back from table logging to file logging for both logs */
    slow_log_printer= (slow_log_printer & ~LOG_TABLE) | LOG_FILE;
    general_log_printer= (general_log_printer & ~LOG_TABLE) | LOG_FILE;

    sql_print_error("Failed to initialize log tables. "
                    "Falling back to the old-fashioned logs");
  }

  init_error_log(error_log_printer);
  init_slow_log(slow_log_printer);
  init_general_log(general_log_printer);

  unlock();

  return 0;
}

1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596
 /*
  Save position of binary log transaction cache.

  SYNOPSIS
    binlog_trans_log_savepos()

    thd      The thread to take the binlog data from
    pos      Pointer to variable where the position will be stored

  DESCRIPTION

    Save the current position in the binary log transaction cache into
    the variable pointed to by 'pos'. The cache manager is obtained
    through thd->binlog_setup_trx_data().
 */

static void
binlog_trans_log_savepos(THD *thd, my_off_t *pos)
{
  DBUG_ENTER("binlog_trans_log_savepos");
  DBUG_ASSERT(pos != NULL);

  binlog_cache_mngr *const mngr= thd->binlog_setup_trx_data();
  DBUG_ASSERT((WSREP(thd) && wsrep_emulate_bin_log) || mysql_bin_log.is_open());

  my_off_t byte_pos= mngr->trx_cache.get_byte_position();
  *pos= byte_pos;

  DBUG_PRINT("return", ("*pos: %lu", (ulong) *pos));
  DBUG_VOID_RETURN;
}


/*
  Truncate the binary log transaction cache.

  SYNOPSIS
    binlog_trans_log_truncate()

    thd      The thread to take the binlog data from
    pos      Position to truncate to

  DESCRIPTION

    Truncate the binary log to the given position. Will not change
    anything else.

 */
static void
binlog_trans_log_truncate(THD *thd, my_off_t pos)
{
  DBUG_ENTER("binlog_trans_log_truncate");
  DBUG_PRINT("enter", ("pos: %lu", (ulong) pos));

  binlog_cache_mngr *const mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  /* The binlog data of the thread must already exist */
  DBUG_ASSERT(mngr != NULL);
  /* pos is ~0 only if binlog_trans_log_savepos() wasn't called before */
  DBUG_ASSERT(pos != ~(my_off_t) 0);

  mngr->trx_cache.restore_savepoint(pos);
  DBUG_VOID_RETURN;
}


1637 1638
/*
  this function is mostly a placeholder.
1639
  conceptually, binlog initialization (now mostly done in MYSQL_BIN_LOG::open)
1640 1641 1642
  should be moved here.
*/

1643
int binlog_init(void *p)
1644
{
1645
  binlog_hton= (handlerton *)p;
1646 1647
  binlog_hton->state= (WSREP_ON || opt_bin_log) ? SHOW_OPTION_YES
                                                : SHOW_OPTION_NO;
1648 1649 1650 1651 1652
  binlog_hton->db_type=DB_TYPE_BINLOG;
  binlog_hton->savepoint_offset= sizeof(my_off_t);
  binlog_hton->close_connection= binlog_close_connection;
  binlog_hton->savepoint_set= binlog_savepoint_set;
  binlog_hton->savepoint_rollback= binlog_savepoint_rollback;
1653 1654
  binlog_hton->savepoint_rollback_can_release_mdl=
                                     binlog_savepoint_rollback_can_release_mdl;
1655 1656 1657
  binlog_hton->commit= binlog_commit;
  binlog_hton->rollback= binlog_rollback;
  binlog_hton->prepare= binlog_prepare;
1658
  binlog_hton->start_consistent_snapshot= binlog_start_consistent_snapshot;
1659
  binlog_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
unknown's avatar
unknown committed
1660
  return 0;
1661 1662
}

1663 1664 1665
#ifdef WITH_WSREP
#include "wsrep_binlog.h"
#endif /* WITH_WSREP */
1666
/*
  close_connection callback of the binlog handlerton.

  Detaches the binlog cache manager from the thread, runs its destructor
  and frees its memory. Both caches are expected to be empty at this point
  (debug assertion). With wsrep compiled in, a non-empty transaction cache
  is reported with a warning and dumped before that.

  hton is not used; the global binlog_hton is used instead.

  RETURN
    0 - always
*/
static int binlog_close_connection(handlerton *hton, THD *thd)
{
  DBUG_ENTER("binlog_close_connection");
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
#ifdef WITH_WSREP
  if (cache_mngr && !cache_mngr->trx_cache.empty()) {
    IO_CACHE* cache= get_trans_log(thd);
    uchar *buf;
    size_t len=0;
    wsrep_write_cache_buf(cache, &buf, &len);
    /*
      %lu expects an unsigned long; size_t is not that type on every
      platform, so the arguments are cast explicitly.
    */
    WSREP_WARN("binlog trx cache not empty (%lu bytes) @ connection close %lu",
               (ulong) len, (ulong) thd->thread_id);
    if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);

    cache = cache_mngr->get_binlog_cache_log(false);
    wsrep_write_cache_buf(cache, &buf, &len);
    WSREP_WARN("binlog stmt cache not empty (%lu bytes) @ connection close %lu",
               (ulong) len, (ulong) thd->thread_id);
    if (len > 0) wsrep_dump_rbr_buf(thd, buf, len);
  }
#endif /* WITH_WSREP */
  /*
    NOTE(review): the wsrep branch above tests cache_mngr for NULL while
    the code below dereferences it unconditionally - presumably the
    callback is only invoked when the binlog data exists; confirm.
  */
  DBUG_ASSERT(cache_mngr->trx_cache.empty() && cache_mngr->stmt_cache.empty());
  thd_set_ha_data(thd, binlog_hton, NULL);
  /* Explicit destructor call followed by my_free() releases the object */
  cache_mngr->~binlog_cache_mngr();
  my_free(cache_mngr);
  DBUG_RETURN(0);
}

1695
/*
1696
  This function flushes a cache upon commit/rollback.
1697 1698

  SYNOPSIS
Sergei Golubchik's avatar
Sergei Golubchik committed
1699
    binlog_flush_cache()
1700

Sergei Golubchik's avatar
Sergei Golubchik committed
1701 1702 1703 1704 1705 1706 1707
    thd        The thread whose transaction should be ended
    cache_mngr Pointer to the binlog_cache_mngr to use
    all        True if the entire transaction should be ended, false if
               only the statement transaction should be ended.
    end_ev     The end event to use (COMMIT, ROLLBACK, or commit XID)
    using_stmt True if the statement cache should be flushed
    using_trx  True if the transaction cache should be flushed
1708 1709 1710

  DESCRIPTION

Sergei Golubchik's avatar
Sergei Golubchik committed
1711
    End the current transaction or statement. The transaction can be either
1712
    a real transaction or a statement transaction.
1713

1714 1715 1716
    This can be to commit a transaction, with a COMMIT query event or an XA
    commit XID event. But it can also be to rollback a transaction with a
    ROLLBACK query event, used for rolling back transactions which also
Sergei Golubchik's avatar
Sergei Golubchik committed
1717 1718
    contain updates to non-transactional tables. Or it can be a flush of
    a statement cache.
1719
 */
1720

1721
static int
Sergei Golubchik's avatar
Sergei Golubchik committed
1722 1723 1724
binlog_flush_cache(THD *thd, binlog_cache_mngr *cache_mngr,
                   Log_event *end_ev, bool all, bool using_stmt,
                   bool using_trx)
1725
{
1726
  int error= 0;
Sergei Golubchik's avatar
Sergei Golubchik committed
1727
  DBUG_ENTER("binlog_flush_cache");
1728
  DBUG_PRINT("enter", ("end_ev: %p", end_ev));
1729

Sergei Golubchik's avatar
Sergei Golubchik committed
1730 1731
  if ((using_stmt && !cache_mngr->stmt_cache.empty()) ||
      (using_trx && !cache_mngr->trx_cache.empty()))
1732
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
1733 1734 1735
    if (using_stmt && thd->binlog_flush_pending_rows_event(TRUE, FALSE))
      DBUG_RETURN(1);
    if (using_trx && thd->binlog_flush_pending_rows_event(TRUE, TRUE))
1736
      DBUG_RETURN(1);
Sergei Golubchik's avatar
Sergei Golubchik committed
1737

1738 1739 1740 1741 1742 1743
    /*
      Doing a commit or a rollback including non-transactional tables,
      i.e., ending a transaction where we might write the transaction
      cache to the binary log.

      We can always end the statement when ending a transaction since
Sergei Golubchik's avatar
Sergei Golubchik committed
1744
      transactions are not allowed inside stored functions.  If they
1745 1746 1747
      were, we would have to ensure that we're not ending a statement
      inside a stored function.
    */
Sergei Golubchik's avatar
Sergei Golubchik committed
1748 1749 1750
    error= mysql_bin_log.write_transaction_to_binlog(thd, cache_mngr,
                                                     end_ev, all,
                                                     using_stmt, using_trx);
1751
  }
1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763
  else
  {
    /*
      This can happen in row-format binlog with something like
          BEGIN; INSERT INTO nontrans_table; INSERT IGNORE INTO trans_table;
      The nontrans_table is written directly into the binlog before commit,
      and if the trans_table is ignored there will be no rows to write when
      we get here.

      So there is no work to do. Therefore, we will not increment any XID
      count, so we must not decrement any XID count in unlog().
    */
1764
    cache_mngr->need_unlog= 0;
1765
  }
Sergei Golubchik's avatar
Sergei Golubchik committed
1766
  cache_mngr->reset(using_stmt, using_trx);
1767

Sergei Golubchik's avatar
Sergei Golubchik committed
1768 1769
  DBUG_ASSERT((!using_stmt || cache_mngr->stmt_cache.empty()) &&
              (!using_trx || cache_mngr->trx_cache.empty()));
1770 1771 1772
  DBUG_RETURN(error);
}

1773

1774 1775 1776 1777 1778 1779 1780 1781 1782 1783
/**
  This function flushes the stmt-cache upon commit.

  @param thd                The thread whose transaction should be flushed
  @param cache_mngr         Pointer to the cache manager

  @return
    nonzero if an error pops up when flushing the cache.
*/
static inline int
Sergei Golubchik's avatar
Sergei Golubchik committed
1784
binlog_commit_flush_stmt_cache(THD *thd, bool all,
1785 1786
                               binlog_cache_mngr *cache_mngr)
{
1787
  DBUG_ENTER("binlog_commit_flush_stmt_cache");
1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
#ifdef WITH_WSREP
  if (thd->wsrep_mysql_replicated > 0)
  {
    DBUG_ASSERT(WSREP_ON);
    WSREP_DEBUG("avoiding binlog_commit_flush_trx_cache: %d",
                thd->wsrep_mysql_replicated);
    return 0;
  }
#endif

1798
  Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
Sergei Golubchik's avatar
Sergei Golubchik committed
1799
                          FALSE, TRUE, TRUE, 0);
1800
  DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, FALSE));
1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812
}

/**
  This function flushes the trx-cache upon commit.

  @param thd                The thread whose transaction should be flushed
  @param cache_mngr         Pointer to the cache manager

  @return
    nonzero if an error pops up when flushing the cache.
*/
static inline int
Sergei Golubchik's avatar
Sergei Golubchik committed
1813
binlog_commit_flush_trx_cache(THD *thd, bool all, binlog_cache_mngr *cache_mngr)
1814
{
1815
  DBUG_ENTER("binlog_commit_flush_trx_cache");
1816
  Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
Sergei Golubchik's avatar
Sergei Golubchik committed
1817
                          TRUE, TRUE, TRUE, 0);
1818
  DBUG_RETURN(binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830
}

/**
  This function flushes the trx-cache upon rollback.

  @param thd                The thread whose transaction should be flushed
  @param cache_mngr         Pointer to the cache manager

  @return
    nonzero if an error pops up when flushing the cache.
*/
static inline int
Sergei Golubchik's avatar
Sergei Golubchik committed
1831 1832
binlog_rollback_flush_trx_cache(THD *thd, bool all,
                                binlog_cache_mngr *cache_mngr)
1833 1834
{
  Query_log_event end_evt(thd, STRING_WITH_LEN("ROLLBACK"),
Sergei Golubchik's avatar
Sergei Golubchik committed
1835 1836
                          TRUE, TRUE, TRUE, 0);
  return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, FALSE, TRUE));
1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849
}

/**
  This function flushes the trx-cache upon commit.

  @param thd                The thread whose transaction should be flushed
  @param cache_mngr         Pointer to the cache manager
  @param xid                Transaction Id

  @return
    nonzero if an error pops up when flushing the cache.
*/
static inline int
Sergei Golubchik's avatar
Sergei Golubchik committed
1850 1851
binlog_commit_flush_xid_caches(THD *thd, binlog_cache_mngr *cache_mngr,
                               bool all, my_xid xid)
1852
{
Sergei Golubchik's avatar
Sergei Golubchik committed
1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869
  if (xid)
  {
    Xid_log_event end_evt(thd, xid, TRUE);
    return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
  }
  else
  {
    /*
      Empty xid occurs in XA COMMIT ... ONE PHASE.
      In this case, we do not have a MySQL xid for the transaction, and the
      external XA transaction coordinator will have to handle recovery if
      needed. So we end the transaction with a plain COMMIT query event.
    */
    Query_log_event end_evt(thd, STRING_WITH_LEN("COMMIT"),
                            TRUE, TRUE, TRUE, 0);
    return (binlog_flush_cache(thd, cache_mngr, &end_evt, all, TRUE, TRUE));
  }
1870 1871
}

1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883
/**
  This function truncates the transactional cache upon committing or rolling
  back either a transaction or a statement.

  @param thd        The thread whose transaction should be flushed
  @param cache_mngr Pointer to the cache data to be flushed
  @param all        @c true means truncate the transaction, otherwise the
                    statement must be truncated.

  @return
    nonzero if an error pops up when truncating the transactional cache.
*/
1884
static int
1885
binlog_truncate_trx_cache(THD *thd, binlog_cache_mngr *cache_mngr, bool all)
1886
{
1887
  DBUG_ENTER("binlog_truncate_trx_cache");
1888
  int error=0;
1889 1890 1891 1892 1893
  /*
    This function handles transactional changes and as such this flag
    equals to true.
  */
  bool const is_transactional= TRUE;
1894

1895
  DBUG_PRINT("info", ("thd->options={ %s %s}, transaction: %s",
unknown's avatar
unknown committed
1896 1897
                      FLAGSTR(thd->variables.option_bits, OPTION_NOT_AUTOCOMMIT),
                      FLAGSTR(thd->variables.option_bits, OPTION_BEGIN),
1898
                      all ? "all" : "stmt"));
1899 1900

  thd->binlog_remove_pending_rows_event(TRUE, is_transactional);
1901
  /*
1902 1903
    If rolling back an entire transaction or a single statement not
    inside a transaction, we reset the transaction cache.
1904
  */
1905
  if (ending_trans(thd, all))
1906
  {
1907
    if (cache_mngr->trx_cache.has_incident())
1908
      error= mysql_bin_log.write_incident(thd);
1909

1910
    thd->clear_binlog_table_maps();
1911

Sergei Golubchik's avatar
Sergei Golubchik committed
1912
    cache_mngr->reset(false, true);
1913
  }
1914 1915 1916 1917
  /*
    If rolling back a statement in a transaction, we truncate the
    transaction cache to remove the statement.
  */
1918
  else
1919
    cache_mngr->trx_cache.restore_prev_position();
1920

1921
  DBUG_ASSERT(thd->binlog_get_pending_rows_event(is_transactional) == NULL);
1922
  DBUG_RETURN(error);
1923 1924
}

1925
/**
  Prepare hook of the binlog handlerton; intentionally a no-op.

  @param hton  Unused.
  @param thd   Unused.
  @param all   Unused.

  @return Always 0.
*/
static int binlog_prepare(handlerton *hton, THD *thd, bool all)
{
  /*
    do nothing.
    just pretend we can do 2pc, so that MySQL won't
    switch to 1pc.
    real work will be done in MYSQL_BIN_LOG::log_and_order()
  */
  return 0;
}

1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952
/*
  We flush the cache wrapped in a begin/rollback if:
    . aborting a single or multi-statement transaction and;
    . the OPTION_KEEP_LOG is active or;
    . the format is STMT and a non-trans table was updated or;
    . the format is MIXED and a temporary non-trans table was
      updated or;
    . the format is MIXED, non-trans table was updated and
      aborting a single statement transaction;

  This function evaluates the last four conditions; it does not itself
  check that a transaction is being aborted.

  NOTE: the cache manager fetched from the thread is dereferenced without
  a NULL check, so callers must only call this when one is attached.
*/
static bool trans_cannot_safely_rollback(THD *thd, bool all)
{
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  return ((thd->variables.option_bits & OPTION_KEEP_LOG) ||
          (trans_has_updated_non_trans_table(thd) &&
           thd->wsrep_binlog_format() == BINLOG_FORMAT_STMT) ||
          (cache_mngr->trx_cache.changes_to_non_trans_temp_table() &&
           thd->wsrep_binlog_format() == BINLOG_FORMAT_MIXED) ||
          (trans_has_updated_non_trans_table(thd) &&
           ending_single_stmt_trans(thd,all) &&
           thd->wsrep_binlog_format() == BINLOG_FORMAT_MIXED));
}


1962 1963 1964
/**
  This function is called once after each statement.

1965
  It has the responsibility to flush the caches to the binary log on commits.
1966 1967 1968

  @param hton  The binlog handlerton.
  @param thd   The client thread that executes the transaction.
1969 1970
  @param all   This is @c true if this is a real transaction commit, and
               @false otherwise.
1971 1972 1973

  @see handlerton::commit
*/
1974
static int binlog_commit(handlerton *hton, THD *thd, bool all)
1975
{
1976
  int error= 0;
1977
  DBUG_ENTER("binlog_commit");
1978 1979
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
1980

1981 1982 1983 1984 1985
  if (!cache_mngr)
  {
    DBUG_ASSERT(WSREP(thd));
    DBUG_RETURN(0);
  }
1986

1987 1988 1989
  DBUG_PRINT("debug",
             ("all: %d, in_transaction: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
              all,
1990
              YESNO(thd->in_multi_stmt_transaction_mode()),
1991 1992 1993 1994 1995
              YESNO(thd->transaction.all.modified_non_trans_table),
              YESNO(thd->transaction.stmt.modified_non_trans_table)));

  if (!cache_mngr->stmt_cache.empty())
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
1996
    error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
1997
  }
1998

1999
  if (cache_mngr->trx_cache.empty())
2000
  {
2001 2002 2003
    /*
      we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
    */
Sergei Golubchik's avatar
Sergei Golubchik committed
2004
    cache_mngr->reset(false, true);
2005
    DBUG_RETURN(error);
2006
  }
2007

2008
  /*
2009 2010
    We commit the transaction if:
     - We are not in a transaction and committing a statement, or
2011 2012
     - We are in a transaction and a full transaction is committed.
    Otherwise, we accumulate the changes.
2013
  */
2014
  if (!error && ending_trans(thd, all))
Sergei Golubchik's avatar
Sergei Golubchik committed
2015
    error= binlog_commit_flush_trx_cache(thd, all, cache_mngr);
2016

2017 2018 2019
  /*
    This is part of the stmt rollback.
  */
2020
  if (!all)
2021
    cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
2022

2023
  DBUG_RETURN(error);
2024 2025
}

2026
/**
2027
  This function is called when a transaction or a statement is rolled back.
2028 2029 2030

  @param hton  The binlog handlerton.
  @param thd   The client thread that executes the transaction.
2031 2032
  @param all   This is @c true if this is a real transaction rollback, and
               @false otherwise.
2033 2034 2035

  @see handlerton::rollback
*/
2036
static int binlog_rollback(handlerton *hton, THD *thd, bool all)
2037 2038
{
  DBUG_ENTER("binlog_rollback");
2039
  int error= 0;
2040 2041
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
2042

2043 2044 2045 2046 2047 2048
  if (!cache_mngr)
  {
    DBUG_ASSERT(WSREP(thd));
    DBUG_RETURN(0);
  }

2049 2050 2051 2052
  DBUG_PRINT("debug", ("all: %s, all.modified_non_trans_table: %s, stmt.modified_non_trans_table: %s",
                       YESNO(all),
                       YESNO(thd->transaction.all.modified_non_trans_table),
                       YESNO(thd->transaction.stmt.modified_non_trans_table)));
2053 2054 2055 2056 2057 2058 2059

  /*
    If an incident event is set we do not flush the content of the statement
    cache because it may be corrupted.
  */
  if (cache_mngr->stmt_cache.has_incident())
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
2060 2061
    error= mysql_bin_log.write_incident(thd);
    cache_mngr->reset(true, false);
2062 2063 2064
  }
  else if (!cache_mngr->stmt_cache.empty())
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
2065
    error= binlog_commit_flush_stmt_cache(thd, all, cache_mngr);
2066 2067
  }

2068
  if (cache_mngr->trx_cache.empty())
2069
  {
2070
    /*
2071 2072
      we're here because cache_log was flushed in MYSQL_BIN_LOG::log_xid()
    */
Sergei Golubchik's avatar
Sergei Golubchik committed
2073
    cache_mngr->reset(false, true);
2074
    DBUG_RETURN(error);
2075
  }
2076
  if (!wsrep_emulate_bin_log && mysql_bin_log.check_write_error(thd))
2077
  {
2078
    /*
2079 2080 2081 2082
      "all == true" means that a "rollback statement" triggered the error and
      this function was called. However, this must not happen as a rollback
      is written directly to the binary log. And in auto-commit mode, a single
      statement that is rolled back has the flag all == false.
2083
    */
2084
    DBUG_ASSERT(!all);
2085
    /*
2086 2087
      We reach this point if the effect of a statement did not properly get into
      a cache and need to be rolled back.
2088
    */
2089
    error |= binlog_truncate_trx_cache(thd, cache_mngr, all);
2090
  }
2091
  else if (!error)
2092
  {  
2093
    if (ending_trans(thd, all) && trans_cannot_safely_rollback(thd, all))
Sergei Golubchik's avatar
Sergei Golubchik committed
2094
      error= binlog_rollback_flush_trx_cache(thd, all, cache_mngr);
2095
    /*
2096 2097
      Truncate the cache if:
        . aborting a single or multi-statement transaction or;
2098
        . the OPTION_KEEP_LOG is not active and;
2099 2100 2101 2102
        . the format is not STMT or no non-trans table was
          updated and;
        . the format is not MIXED or no temporary non-trans table
          was updated.
2103
    */
2104 2105
    else if (ending_trans(thd, all) ||
             (!(thd->variables.option_bits & OPTION_KEEP_LOG) &&
2106
              (!stmt_has_updated_non_trans_table(thd) ||
2107
               thd->wsrep_binlog_format() != BINLOG_FORMAT_STMT) &&
2108
              (!cache_mngr->trx_cache.changes_to_non_trans_temp_table() ||
2109
               thd->wsrep_binlog_format() != BINLOG_FORMAT_MIXED)))
2110
      error= binlog_truncate_trx_cache(thd, cache_mngr, all);
2111
  }
2112 2113 2114 2115

  /* 
    This is part of the stmt rollback.
  */
2116
  if (!all)
2117 2118
    cache_mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);

2119 2120 2121
  DBUG_RETURN(error);
}

2122 2123 2124

/**
  Drop any pending rows event and reset both binlog caches of a thread.

  Does nothing when @c opt_bin_log is off or when no cache manager is
  attached to the thread.

  @param thd  The thread whose binlog caches should be reset.
*/
void binlog_reset_cache(THD *thd)
{
  binlog_cache_mngr *const cache_mngr= opt_bin_log ?
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton) : 0;
  DBUG_ENTER("binlog_reset_cache");
  if (cache_mngr)
  {
    thd->binlog_remove_pending_rows_event(TRUE, TRUE);
    cache_mngr->reset(true, true);
  }
  DBUG_VOID_RETURN;
}


2137
/**
  Flag the binary log as having hit a write error and report it.

  Always sets @c write_error. An error is raised only when
  check_write_error() says that no binlog write error is already
  recorded for the thread:
    - when my_errno is EFBIG, ER_TRANS_CACHE_FULL or ER_STMT_CACHE_FULL
      depending on @c is_transactional;
    - otherwise ER_ERROR_ON_WRITE.

  @param thd               The thread on which the error is reported.
  @param is_transactional  Selects the "transaction cache full" message
                           over the "statement cache full" one.
*/
void MYSQL_BIN_LOG::set_write_error(THD *thd, bool is_transactional)
{
  DBUG_ENTER("MYSQL_BIN_LOG::set_write_error");

  write_error= 1;

  if (check_write_error(thd))
    DBUG_VOID_RETURN;

  if (my_errno == EFBIG)
  {
    if (is_transactional)
    {
      my_message(ER_TRANS_CACHE_FULL, ER_THD(thd, ER_TRANS_CACHE_FULL), MYF(MY_WME));
    }
    else
    {
      my_message(ER_STMT_CACHE_FULL, ER_THD(thd, ER_STMT_CACHE_FULL), MYF(MY_WME));
    }
  }
  else
  {
    /*
      NOTE(review): the branch above tests my_errno but this message
      reports errno; confirm that the two are meant to differ here.
    */
    my_error(ER_ERROR_ON_WRITE, MYF(MY_WME), name, errno);
  }

  DBUG_VOID_RETURN;
}

/**
  Tell whether the thread's current error is a binlog write error.

  @param thd  The thread whose diagnostics area is inspected.

  @return
    TRUE if the thread is in error state and its error number is one of
    ER_TRANS_CACHE_FULL, ER_STMT_CACHE_FULL, ER_ERROR_ON_WRITE or
    ER_BINLOG_LOGGING_IMPOSSIBLE; FALSE otherwise.
*/
bool MYSQL_BIN_LOG::check_write_error(THD *thd)
{
  DBUG_ENTER("MYSQL_BIN_LOG::check_write_error");

  bool checked= FALSE;

  if (!thd->is_error())
    DBUG_RETURN(checked);

  switch (thd->get_stmt_da()->sql_errno())
  {
    case ER_TRANS_CACHE_FULL:
    case ER_STMT_CACHE_FULL:
    case ER_ERROR_ON_WRITE:
    case ER_BINLOG_LOGGING_IMPOSSIBLE:
      checked= TRUE;
    break;
  }

  DBUG_RETURN(checked);
}

Sergey Petrunya's avatar
Sergey Petrunya committed
2187

unknown's avatar
unknown committed
2188 2189 2190 2191 2192 2193
/**
  @note
  How do we handle this (unlikely but legal) case:
  @verbatim
    [transaction] + [update to non-trans table] + [rollback to savepoint] ?
  @endverbatim
2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211
  The problem occurs when a savepoint is before the update to the
  non-transactional table. Then when there's a rollback to the savepoint, if we
  simply truncate the binlog cache, we lose the part of the binlog cache where
  the update is. If we want to not lose it, we need to write the SAVEPOINT
  command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter
  is easy: it's just write at the end of the binlog cache, but the former
  should be *inserted* to the place where the user called SAVEPOINT. The
  solution is that when the user calls SAVEPOINT, we write it to the binlog
  cache (so no need to later insert it). As transactions are never intermixed
  in the binary log (i.e. they are serialized), we won't have conflicts with
  savepoint names when using mysqlbinlog or in the slave SQL thread.
  Then when ROLLBACK TO SAVEPOINT is called, if we updated some
  non-transactional table, we don't truncate the binlog cache but instead write
  ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which
  will chop the SAVEPOINT command from the binlog cache, which is good as in
  that case there is no need to have it in the binlog).
*/

2212
static int binlog_savepoint_set(handlerton *hton, THD *thd, void *sv)
2213
{
Sergei Golubchik's avatar
Sergei Golubchik committed
2214
  int error= 1;
2215 2216 2217 2218
  DBUG_ENTER("binlog_savepoint_set");

  if (wsrep_emulate_bin_log)
    DBUG_RETURN(0);
2219

Sergei Golubchik's avatar
Sergei Golubchik committed
2220 2221 2222
  char buf[1024];
  String log_query(buf, sizeof(buf), &my_charset_bin);
  if (log_query.copy(STRING_WITH_LEN("SAVEPOINT "), &my_charset_bin) ||
unknown's avatar
unknown committed
2223 2224
      append_identifier(thd, &log_query,
                        thd->lex->ident.str, thd->lex->ident.length))
2225
    DBUG_RETURN(1);
2226
  int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
Sergei Golubchik's avatar
Sergei Golubchik committed
2227
  Query_log_event qinfo(thd, log_query.c_ptr_safe(), log_query.length(),
2228
                        TRUE, FALSE, TRUE, errcode);
Sergei Golubchik's avatar
Sergei Golubchik committed
2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245
  /* 
    We cannot record the position before writing the statement
    because a rollback to a savepoint (.e.g. consider it "S") would
    prevent the savepoint statement (i.e. "SAVEPOINT S") from being
    written to the binary log despite the fact that the server could
    still issue other rollback statements to the same savepoint (i.e. 
    "S"). 
    Given that the savepoint is valid until the server releases it,
    ie, until the transaction commits or it is released explicitly,
    we need to log it anyway so that we don't have "ROLLBACK TO S"
    or "RELEASE S" without the preceding "SAVEPOINT S" in the binary
    log.
  */
  if (!(error= mysql_bin_log.write(&qinfo)))
    binlog_trans_log_savepos(thd, (my_off_t*) sv);

  DBUG_RETURN(error);
2246 2247
}

2248
/**
  Handle ROLLBACK TO SAVEPOINT for the binary log.

  @param hton  The binlog handlerton (unused).
  @param thd   The client thread; the savepoint name is taken from
               thd->lex->ident.
  @param sv    Position previously stored for the savepoint; read as a
               my_off_t.

  @return
    0 after truncating the cache; 1 if the statement text could not be
    built; otherwise the result of mysql_bin_log.write().
*/
static int binlog_savepoint_rollback(handlerton *hton, THD *thd, void *sv)
{
  DBUG_ENTER("binlog_savepoint_rollback");

  /*
    Write ROLLBACK TO SAVEPOINT to the binlog cache if we have updated some
    non-transactional table. Otherwise, truncate the binlog cache starting
    from the SAVEPOINT command.
  */
  if (!wsrep_emulate_bin_log &&
      unlikely(trans_has_updated_non_trans_table(thd) ||
               (thd->variables.option_bits & OPTION_KEEP_LOG)))
  {
    char buf[1024];
    String log_query(buf, sizeof(buf), &my_charset_bin);
    if (log_query.copy(STRING_WITH_LEN("ROLLBACK TO "), &my_charset_bin) ||
        append_identifier(thd, &log_query,
                          thd->lex->ident.str, thd->lex->ident.length))
      DBUG_RETURN(1);
    int errcode= query_error_code(thd, thd->killed == NOT_KILLED);
    Query_log_event qinfo(thd, log_query.ptr(), log_query.length(),
                          TRUE, FALSE, TRUE, errcode);
    DBUG_RETURN(mysql_bin_log.write(&qinfo));
  }

  binlog_trans_log_truncate(thd, *(my_off_t*)sv);

  DBUG_RETURN(0);
}

unknown's avatar
unknown committed
2278

2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302
/**
  Tell whether MDL locks taken after a SAVEPOINT may be released once the
  transaction has been rolled back to that savepoint.

  @param hton  The binlog handlerton.
  @param thd   The client thread that executes the transaction.

  @return true  - It is safe to release MDL locks.
          false - If it is not.
*/
static bool binlog_savepoint_rollback_can_release_mdl(handlerton *hton,
                                                      THD *thd)
{
  DBUG_ENTER("binlog_savepoint_rollback_can_release_mdl");
  /*
    The answer is the negation of trans_cannot_safely_rollback() for a
    full transaction: when a rollback to savepoint can simply truncate
    the binlog cache from the SAVEPOINT command on, the MDL acquired
    after that SAVEPOINT command can be released as well.
  */
  const bool must_keep_log= trans_cannot_safely_rollback(thd, true);
  DBUG_RETURN(!must_keep_log);
}


2303 2304
/**
  Read the first bytes of a binary log and compare them with BINLOG_MAGIC.

  The cache must be positioned at offset 0 (asserted in debug builds).

  @param log          IO_CACHE to read from.
  @param[out] errmsg  Set to a static message when 1 is returned.

  @return
    0 if the magic number matches, 1 on a read error or a mismatch.
*/
int check_binlog_magic(IO_CACHE* log, const char** errmsg)
{
  uchar magic[4];
  DBUG_ASSERT(my_b_tell(log) == 0);

  if (my_b_read(log, magic, sizeof(magic)))
  {
    *errmsg = "I/O error reading the header from the binary log";
    sql_print_error("%s, errno=%d, io cache code=%d", *errmsg, my_errno,
		    log->error);
    return 1;
  }
  if (bcmp(magic, BINLOG_MAGIC, sizeof(magic)))
  {
    *errmsg = "Binlog has bad magic number;  It's not a binary log file that can be used by this version of MySQL";
    return 1;
  }
  return 0;
}

unknown's avatar
unknown committed
2323

2324 2325 2326 2327 2328
/**
  Open a binary log file read-only, attach a read cache to it and verify
  its magic number.

  @param log            IO_CACHE to initialise on the opened file.
  @param log_file_name  Name of the file to open.
  @param[out] errmsg    Set to a static message when -1 is returned.

  @return
    The opened file descriptor on success, -1 on failure. On failure
    after the file was opened, the file is closed and end_io_cache() is
    called on @c log.
*/
File open_binlog(IO_CACHE *log, const char *log_file_name, const char **errmsg)
{
  File file;
  DBUG_ENTER("open_binlog");

  if ((file= mysql_file_open(key_file_binlog,
                             log_file_name, O_RDONLY | O_BINARY | O_SHARE,
                             MYF(MY_WME))) < 0)
  {
    sql_print_error("Failed to open log (file '%s', errno %d)",
                    log_file_name, my_errno);
    *errmsg = "Could not open log file";
    goto err;
  }
  if (init_io_cache(log, file, IO_SIZE*2, READ_CACHE, 0, 0,
                    MYF(MY_WME|MY_DONT_CHECK_FILESIZE)))
  {
    sql_print_error("Failed to create a cache on log (file '%s')",
                    log_file_name);
    *errmsg = "Could not open log file";
    goto err;
  }
  if (check_binlog_magic(log,errmsg))
    goto err;
  DBUG_RETURN(file);

err:
  if (file >= 0)
  {
    mysql_file_close(file, MYF(0));
    end_io_cache(log);
  }
  DBUG_RETURN(-1);
}
unknown's avatar
unknown committed
2358

Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
2359
#ifdef _WIN32
/* Set to 1 by the first call to setup_windows_event_source(). */
static int eventSource = 0;

/**
  Register this executable as the "MySQL" event source of the Windows
  Application event log.

  Only the first call does any work. Registry failures are not reported;
  if the registry key cannot be created, nothing else is attempted.
*/
static void setup_windows_event_source()
{
  HKEY    hRegKey= NULL;
  DWORD   dwError= 0;
  TCHAR   szPath[MAX_PATH];
  DWORD dwTypes;

  if (eventSource)               // Ensure that we are only called once
    return;
  eventSource= 1;

  // Create the event source registry key
  dwError= RegCreateKey(HKEY_LOCAL_MACHINE,
                          "SYSTEM\\CurrentControlSet\\Services\\EventLog\\Application\\MySQL",
                          &hRegKey);
  /* Without a valid key handle the calls below could only fail. */
  if (dwError != ERROR_SUCCESS)
    return;

  /* Name of the PE module that contains the message resource */
  GetModuleFileName(NULL, szPath, MAX_PATH);

  /* Register EventMessageFile */
  dwError = RegSetValueEx(hRegKey, "EventMessageFile", 0, REG_EXPAND_SZ,
                          (PBYTE) szPath, (DWORD) (strlen(szPath) + 1));

  /* Register supported event types */
  dwTypes= (EVENTLOG_ERROR_TYPE | EVENTLOG_WARNING_TYPE |
            EVENTLOG_INFORMATION_TYPE);
  dwError= RegSetValueEx(hRegKey, "TypesSupported", 0, REG_DWORD,
                         (LPBYTE) &dwTypes, sizeof dwTypes);

  RegCloseKey(hRegKey);
}

#endif /* _WIN32 */
unknown's avatar
unknown committed
2395 2396


unknown's avatar
unknown committed
2397 2398 2399
/**
  Find a unique filename for 'filename.#'.

2400 2401 2402 2403 2404 2405
  Set '#' to the number next to the maximum found in the most
  recent log file extension.

  This function will return nonzero if: (i) the generated name
  exceeds FN_REFLEN; (ii) if the number of extensions is exhausted;
  or (iii) some other error happened while examining the filesystem.
unknown's avatar
unknown committed
2406 2407

  @return
2408
    nonzero if not possible to get unique filename.
unknown's avatar
unknown committed
2409
*/
unknown's avatar
unknown committed
2410

2411
static int find_uniq_filename(char *name, ulong next_log_number)
unknown's avatar
unknown committed
2412
{
2413
  uint                  i;
2414
  char                  buff[FN_REFLEN], ext_buf[FN_REFLEN];
2415
  struct st_my_dir     *dir_info;
unknown's avatar
unknown committed
2416
  reg1 struct fileinfo *file_info;
2417
  ulong                 max_found, next, number;
2418 2419
  size_t		buf_length, length;
  char			*start, *end;
2420
  int                   error= 0;
unknown's avatar
unknown committed
2421 2422
  DBUG_ENTER("find_uniq_filename");

2423 2424 2425
  length= dirname_part(buff, name, &buf_length);
  start=  name + length;
  end=    strend(start);
2426

unknown's avatar
unknown committed
2427
  *end='.';
2428
  length= (size_t) (end - start + 1);
unknown's avatar
unknown committed
2429

2430
  if ((DBUG_EVALUATE_IF("error_unique_log_filename", 1, 
Luis Soares's avatar
Luis Soares committed
2431
      !(dir_info= my_dir(buff,MYF(MY_DONT_SORT))))))
unknown's avatar
unknown committed
2432 2433
  {						// This shouldn't happen
    strmov(end,".1");				// use name+1
2434
    DBUG_RETURN(1);
unknown's avatar
unknown committed
2435 2436
  }
  file_info= dir_info->dir_entry;
2437
  max_found= next_log_number ? next_log_number-1 : 0;
Sergei Golubchik's avatar
Sergei Golubchik committed
2438
  for (i= dir_info->number_of_files ; i-- ; file_info++)
unknown's avatar
unknown committed
2439
  {
2440
    if (strncmp(file_info->name, start, length) == 0 &&
unknown's avatar
unknown committed
2441 2442 2443 2444 2445 2446 2447
	test_if_number(file_info->name+length, &number,0))
    {
      set_if_bigger(max_found,(ulong) number);
    }
  }
  my_dirend(dir_info);

2448
  /* check if reached the maximum possible extension number */
2449
  if (max_found >= MAX_LOG_UNIQUE_FN_EXT)
2450 2451 2452 2453 2454 2455 2456 2457 2458
  {
    sql_print_error("Log filename extension number exhausted: %06lu. \
Please fix this by archiving old logs and \
updating the index files.", max_found);
    error= 1;
    goto end;
  }

  next= max_found + 1;
Luis Soares's avatar
Luis Soares committed
2459 2460 2461 2462 2463
  if (sprintf(ext_buf, "%06lu", next)<0)
  {
    error= 1;
    goto end;
  }
unknown's avatar
unknown committed
2464
  *end++='.';
2465 2466 2467 2468 2469 2470 2471 2472

  /* 
    Check if the generated extension size + the file name exceeds the
    buffer size used. If one did not check this, then the filename might be
    truncated, resulting in error.
   */
  if (((strlen(ext_buf) + (end - name)) >= FN_REFLEN))
  {
2473
    sql_print_error("Log filename too large: %s%s (%zu). \
2474 2475 2476 2477 2478 2479
Please fix this by archiving old logs and updating the \
index files.", name, ext_buf, (strlen(ext_buf) + (end - name)));
    error= 1;
    goto end;
  }

Luis Soares's avatar
Luis Soares committed
2480 2481 2482 2483 2484
  if (sprintf(end, "%06lu", next)<0)
  {
    error= 1;
    goto end;
  }
2485 2486 2487 2488 2489 2490 2491 2492 2493

  /* print warning if reaching the end of available extensions. */
  if ((next > (MAX_LOG_UNIQUE_FN_EXT - LOG_WARN_UNIQUE_FN_EXT_LEFT)))
    sql_print_warning("Next log extension: %lu. \
Remaining log filename extensions: %lu. \
Please consider archiving some logs.", next, (MAX_LOG_UNIQUE_FN_EXT - next));

end:
  DBUG_RETURN(error);
unknown's avatar
unknown committed
2494 2495
}

2496

2497
void MYSQL_LOG::init(enum_log_type log_type_arg,
2498
                     enum cache_type io_cache_type_arg)
2499 2500 2501 2502 2503 2504 2505 2506 2507
{
  DBUG_ENTER("MYSQL_LOG::init");
  log_type= log_type_arg;
  io_cache_type= io_cache_type_arg;
  DBUG_PRINT("info",("log_type: %d", log_type));
  DBUG_VOID_RETURN;
}


2508 2509
bool MYSQL_LOG::init_and_set_log_file_name(const char *log_name,
                                           const char *new_name,
2510
                                           ulong next_log_number,
2511 2512 2513 2514 2515
                                           enum_log_type log_type_arg,
                                           enum cache_type io_cache_type_arg)
{
  init(log_type_arg, io_cache_type_arg);

2516 2517 2518 2519 2520 2521
  if (new_name)
  {
    strmov(log_file_name, new_name);
  }
  else if (!new_name && generate_new_name(log_file_name, log_name,
                                          next_log_number))
2522 2523 2524 2525 2526 2527
    return TRUE;

  return FALSE;
}


2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548
/*
  Open a (new) log file.

  SYNOPSIS
    open()

    log_name            The name of the log to open
    log_type_arg        The type of the log. E.g. LOG_NORMAL
    new_name            The new name for the logfile. This is only needed
                        when the method is used to open the binlog file.
    io_cache_type_arg   The type of the IO_CACHE to use for this log file

  DESCRIPTION
    Open the logfile, init IO_CACHE and write startup messages
    (in case of general and slow query logs).

  RETURN VALUES
    0   ok
    1   error
*/

2549 2550 2551 2552 2553
bool MYSQL_LOG::open(
#ifdef HAVE_PSI_INTERFACE
                     PSI_file_key log_file_key,
#endif
                     const char *log_name, enum_log_type log_type_arg,
2554 2555
                     const char *new_name, ulong next_log_number,
                     enum cache_type io_cache_type_arg)
2556 2557
{
  char buff[FN_REFLEN];
2558
  MY_STAT f_stat;
2559
  File file= -1;
2560
  my_off_t seek_offset;
2561
  bool is_fifo = false;
2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573
  int open_flags= O_CREAT | O_BINARY;
  DBUG_ENTER("MYSQL_LOG::open");
  DBUG_PRINT("enter", ("log_type: %d", (int) log_type_arg));

  write_error= 0;

  if (!(name= my_strdup(log_name, MYF(MY_WME))))
  {
    name= (char *)log_name; // for the error message
    goto err;
  }

2574 2575 2576 2577 2578 2579 2580
  /*
    log_type is LOG_UNKNOWN if we should not generate a new name
    This is only used when called from MYSQL_BINARY_LOG::open, which
    has already updated log_file_name.
   */
  if (log_type_arg != LOG_UNKNOWN &&
      init_and_set_log_file_name(name, new_name, next_log_number,
2581
                                 log_type_arg, io_cache_type_arg))
2582 2583
    goto err;

2584 2585
  is_fifo = my_stat(log_file_name, &f_stat, MYF(0)) &&
            MY_S_ISFIFO(f_stat.st_mode);
2586

2587 2588 2589 2590 2591
  if (io_cache_type == SEQ_READ_APPEND)
    open_flags |= O_RDWR | O_APPEND;
  else
    open_flags |= O_WRONLY | (log_type == LOG_BIN ? 0 : O_APPEND);

2592 2593 2594
  if (is_fifo)
    open_flags |= O_NONBLOCK;

2595 2596
  db[0]= 0;

2597 2598 2599 2600 2601
#ifdef HAVE_PSI_INTERFACE
  /* Keep the key for reopen */
  m_log_file_key= log_file_key;
#endif

2602 2603 2604 2605
  if ((file= mysql_file_open(log_file_key, log_file_name, open_flags,
                             MYF(MY_WME | ME_WAITTANG))) < 0)
    goto err;

2606 2607 2608
  if (is_fifo)
    seek_offset= 0;
  else if ((seek_offset= mysql_file_tell(file, MYF(MY_WME))))
2609 2610 2611
    goto err;

  if (init_io_cache(&log_file, file, IO_SIZE, io_cache_type, seek_offset, 0,
2612 2613 2614 2615
                    MYF(MY_WME | MY_NABP |
                        ((log_type == LOG_BIN) ? MY_WAIT_IF_FULL : 0))))
    goto err;

2616
  if (log_type == LOG_NORMAL)
2617 2618
  {
    char *end;
2619
    int len=my_snprintf(buff, sizeof(buff), "%s, Version: %s (%s). "
2620
#ifdef EMBEDDED_LIBRARY
2621 2622
                        "embedded library\n",
                        my_progname, server_version, MYSQL_COMPILATION_COMMENT
Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
2623
#elif _WIN32
2624
			"started with:\nTCP Port: %d, Named Pipe: %s\n",
2625 2626
                        my_progname, server_version, MYSQL_COMPILATION_COMMENT,
                        mysqld_port, mysqld_unix_port
2627 2628
#else
			"started with:\nTcp port: %d  Unix socket: %s\n",
2629 2630
                        my_progname, server_version, MYSQL_COMPILATION_COMMENT,
                        mysqld_port, mysqld_unix_port
2631 2632 2633 2634
#endif
                       );
    end= strnmov(buff + len, "Time                 Id Command    Argument\n",
                 sizeof(buff) - len);
2635
    if (my_b_write(&log_file, (uchar*) buff, (uint) (end-buff)) ||
2636 2637 2638 2639
	flush_io_cache(&log_file))
      goto err;
  }

2640
  log_state= LOG_OPENED;
2641 2642 2643 2644 2645 2646 2647 2648
  DBUG_RETURN(0);

err:
  sql_print_error("Could not use %s for logging (error %d). \
Turning logging off for the whole duration of the MySQL server process. \
To turn it on again: fix the cause, \
shutdown the MySQL server and restart it.", name, errno);
  if (file >= 0)
Marc Alff's avatar
Marc Alff committed
2649
    mysql_file_close(file, MYF(0));
2650
  end_io_cache(&log_file);
2651 2652
  my_free(name);
  name= NULL;
2653
  log_state= LOG_CLOSED;
2654 2655 2656
  DBUG_RETURN(1);
}

unknown's avatar
unknown committed
2657
MYSQL_LOG::MYSQL_LOG()
  : name(0),
    write_error(FALSE),
    inited(FALSE),
    log_type(LOG_UNKNOWN),
    log_state(LOG_CLOSED)
{
  /*
    LOCK_log is deliberately not initialized in the constructor: with
    safe_mutex, mutex initialization relies on MY_INIT(), which is only
    called from main(). Initializing it here would run before main().
  */
  bzero((char*) &log_file, sizeof(log_file));
}

void MYSQL_LOG::init_pthread_objects()
{
  DBUG_ASSERT(inited == 0);
  inited= 1;
Marc Alff's avatar
Marc Alff committed
2674
  mysql_mutex_init(key_LOG_LOCK_log, &LOCK_log, MY_MUTEX_INIT_SLOW);
2675 2676 2677 2678 2679 2680 2681
}

/*
  Close the log file

  SYNOPSIS
    close()
2682 2683 2684 2685
    exiting     Bitmask. LOG_CLOSE_TO_BE_OPENED is used if we intend to call
                open at once after close. LOG_CLOSE_DELAYED_CLOSE is used for
                binlog rotation, to delay actual close of the old file until
                we have successfully created the new file.
2686 2687 2688 2689 2690 2691 2692 2693 2694 2695

  NOTES
    One can do an open on the object at once after doing a close.
    The internal structures are not freed until cleanup() is called
*/

void MYSQL_LOG::close(uint exiting)
{
  /* Note: log_type must not be changed in this function. */
  DBUG_ENTER("MYSQL_LOG::close");
  DBUG_PRINT("enter",("exiting: %d", (int) exiting));

  if (log_state == LOG_OPENED)
  {
    end_io_cache(&log_file);

    /* Only the binary log is synced to disk before the file is closed. */
    if (log_type == LOG_BIN && mysql_file_sync(log_file.file, MYF(MY_WME)))
    {
      /* Report only the first write error seen on this log. */
      if (!write_error)
      {
        write_error= 1;
        sql_print_error(ER_THD_OR_DEFAULT(current_thd, ER_ERROR_ON_WRITE), name, errno);
      }
    }

    /* With LOG_CLOSE_DELAYED_CLOSE the descriptor is left open here. */
    if (!(exiting & LOG_CLOSE_DELAYED_CLOSE))
    {
      if (mysql_file_close(log_file.file, MYF(MY_WME)) && !write_error)
      {
        write_error= 1;
        sql_print_error(ER_THD_OR_DEFAULT(current_thd, ER_ERROR_ON_WRITE), name, errno);
      }
    }
  }

  if (exiting & LOG_CLOSE_TO_BE_OPENED)
    log_state= LOG_TO_BE_OPENED;
  else
    log_state= LOG_CLOSED;

  my_free(name);
  name= NULL;
  DBUG_VOID_RETURN;
}

unknown's avatar
unknown committed
2720
/** This is called only once. */
unknown's avatar
unknown committed
2721

unknown's avatar
unknown committed
2722
void MYSQL_LOG::cleanup()
unknown's avatar
unknown committed
2723
{
2724
  DBUG_ENTER("cleanup");
unknown's avatar
unknown committed
2725
  if (inited)
2726
  {
unknown's avatar
unknown committed
2727
    inited= 0;
Marc Alff's avatar
Marc Alff committed
2728
    mysql_mutex_destroy(&LOCK_log);
2729
    close(0);
2730
  }
2731
  DBUG_VOID_RETURN;
unknown's avatar
unknown committed
2732 2733
}

2734

2735 2736
int MYSQL_LOG::generate_new_name(char *new_name, const char *log_name,
                                 ulong next_log_number)
2737
{
2738 2739
  fn_format(new_name, log_name, mysql_data_home, "", 4);
  if (log_type == LOG_BIN)
unknown's avatar
unknown committed
2740 2741 2742
  {
    if (!fn_ext(log_name)[0])
    {
2743
      if (DBUG_EVALUATE_IF("binlog_inject_new_name_error", TRUE, FALSE) ||
2744
          find_uniq_filename(new_name, next_log_number))
unknown's avatar
unknown committed
2745
      {
2746 2747
        THD *thd= current_thd;
        if (thd)
2748 2749
          my_printf_error(ER_NO_UNIQUE_LOGFILE,
                          ER_THD(thd, ER_NO_UNIQUE_LOGFILE),
2750 2751
                          MYF(ME_FATALERROR), log_name);
        sql_print_error(ER_DEFAULT(ER_NO_UNIQUE_LOGFILE), log_name);
unknown's avatar
unknown committed
2752 2753 2754 2755 2756 2757 2758
	return 1;
      }
    }
  }
  return 0;
}

2759

2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771
/*
  Reopen the log file

  SYNOPSIS
    reopen_file()

  DESCRIPTION
    Reopen the log file. The method is used during FLUSH LOGS
    and locks LOCK_log mutex
*/


2772
void MYSQL_QUERY_LOG::reopen_file()
2773
{
2774 2775 2776 2777 2778 2779 2780 2781 2782
  char *save_name;

  DBUG_ENTER("MYSQL_LOG::reopen_file");
  if (!is_open())
  {
    DBUG_PRINT("info",("log is closed"));
    DBUG_VOID_RETURN;
  }

Marc Alff's avatar
Marc Alff committed
2783
  mysql_mutex_lock(&LOCK_log);
2784 2785 2786 2787 2788 2789

  save_name= name;
  name= 0;				// Don't free name
  close(LOG_CLOSE_TO_BE_OPENED);

  /*
2790
     Note that at this point, log_state != LOG_CLOSED (important for is_open()).
2791 2792
  */

2793 2794 2795 2796
  open(
#ifdef HAVE_PSI_INTERFACE
       m_log_file_key,
#endif
2797
       save_name, log_type, 0, 0, io_cache_type);
2798
  my_free(save_name);
2799

Marc Alff's avatar
Marc Alff committed
2800
  mysql_mutex_unlock(&LOCK_log);
2801

2802
  DBUG_VOID_RETURN;
2803 2804
}

unknown's avatar
unknown committed
2805

2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830
/*
  Write a command to traditional general log file

  SYNOPSIS
    write()

    event_time        command start timestamp
    user_host         the pointer to the string with user@host info
    user_host_len     length of the user_host string. this is computed once
                      and passed to all general log  event handlers
    thread_id         Id of the thread, issued a query
    command_type      the type of the command being logged
    command_type_len  the length of the string above
    sql_text          the very text of the query being executed
    sql_text_len      the length of sql_text string

  DESCRIPTION

   Log the given command to the normal (non-rotatable) log file

  RETURN
    FALSE - OK
    TRUE - error occurred
*/

2831
/*
  Append one entry to the general query log file.

  The entry consists of a timestamp column (written only when event_time
  differs from the previously logged time, otherwise two tabs), the thread
  id, the command name, a tab, the statement text and a newline. The cache
  is flushed after every entry. user_host and user_host_len are not used
  by this implementation.

  Returns FALSE on success (or when the log is not open), TRUE if any write
  or the flush failed. Only the first failure is reported to the error log.
*/
bool MYSQL_QUERY_LOG::write(time_t event_time, const char *user_host,
                            uint user_host_len, int thread_id_arg,
                            const char *command_type, uint command_type_len,
                            const char *sql_text, uint sql_text_len)
{
  char buff[32];
  uint length= 0;
  char local_time_buff[MAX_TIME_SIZE];
  struct tm start;
  uint time_buff_len= 0;

  mysql_mutex_lock(&LOCK_log);

  /* Test if someone closed between the is_open test and lock */
  if (is_open())
  {
    /* for testing output of timestamp and thread id */
    DBUG_EXECUTE_IF("reset_log_last_time", last_time= 0;);

    /* Note that my_b_write() assumes it knows the length for this */
    if (event_time != last_time)
    {
      last_time= event_time;

      localtime_r(&event_time, &start);

      time_buff_len= my_snprintf(local_time_buff, MAX_TIME_SIZE,
                                 "%02d%02d%02d %2d:%02d:%02d\t",
                                 start.tm_year % 100, start.tm_mon + 1,
                                 start.tm_mday, start.tm_hour,
                                 start.tm_min, start.tm_sec);

      if (my_b_write(&log_file, (uchar*) local_time_buff, time_buff_len))
        goto err;
    }
    else
    {
      /*
        Same second as the previous entry: pad the time column instead.
        Any non-zero result is an error, as for every other my_b_write()
        call in this function.
      */
      if (my_b_write(&log_file, (uchar*) "\t\t", 2))
        goto err;
    }

    /* command_type, thread_id */
    length= my_snprintf(buff, 32, "%5ld ", (long) thread_id_arg);

    if (my_b_write(&log_file, (uchar*) buff, length))
      goto err;

    if (my_b_write(&log_file, (uchar*) command_type, command_type_len))
      goto err;

    if (my_b_write(&log_file, (uchar*) "\t", 1))
      goto err;

    /* sql_text */
    if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len))
      goto err;

    if (my_b_write(&log_file, (uchar*) "\n", 1) ||
        flush_io_cache(&log_file))
      goto err;
  }

  mysql_mutex_unlock(&LOCK_log);
  return FALSE;

err:
  /* Report only the first write error seen on this log. */
  if (!write_error)
  {
    write_error= 1;
    sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
  }
  mysql_mutex_unlock(&LOCK_log);
  return TRUE;
}

unknown's avatar
unknown committed
2904

2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915
/*
  Log a query to the traditional slow log file

  SYNOPSIS
    write()

    thd               THD of the query
    current_time      current timestamp
    user_host         the pointer to the string with user@host info
    user_host_len     length of the user_host string. this is computed once
                      and passed to all general log event handlers
2916 2917
    query_utime       Amount of time the query took to execute (in microseconds)
    lock_utime        Amount of time the query was locked (in microseconds)
2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932
    is_command        The flag, which determines, whether the sql_text is a
                      query or an administrator command.
    sql_text          the very text of the query or administrator command
                      processed
    sql_text_len      the length of sql_text string

  DESCRIPTION

   Log a query to the slow log file.

  RETURN
    FALSE - OK
    TRUE - error occurred
*/

2933
bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time,
2934
                            const char *user_host,
2935 2936
                            uint user_host_len, ulonglong query_utime,
                            ulonglong lock_utime, bool is_command,
2937
                            const char *sql_text, uint sql_text_len)
unknown's avatar
unknown committed
2938
{
2939
  bool error= 0;
2940
  DBUG_ENTER("MYSQL_QUERY_LOG::write");
2941

Marc Alff's avatar
Marc Alff committed
2942
  mysql_mutex_lock(&LOCK_log);
unknown's avatar
unknown committed
2943

2944
  if (!is_open())
unknown's avatar
unknown committed
2945
  {
Marc Alff's avatar
Marc Alff committed
2946
    mysql_mutex_unlock(&LOCK_log);
2947
    DBUG_RETURN(0);
unknown's avatar
unknown committed
2948
  }
2949 2950 2951 2952 2953

  if (is_open())
  {						// Safety agains reopen
    int tmp_errno= 0;
    char buff[80], *end;
2954
    char query_time_buff[22+7], lock_time_buff[22+7];
2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972
    uint buff_len;
    end= buff;

    if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT))
    {
      if (current_time != last_time)
      {
        last_time= current_time;
        struct tm start;
        localtime_r(&current_time, &start);

        buff_len= my_snprintf(buff, sizeof buff,
                              "# Time: %02d%02d%02d %2d:%02d:%02d\n",
                              start.tm_year % 100, start.tm_mon + 1,
                              start.tm_mday, start.tm_hour,
                              start.tm_min, start.tm_sec);

        /* Note that my_b_write() assumes it knows the length for this */
2973
        if (my_b_write(&log_file, (uchar*) buff, buff_len))
2974 2975
          tmp_errno= errno;
      }
2976 2977
      const uchar uh[]= "# User@Host: ";
      if (my_b_write(&log_file, uh, sizeof(uh) - 1))
2978
        tmp_errno= errno;
2979
      if (my_b_write(&log_file, (uchar*) user_host, user_host_len))
2980
        tmp_errno= errno;
2981
      if (my_b_write(&log_file, (uchar*) "\n", 1))
2982 2983
        tmp_errno= errno;
    }
2984
    
2985
    /* For slow query log */
2986 2987
    sprintf(query_time_buff, "%.6f", ulonglong2double(query_utime)/1000000.0);
    sprintf(lock_time_buff,  "%.6f", ulonglong2double(lock_utime)/1000000.0);
2988
    if (my_b_printf(&log_file,
2989
                    "# Thread_id: %lu  Schema: %s  QC_hit: %s\n" \
2990 2991
                    "# Query_time: %s  Lock_time: %s  Rows_sent: %lu  Rows_examined: %lu\n" \
                    "# Rows_affected: %lu\n",
2992 2993
                    (ulong) thd->thread_id, (thd->db ? thd->db : ""),
                    ((thd->query_plan_flags & QPLAN_QC) ? "Yes" : "No"),
2994
                    query_time_buff, lock_time_buff,
Sergei Golubchik's avatar
Sergei Golubchik committed
2995
                    (ulong) thd->get_sent_row_count(),
2996 2997 2998 2999
                    (ulong) thd->get_examined_row_count(),
                    thd->get_stmt_da()->is_ok() ?
                    (ulong) thd->get_stmt_da()->affected_rows() :
                    0) == (size_t) -1)
3000
      tmp_errno= errno;
3001 3002 3003 3004 3005 3006 3007
     if ((thd->variables.log_slow_verbosity & LOG_SLOW_VERBOSITY_QUERY_PLAN) &&
         (thd->query_plan_flags &
          (QPLAN_FULL_SCAN | QPLAN_FULL_JOIN | QPLAN_TMP_TABLE |
           QPLAN_TMP_DISK | QPLAN_FILESORT | QPLAN_FILESORT_DISK)) &&
         my_b_printf(&log_file,
                     "# Full_scan: %s  Full_join: %s  "
                     "Tmp_table: %s  Tmp_table_on_disk: %s\n"
3008 3009
                     "# Filesort: %s  Filesort_on_disk: %s  Merge_passes: %lu  "
                     "Priority_queue: %s\n",
3010 3011 3012 3013 3014 3015 3016
                     ((thd->query_plan_flags & QPLAN_FULL_SCAN) ? "Yes" : "No"),
                     ((thd->query_plan_flags & QPLAN_FULL_JOIN) ? "Yes" : "No"),
                     ((thd->query_plan_flags & QPLAN_TMP_TABLE) ? "Yes" : "No"),
                     ((thd->query_plan_flags & QPLAN_TMP_DISK) ? "Yes" : "No"),
                     ((thd->query_plan_flags & QPLAN_FILESORT) ? "Yes" : "No"),
                     ((thd->query_plan_flags & QPLAN_FILESORT_DISK) ?
                      "Yes" : "No"),
3017 3018 3019 3020
                     thd->query_plan_fsort_passes,
                     ((thd->query_plan_flags & QPLAN_FILESORT_PRIORITY_QUEUE) ? 
                       "Yes" : "No")
                     ) == (size_t) -1)
3021
       tmp_errno= errno;
3022
    if (thd->variables.log_slow_verbosity & LOG_SLOW_VERBOSITY_EXPLAIN &&
3023
        thd->lex->explain)
3024 3025 3026
    {
      StringBuffer<128> buf;
      DBUG_ASSERT(!thd->free_list);
3027
      if (!print_explain_for_slow_log(thd->lex, thd, &buf))
3028
        my_b_printf(&log_file, "%s", buf.c_ptr_safe());
3029 3030
      thd->free_items();
    }
3031 3032
    if (thd->db && strcmp(thd->db, db))
    {						// Database changed
3033
      if (my_b_printf(&log_file,"use %s;\n",thd->db) == (size_t) -1)
3034 3035 3036
        tmp_errno= errno;
      strmov(db,thd->db);
    }
3037
    if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
3038 3039
    {
      end=strmov(end, ",last_insert_id=");
3040 3041 3042
      end=longlong10_to_str((longlong)
                            thd->first_successful_insert_id_in_prev_stmt_for_binlog,
                            end, -10);
3043 3044
    }
    // Save value if we do an insert.
3045
    if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
3046 3047 3048 3049
    {
      if (!(specialflag & SPECIAL_SHORT_LOG_FORMAT))
      {
        end=strmov(end,",insert_id=");
3050 3051 3052
        end=longlong10_to_str((longlong)
                              thd->auto_inc_intervals_in_cur_stmt_for_binlog.minimum(),
                              end, -10);
3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067
      }
    }

    /*
      This info used to show up randomly, depending on whether the query
      checked the query start time or not. now we always write current
      timestamp to the slow log
    */
    end= strmov(end, ",timestamp=");
    end= int10_to_str((long) current_time, end, 10);

    if (end != buff)
    {
      *end++=';';
      *end='\n';
3068 3069
      if (my_b_write(&log_file, (uchar*) "SET ", 4) ||
          my_b_write(&log_file, (uchar*) buff + 1, (uint) (end-buff)))
3070 3071 3072 3073 3074 3075
        tmp_errno= errno;
    }
    if (is_command)
    {
      end= strxmov(buff, "# administrator command: ", NullS);
      buff_len= (ulong) (end - buff);
3076 3077 3078 3079
      DBUG_EXECUTE_IF("simulate_slow_log_write_error",
                      {DBUG_SET("+d,simulate_file_write_error");});
      if(my_b_write(&log_file, (uchar*) buff, buff_len))
        tmp_errno= errno;
3080
    }
3081 3082
    if (my_b_write(&log_file, (uchar*) sql_text, sql_text_len) ||
        my_b_write(&log_file, (uchar*) ";\n",2) ||
3083 3084 3085 3086 3087 3088 3089 3090
        flush_io_cache(&log_file))
      tmp_errno= errno;
    if (tmp_errno)
    {
      error= 1;
      if (! write_error)
      {
        write_error= 1;
3091
        sql_print_error(ER_THD(thd, ER_ERROR_ON_WRITE), name, error);
3092 3093 3094
      }
    }
  }
Marc Alff's avatar
Marc Alff committed
3095
  mysql_mutex_unlock(&LOCK_log);
3096
  DBUG_RETURN(error);
3097 3098
}

3099

unknown's avatar
unknown committed
3100 3101 3102 3103 3104
/**
  @todo
  The following should be using fn_format();  We just need to
  first change fn_format() to cut the file name if it's too long.
*/
3105
const char *MYSQL_LOG::generate_name(const char *log_name,
3106 3107
                                     const char *suffix,
                                     bool strip_ext, char *buff)
3108 3109 3110
{
  if (!log_name || !log_name[0])
  {
3111 3112 3113
    strmake(buff, pidfile_name, FN_REFLEN - strlen(suffix) - 1);
    return (const char *)
      fn_format(buff, buff, "", suffix, MYF(MY_REPLACE_EXT|MY_REPLACE_DIR));
3114 3115 3116 3117
  }
  // get rid of extension if the log is binary to avoid problems
  if (strip_ext)
  {
3118 3119
    char *p= fn_ext(log_name);
    uint length= (uint) (p - log_name);
3120
    strmake(buff, log_name, MY_MIN(length, FN_REFLEN-1));
3121 3122 3123 3124 3125
    return (const char*)buff;
  }
  return log_name;
}

unknown's avatar
unknown committed
3126

3127

3128
MYSQL_BIN_LOG::MYSQL_BIN_LOG(uint *sync_period)
  :reset_master_pending(0),
   mark_xid_done_waiting(0),
   bytes_written(0),
   file_id(1),
   open_count(1),
   group_commit_queue(0),
   group_commit_queue_busy(FALSE),
   num_commits(0),
   num_group_commits(0),
   group_commit_trigger_count(0),
   group_commit_trigger_timeout(0),
   group_commit_trigger_lock_wait(0),
   sync_period_ptr(sync_period),
   sync_counter(0),
   state_file_deleted(false),
   binlog_state_recover_done(false),
   is_relay_log(0),
   signal_cnt(0),
   checksum_alg_reset(BINLOG_CHECKSUM_ALG_UNDEF),
   relay_log_checksum_alg(BINLOG_CHECKSUM_ALG_UNDEF),
   description_event_for_exec(0),
   description_event_for_queue(0),
   current_binlog_id(0)
{
  /*
    Locks are deliberately not initialized in the constructor: with
    safe_mutex, their initialization relies on MY_INIT(), which is only
    called from main(). Initializing them here would run before main().
  */
  bzero((char*) &purge_index_file, sizeof(purge_index_file));
  bzero((char*) &index_file, sizeof(index_file));
  index_file_name[0] = 0;
}

/* this is called only once */

void MYSQL_BIN_LOG::cleanup()
{
  DBUG_ENTER("cleanup");
  if (inited)
  {
3161 3162
    xid_count_per_binlog *b;

3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175
    /* Wait for the binlog background thread to stop. */
    if (!is_relay_log && binlog_background_thread_started)
    {
      mysql_mutex_lock(&LOCK_binlog_background_thread);
      binlog_background_thread_stop= true;
      mysql_cond_signal(&COND_binlog_background_thread);
      while (binlog_background_thread_stop)
        mysql_cond_wait(&COND_binlog_background_thread_end,
                        &LOCK_binlog_background_thread);
      mysql_mutex_unlock(&LOCK_binlog_background_thread);
      binlog_background_thread_started= false;
    }

3176 3177 3178 3179
    inited= 0;
    close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT);
    delete description_event_for_queue;
    delete description_event_for_exec;
3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191

    while ((b= binlog_xid_count_list.get()))
    {
      /*
        There should be no pending XIDs at shutdown, and only one entry (for
        the active binlog file) in the list.
      */
      DBUG_ASSERT(b->xid_count == 0);
      DBUG_ASSERT(!binlog_xid_count_list.head());
      my_free(b);
    }

Marc Alff's avatar
Marc Alff committed
3192 3193
    mysql_mutex_destroy(&LOCK_log);
    mysql_mutex_destroy(&LOCK_index);
3194
    mysql_mutex_destroy(&LOCK_xid_list);
3195
    mysql_mutex_destroy(&LOCK_binlog_background_thread);
3196
    mysql_mutex_destroy(&LOCK_binlog_end_pos);
Marc Alff's avatar
Marc Alff committed
3197
    mysql_cond_destroy(&update_cond);
3198 3199
    mysql_cond_destroy(&COND_queue_busy);
    mysql_cond_destroy(&COND_xid_list);
3200 3201
    mysql_cond_destroy(&COND_binlog_background_thread);
    mysql_cond_destroy(&COND_binlog_background_thread_end);
3202
  }
3203 3204 3205 3206 3207 3208

  /*
    Free data for global binlog state.
    We can't do that automaticly as we need to do this before
    safemalloc is shut down
  */
3209 3210
  if (!is_relay_log)
    rpl_global_gtid_binlog_state.free();
3211 3212 3213 3214 3215
  DBUG_VOID_RETURN;
}


/* Init binlog-specific vars */
3216
void MYSQL_BIN_LOG::init(ulong max_size_arg)
{
  DBUG_ENTER("MYSQL_BIN_LOG::init");

  /* Only the size limit is stored here. */
  max_size= max_size_arg;

  DBUG_PRINT("info",("max_size: %lu", max_size));
  DBUG_VOID_RETURN;
}

void MYSQL_BIN_LOG::init_pthread_objects()
{
3227 3228
  MYSQL_LOG::init_pthread_objects();
  mysql_mutex_init(m_key_LOCK_index, &LOCK_index, MY_MUTEX_INIT_SLOW);
Sergei Golubchik's avatar
Sergei Golubchik committed
3229
  mysql_mutex_setflags(&LOCK_index, MYF_NO_DEADLOCK_DETECTION);
3230 3231
  mysql_mutex_init(key_BINLOG_LOCK_xid_list,
                   &LOCK_xid_list, MY_MUTEX_INIT_FAST);
3232
  mysql_cond_init(m_key_update_cond, &update_cond, 0);
Sergei Golubchik's avatar
Sergei Golubchik committed
3233
  mysql_cond_init(m_key_COND_queue_busy, &COND_queue_busy, 0);
3234
  mysql_cond_init(key_BINLOG_COND_xid_list, &COND_xid_list, 0);
3235 3236 3237 3238 3239 3240 3241

  mysql_mutex_init(key_BINLOG_LOCK_binlog_background_thread,
                   &LOCK_binlog_background_thread, MY_MUTEX_INIT_FAST);
  mysql_cond_init(key_BINLOG_COND_binlog_background_thread,
                  &COND_binlog_background_thread, 0);
  mysql_cond_init(key_BINLOG_COND_binlog_background_thread_end,
                  &COND_binlog_background_thread_end, 0);
3242 3243 3244

  mysql_mutex_init(m_key_LOCK_binlog_end_pos, &LOCK_binlog_end_pos,
                   MY_MUTEX_INIT_SLOW);
3245 3246 3247 3248
}


bool MYSQL_BIN_LOG::open_index_file(const char *index_file_name_arg,
3249
                                    const char *log_name, bool need_mutex)
3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266
{
  File index_file_nr= -1;
  DBUG_ASSERT(!my_b_inited(&index_file));

  /*
    First open of this class instance
    Create an index file that will hold all file names uses for logging.
    Add new entries to the end of it.
  */
  myf opt= MY_UNPACK_FILENAME;
  if (!index_file_name_arg)
  {
    index_file_name_arg= log_name;    // Use same basename for index file
    opt= MY_UNPACK_FILENAME | MY_REPLACE_EXT;
  }
  fn_format(index_file_name, index_file_name_arg, mysql_data_home,
            ".index", opt);
3267
  if ((index_file_nr= mysql_file_open(m_key_file_log_index,
Marc Alff's avatar
Marc Alff committed
3268 3269 3270 3271
                                      index_file_name,
                                      O_RDWR | O_CREAT | O_BINARY,
                                      MYF(MY_WME))) < 0 ||
       mysql_file_sync(index_file_nr, MYF(MY_WME)) ||
3272 3273
       init_io_cache(&index_file, index_file_nr,
                     IO_SIZE, WRITE_CACHE,
Marc Alff's avatar
Marc Alff committed
3274 3275
                     mysql_file_seek(index_file_nr, 0L, MY_SEEK_END, MYF(0)),
                                     0, MYF(MY_WME | MY_WAIT_IF_FULL)) ||
3276
      DBUG_EVALUATE_IF("fault_injection_openning_index", 1, 0))
3277
  {
3278 3279 3280
    /*
      TODO: all operations creating/deleting the index file or a log, should
      call my_sync_dir() or my_sync_dir_by_file() to be durable.
Marc Alff's avatar
Marc Alff committed
3281 3282
      TODO: file creation should be done with mysql_file_create()
      not mysql_file_open().
3283
    */
3284
    if (index_file_nr >= 0)
Marc Alff's avatar
Marc Alff committed
3285
      mysql_file_close(index_file_nr, MYF(0));
3286 3287
    return TRUE;
  }
3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309

#ifdef HAVE_REPLICATION
  /*
    Sync the index by purging any binary log file that is not registered.
    In other words, either purge binary log files that were removed from
    the index but not purged from the file system due to a crash or purge
    any binary log file that was created but not register in the index
    due to a crash.
  */

  if (set_purge_index_file_name(index_file_name_arg) ||
      open_purge_index_file(FALSE) ||
      purge_index_entry(NULL, NULL, need_mutex) ||
      close_purge_index_file() ||
      DBUG_EVALUATE_IF("fault_injection_recovering_index", 1, 0))
  {
    sql_print_error("MYSQL_BIN_LOG::open_index_file failed to sync the index "
                    "file.");
    return TRUE;
  }
#endif

3310 3311 3312
  return FALSE;
}

unknown's avatar
unknown committed
3313

unknown's avatar
unknown committed
3314
/**
3315
  Open a (new) binlog file.
3316

3317
  - Open the log file and the index file. Register the new
unknown's avatar
unknown committed
3318
  file name in it
3319
  - When calling this while the file is in use, you must hold locks
unknown's avatar
unknown committed
3320
  on LOCK_log and LOCK_index.
3321

unknown's avatar
unknown committed
3322
  @retval
3323
    0	ok
unknown's avatar
unknown committed
3324
  @retval
3325 3326 3327
    1	error
*/

3328 3329 3330
bool MYSQL_BIN_LOG::open(const char *log_name,
                         enum_log_type log_type_arg,
                         const char *new_name,
3331
                         ulong next_log_number,
3332 3333
                         enum cache_type io_cache_type_arg,
                         ulong max_size_arg,
3334 3335
                         bool null_created_arg,
                         bool need_mutex)
unknown's avatar
unknown committed
3336
{
3337
  File file= -1;
3338
  xid_count_per_binlog *new_xid_list_entry= NULL, *b;
3339
  DBUG_ENTER("MYSQL_BIN_LOG::open");
3340
  DBUG_PRINT("enter",("log_type: %d",(int) log_type_arg));
3341

3342 3343 3344 3345 3346 3347 3348 3349
  if (!is_relay_log)
  {
    if (!binlog_state_recover_done)
    {
      binlog_state_recover_done= true;
      if (do_binlog_recovery(opt_bin_logname, false))
        DBUG_RETURN(1);
    }
unknown's avatar
unknown committed
3350

3351 3352 3353 3354
    if (!binlog_background_thread_started &&
        start_binlog_background_thread())
      DBUG_RETURN(1);
  }
3355

3356 3357 3358
  /* We need to calculate new log file name for purge to delete old */
  if (init_and_set_log_file_name(log_name, new_name, next_log_number,
                                 log_type_arg, io_cache_type_arg))
3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369
  {
    sql_print_error("MSYQL_BIN_LOG::open failed to generate new file name.");
    DBUG_RETURN(1);
  }

#ifdef HAVE_REPLICATION
  if (open_purge_index_file(TRUE) ||
      register_create_index_entry(log_file_name) ||
      sync_purge_index_file() ||
      DBUG_EVALUATE_IF("fault_injection_registering_index", 1, 0))
  {
Luis Soares's avatar
Luis Soares committed
3370
    /**
3371 3372 3373 3374 3375 3376 3377 3378 3379
        TODO:
        Although this was introduced to appease valgrind when
        injecting emulated faults using
        fault_injection_registering_index it may be good to consider
        what actually happens when open_purge_index_file succeeds but
        register or sync fails.

        Perhaps we might need the code below in MYSQL_LOG_BIN::cleanup
        for "real life" purposes as well? 
Luis Soares's avatar
Luis Soares committed
3380 3381 3382 3383 3384 3385 3386 3387 3388
     */
    DBUG_EXECUTE_IF("fault_injection_registering_index", {
      if (my_b_inited(&purge_index_file))
      {
        end_io_cache(&purge_index_file);
        my_close(purge_index_file.file, MYF(0));
      }
    });

3389 3390 3391
    sql_print_error("MSYQL_BIN_LOG::open failed to sync the index file.");
    DBUG_RETURN(1);
  }
3392
  DBUG_EXECUTE_IF("crash_create_non_critical_before_update_index", DBUG_SUICIDE(););
3393 3394 3395
#endif

  write_error= 0;
unknown's avatar
unknown committed
3396

3397
  /* open the main log file */
3398 3399
  if (MYSQL_LOG::open(
#ifdef HAVE_PSI_INTERFACE
3400
                      m_key_file_log,
3401
#endif
3402 3403 3404
                      log_name,
                      LOG_UNKNOWN, /* Don't generate new name */
                      0, 0, io_cache_type_arg))
3405 3406 3407 3408
  {
#ifdef HAVE_REPLICATION
    close_purge_index_file();
#endif
3409
    DBUG_RETURN(1);                            /* all warnings issued */
3410
  }
3411

3412
  init(max_size_arg);
3413

unknown's avatar
unknown committed
3414
  open_count++;
unknown's avatar
unknown committed
3415

3416
  DBUG_ASSERT(log_type == LOG_BIN);
unknown's avatar
unknown committed
3417 3418

  {
3419 3420 3421 3422 3423 3424 3425 3426 3427 3428
    bool write_file_name_to_index_file=0;

    if (!my_b_filelength(&log_file))
    {
      /*
	The binary log file was empty (probably newly created)
	This is the normal case and happens when the user doesn't specify
	an extension for the binary log files.
	In this case we write a standard header to it.
      */
Michael Widenius's avatar
Michael Widenius committed
3429
      if (my_b_safe_write(&log_file, BINLOG_MAGIC,
unknown's avatar
unknown committed
3430
			  BIN_LOG_HEADER_SIZE))
unknown's avatar
unknown committed
3431
        goto err;
3432 3433
      bytes_written+= BIN_LOG_HEADER_SIZE;
      write_file_name_to_index_file= 1;
unknown's avatar
unknown committed
3434
    }
unknown's avatar
unknown committed
3435

3436
    {
3437
      /*
3438 3439
        In 4.x we put Start event only in the first binlog. But from 5.0 we
        want a Start event even if this is not the very first binlog.
3440 3441
      */
      Format_description_log_event s(BINLOG_VERSION);
unknown's avatar
unknown committed
3442 3443 3444 3445 3446
      /*
        don't set LOG_EVENT_BINLOG_IN_USE_F for SEQ_READ_APPEND io_cache
        as we won't be able to reset it later
      */
      if (io_cache_type == WRITE_CACHE)
3447 3448 3449 3450 3451 3452 3453 3454 3455
        s.flags |= LOG_EVENT_BINLOG_IN_USE_F;
      s.checksum_alg= is_relay_log ?
        /* relay-log */
        /* inherit master's A descriptor if one has been received */
        (relay_log_checksum_alg= 
         (relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF) ?
         relay_log_checksum_alg :
         /* otherwise use slave's local preference of RL events verification */
         (opt_slave_sql_verify_checksum == 0) ?
Michael Widenius's avatar
Michael Widenius committed
3456
         (uint8) BINLOG_CHECKSUM_ALG_OFF : (uint8) binlog_checksum_options):
3457
        /* binlog */
Michael Widenius's avatar
Michael Widenius committed
3458
        (uint8) binlog_checksum_options;
3459
      DBUG_ASSERT(s.checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
3460 3461
      if (!s.is_valid())
        goto err;
3462
      s.dont_set_created= null_created_arg;
3463 3464 3465
      if (s.write(&log_file))
        goto err;
      bytes_written+= s.data_written;
3466 3467 3468 3469

      if (!is_relay_log)
      {
        char buf[FN_REFLEN];
3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504

        /*
          Output a Gtid_list_log_event at the start of the binlog file.

          This is used to quickly determine which GTIDs are found in binlog
          files earlier than this one, and which are found in this (or later)
          binlogs.

          The list gives a mapping from (domain_id, server_id) -> seq_no (so
          this means that there is at most one entry for every unique pair
          (domain_id, server_id) in the list). It indicates that this seq_no is
          the last one found in an earlier binlog file for this (domain_id,
          server_id) combination - so any higher seq_no should be search for
          from this binlog file, or a later one.

          This allows to locate the binlog file containing a given GTID by
          scanning backwards, reading just the Gtid_list_log_event at the
          start of each file, and scanning only the relevant binlog file when
          found, not all binlog files.

          The existence of a given entry (domain_id, server_id, seq_no)
          guarantees only that this seq_no will not be found in this or any
          later binlog file. It does not guarantee that it can be found it an
          earlier binlog file, for example the file may have been purged.

          If there is no entry for a given (domain_id, server_id) pair, then
          it means that no such GTID exists in any earlier binlog. It is
          permissible to remove such pair from future Gtid_list_log_events
          if all previous binlog files containing such GTIDs have been purged
          (though such optimization is not performed at the time of this
          writing). So if there is no entry for given GTID it means that such
          GTID should be search for in this or later binlog file, same as if
          there had been an entry (domain_id, server_id, 0).
        */

unknown's avatar
unknown committed
3505
        Gtid_list_log_event gl_ev(&rpl_global_gtid_binlog_state, 0);
unknown's avatar
unknown committed
3506 3507
        if (gl_ev.write(&log_file))
          goto err;
3508 3509 3510

        /* Output a binlog checkpoint event at the start of the binlog file. */

3511
        /*
3512 3513 3514 3515 3516 3517 3518
          Construct an entry in the binlog_xid_count_list for the new binlog
          file (we will not link it into the list until we know the new file
          is successfully created; otherwise we would have to remove it again
          if creation failed, which gets tricky since other threads may have
          seen the entry in the meantime - and we do not want to hold
          LOCK_xid_list for long periods of time).

3519 3520 3521 3522 3523 3524
          Write the current binlog checkpoint into the log, so XA recovery will
          know from where to start recovery.
        */
        uint off= dirname_length(log_file_name);
        uint len= strlen(log_file_name) - off;
        char *entry_mem, *name_mem;
3525
        if (!(new_xid_list_entry = (xid_count_per_binlog *)
3526 3527 3528 3529 3530 3531
              my_multi_malloc(MYF(MY_WME),
                              &entry_mem, sizeof(xid_count_per_binlog),
                              &name_mem, len,
                              NULL)))
          goto err;
        memcpy(name_mem, log_file_name+off, len);
3532 3533 3534
        new_xid_list_entry->binlog_name= name_mem;
        new_xid_list_entry->binlog_name_len= len;
        new_xid_list_entry->xid_count= 0;
3535 3536

        /*
3537
          Find the name for the Initial binlog checkpoint.
3538

3539 3540 3541 3542 3543 3544 3545 3546 3547
          Normally this will just be the first entry, as we delete entries
          when their count drops to zero. But we scan the list to handle any
          corner case, eg. for the first binlog file opened after startup, the
          list will be empty.
        */
        mysql_mutex_lock(&LOCK_xid_list);
        I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
        while ((b= it++) && b->xid_count == 0)
          ;
3548
        mysql_mutex_unlock(&LOCK_xid_list);
3549 3550 3551
        if (!b)
          b= new_xid_list_entry;
        strmake(buf, b->binlog_name, b->binlog_name_len);
3552
        Binlog_checkpoint_log_event ev(buf, len);
3553 3554 3555 3556
        DBUG_EXECUTE_IF("crash_before_write_checkpoint_event",
                        flush_io_cache(&log_file);
                        mysql_file_sync(log_file.file, MYF(MY_WME));
                        DBUG_SUICIDE(););
3557 3558 3559 3560
        if (ev.write(&log_file))
          goto err;
        bytes_written+= ev.data_written;
      }
3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572
    }
    if (description_event_for_queue &&
        description_event_for_queue->binlog_version>=4)
    {
      /*
        This is a relay log written to by the I/O slave thread.
        Write the event so that others can later know the format of this relay
        log.
        Note that this event is very close to the original event from the
        master (it has binlog version of the master, event types of the
        master), so this is suitable to parse the next relay log's event. It
        has been produced by
unknown's avatar
unknown committed
3573
        Format_description_log_event::Format_description_log_event(char* buf,).
3574 3575 3576 3577 3578 3579 3580 3581 3582
        Why don't we want to write the description_event_for_queue if this
        event is for format<4 (3.23 or 4.x): this is because in that case, the
        description_event_for_queue describes the data received from the
        master, but not the data written to the relay log (*conversion*),
        which is in format 4 (slave's).
      */
      /*
        Set 'created' to 0, so that in next relay logs this event does not
        trigger cleaning actions on the slave in
3583
        Format_description_log_event::apply_event_impl().
3584 3585 3586
      */
      description_event_for_queue->created= 0;
      /* Don't set log_pos in event header */
3587
      description_event_for_queue->set_artificial_event();
3588

3589 3590 3591
      if (description_event_for_queue->write(&log_file))
        goto err;
      bytes_written+= description_event_for_queue->data_written;
3592
    }
unknown's avatar
unknown committed
3593
    if (flush_io_cache(&log_file) ||
Sergei Golubchik's avatar
Sergei Golubchik committed
3594
        mysql_file_sync(log_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
unknown's avatar
unknown committed
3595
      goto err;
3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608

    my_off_t offset= my_b_tell(&log_file);

    if (!is_relay_log)
    {
      /* update binlog_end_pos so that it can be read by after sync hook */
      reset_binlog_end_pos(log_file_name, offset);

      mysql_mutex_lock(&LOCK_commit_ordered);
      strmake_buf(last_commit_pos_file, log_file_name);
      last_commit_pos_offset= offset;
      mysql_mutex_unlock(&LOCK_commit_ordered);
    }
3609 3610 3611

    if (write_file_name_to_index_file)
    {
3612
#ifdef HAVE_REPLICATION
3613 3614 3615 3616
#ifdef ENABLED_DEBUG_SYNC
      if (current_thd)
        DEBUG_SYNC(current_thd, "binlog_open_before_update_index");
#endif
3617
      DBUG_EXECUTE_IF("crash_create_critical_before_update_index", DBUG_SUICIDE(););
3618 3619 3620 3621 3622
#endif

      DBUG_ASSERT(my_b_inited(&index_file) != 0);
      reinit_io_cache(&index_file, WRITE_CACHE,
                      my_b_filelength(&index_file), 0, 0);
unknown's avatar
unknown committed
3623 3624 3625 3626
      /*
        As this is a new log file, we write the file name to the index
        file. As every time we write to the index file, we sync it.
      */
3627 3628 3629 3630 3631
      if (DBUG_EVALUATE_IF("fault_injection_updating_index", 1, 0) ||
          my_b_write(&index_file, (uchar*) log_file_name,
                     strlen(log_file_name)) ||
          my_b_write(&index_file, (uchar*) "\n", 1) ||
          flush_io_cache(&index_file) ||
Sergei Golubchik's avatar
Sergei Golubchik committed
3632
          mysql_file_sync(index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE)))
3633 3634 3635
        goto err;

#ifdef HAVE_REPLICATION
3636
      DBUG_EXECUTE_IF("crash_create_after_update_index", DBUG_SUICIDE(););
3637
#endif
unknown's avatar
unknown committed
3638
    }
3639
  }
3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654

  if (!is_relay_log)
  {
    /*
      Now the file was created successfully, so we can link in the entry for
      the new binlog file in binlog_xid_count_list.
    */
    mysql_mutex_lock(&LOCK_xid_list);
    ++current_binlog_id;
    new_xid_list_entry->binlog_id= current_binlog_id;
    /* Remove any initial entries with no pending XIDs.  */
    while ((b= binlog_xid_count_list.head()) && b->xid_count == 0)
      my_free(binlog_xid_count_list.get());
    binlog_xid_count_list.push_back(new_xid_list_entry);
    mysql_mutex_unlock(&LOCK_xid_list);
3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673

    /*
      Now that we have synced a new binlog file with an initial Gtid_list
      event, it is safe to delete the binlog state file. We will write out
      a new, updated file at shutdown, and if we crash before we can recover
      the state from the newly written binlog file.

      Since the state file will contain out-of-date data as soon as the first
      new GTID is binlogged, it is better to remove it, to avoid any risk of
      accidentally reading incorrect data later.
    */
    if (!state_file_deleted)
    {
      char buf[FN_REFLEN];
      fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
                MY_UNPACK_FILENAME);
      my_delete(buf, MY_SYNC_DIR);
      state_file_deleted= true;
    }
3674 3675
  }

3676 3677
  log_state= LOG_OPENED;

3678 3679 3680 3681
#ifdef HAVE_REPLICATION
  close_purge_index_file();
#endif

3682
  DBUG_RETURN(0);
3683 3684

err:
3685 3686 3687 3688 3689
#ifdef HAVE_REPLICATION
  if (is_inited_purge_index_file())
    purge_index_entry(NULL, NULL, need_mutex);
  close_purge_index_file();
#endif
3690 3691 3692
  sql_print_error("Could not use %s for logging (error %d). \
Turning logging off for the whole duration of the MySQL server process. \
To turn it on again: fix the cause, \
3693
shutdown the MySQL server and restart it.", name, errno);
3694 3695
  if (new_xid_list_entry)
    my_free(new_xid_list_entry);
3696
  if (file >= 0)
Marc Alff's avatar
Marc Alff committed
3697
    mysql_file_close(file, MYF(0));
3698
  close(LOG_CLOSE_INDEX);
3699
  DBUG_RETURN(1);
unknown's avatar
unknown committed
3700 3701
}

3702

3703
/**
  Fill 'linfo' with the current log file name and position.

  Takes LOCK_log around the call to raw_get_current_log().

  @param linfo   Structure handed on to raw_get_current_log()

  @return the value returned by raw_get_current_log()
*/
int MYSQL_BIN_LOG::get_current_log(LOG_INFO* linfo)
{
  int result;

  mysql_mutex_lock(&LOCK_log);
  result= raw_get_current_log(linfo);
  mysql_mutex_unlock(&LOCK_log);

  return result;
}

unknown's avatar
unknown committed
3711
/**
  Fill 'linfo' with the current log file name and position.

  The caller must already own LOCK_log (asserted below).

  @param linfo   Receives log_file_name and the current position of
                 the log_file IO_CACHE in 'pos'

  @return always 0
*/
int MYSQL_BIN_LOG::raw_get_current_log(LOG_INFO* linfo)
{
  mysql_mutex_assert_owner(&LOCK_log);

  /* The two stores are independent of each other */
  linfo->pos= my_b_tell(&log_file);
  strmake_buf(linfo->log_file_name, log_file_name);

  return 0;
}

unknown's avatar
unknown committed
3719 3720
/**
  Move the data in a filename index file up towards the start of the file.

    Everything from 'offset' to the end of the file is copied to the
    beginning of the file, in chunks of at most IO_SIZE*2 bytes.

    We do the copy outside of the IO_CACHE as the cache buffers would just
    make things slower and more complicated.
    In most cases the copy loop should only do one read.

  @param index_file			File to move
  @param offset			Move everything from here to beginning

  @note
    File will be truncated to be 'offset' shorter or filled up with newlines

  @retval
    0	ok
  @retval
    1	error while seeking, reading, writing, resizing or syncing the file
*/

#ifdef HAVE_REPLICATION

static bool copy_up_file_and_fill(IO_CACHE *index_file, my_off_t offset)
{
  int bytes_read;
  my_off_t init_offset= offset;
  File file= index_file->file;
  uchar io_buf[IO_SIZE*2];
  DBUG_ENTER("copy_up_file_and_fill");

  for (;; offset+= bytes_read)
  {
    /*
      A failed seek must abort the copy: otherwise the read or write that
      follows would be done at the wrong file position.
    */
    if (mysql_file_seek(file, offset, MY_SEEK_SET, MYF(MY_WME)) ==
        MY_FILEPOS_ERROR)
      goto err;
    if ((bytes_read= (int) mysql_file_read(file, io_buf, sizeof(io_buf),
                                           MYF(MY_WME)))
        < 0)
      goto err;
    if (!bytes_read)
      break;					// end of file
    /* Write the chunk 'init_offset' bytes earlier than where it was read */
    if (mysql_file_seek(file, offset-init_offset, MY_SEEK_SET, MYF(MY_WME)) ==
        MY_FILEPOS_ERROR)
      goto err;
    if (mysql_file_write(file, io_buf, bytes_read,
                         MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
      goto err;
  }
  /* The following will either truncate the file or fill the end with '\n' */
  if (mysql_file_chsize(file, offset - init_offset, '\n', MYF(MY_WME)) ||
      mysql_file_sync(file, MYF(MY_WME|MY_SYNC_FILESIZE)))
    goto err;

  /* Reset data in old index cache */
  reinit_io_cache(index_file, READ_CACHE, (my_off_t) 0, 0, 1);
  DBUG_RETURN(0);

err:
  DBUG_RETURN(1);
}

3773
#endif /* HAVE_REPLICATION */
3774

unknown's avatar
unknown committed
3775 3776
/**
  Find the position in the log-index-file for the given log name.
3777

unknown's avatar
unknown committed
3778 3779 3780 3781 3782 3783
  @param linfo		Store here the found log file name and position to
                       the NEXT log file name in the index file.
  @param log_name	Filename to find in the index file.
                       Is a null pointer if we want to read the first entry
  @param need_lock	Set this to 1 if the parent doesn't already have a
                       lock on LOCK_index
3784

unknown's avatar
unknown committed
3785
  @note
unknown's avatar
unknown committed
3786 3787
    On systems without the truncate function the file will end with one or
    more empty lines.  These will be ignored when reading the file.
3788

unknown's avatar
unknown committed
3789
  @retval
3790
    0			ok
unknown's avatar
unknown committed
3791 3792 3793
  @retval
    LOG_INFO_EOF	        End of log-index-file found
  @retval
3794 3795 3796
    LOG_INFO_IO		Got IO error while reading file
*/

3797
int MYSQL_BIN_LOG::find_log_pos(LOG_INFO *linfo, const char *log_name,
3798
			    bool need_lock)
unknown's avatar
unknown committed
3799
{
3800
  int error= 0;
3801 3802 3803
  char *full_fname= linfo->log_file_name;
  char full_log_name[FN_REFLEN], fname[FN_REFLEN];
  uint log_name_len= 0, fname_len= 0;
3804
  DBUG_ENTER("find_log_pos");
3805
  full_log_name[0]= full_fname[0]= 0;
unknown's avatar
unknown committed
3806

unknown's avatar
unknown committed
3807
  /*
3808 3809
    Mutex needed because we need to make sure the file pointer does not
    move from under our feet
unknown's avatar
unknown committed
3810
  */
3811
  if (need_lock)
Marc Alff's avatar
Marc Alff committed
3812 3813
    mysql_mutex_lock(&LOCK_index);
  mysql_mutex_assert_owner(&LOCK_index);
3814

3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828
  // extend relative paths for log_name to be searched
  if (log_name)
  {
    if(normalize_binlog_name(full_log_name, log_name, is_relay_log))
    {
      error= LOG_INFO_EOF;
      goto end;
    }
  }

  log_name_len= log_name ? (uint) strlen(full_log_name) : 0;
  DBUG_PRINT("enter", ("log_name: %s, full_log_name: %s", 
                       log_name ? log_name : "NULL", full_log_name));

3829 3830 3831
  /* As the file is flushed, we can't get an error here */
  (void) reinit_io_cache(&index_file, READ_CACHE, (my_off_t) 0, 0, 0);

unknown's avatar
unknown committed
3832
  for (;;)
3833
  {
3834
    uint length;
3835 3836
    my_off_t offset= my_b_tell(&index_file);

3837 3838 3839
    DBUG_EXECUTE_IF("simulate_find_log_pos_error",
                    error=  LOG_INFO_EOF; break;);
    /* If we get 0 or 1 characters, this is the end of the file */
3840
    if ((length= my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
unknown's avatar
unknown committed
3841
    {
3842 3843 3844
      /* Did not find the given entry; Return not found or error */
      error= !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
      break;
unknown's avatar
unknown committed
3845
    }
3846 3847 3848 3849
    if (fname[length-1] != '\n')
      continue;                                 // Not a log entry
    fname[length-1]= 0;                         // Remove end \n
    
3850 3851 3852 3853 3854 3855 3856 3857
    // extend relative paths and match against full path
    if (normalize_binlog_name(full_fname, fname, is_relay_log))
    {
      error= LOG_INFO_EOF;
      break;
    }
    fname_len= (uint) strlen(full_fname);

3858 3859
    // if the log entry matches, null string matching anything
    if (!log_name ||
3860
        (log_name_len == fname_len &&
3861
	 !memcmp(full_fname, full_log_name, log_name_len)))
unknown's avatar
unknown committed
3862
    {
3863
      DBUG_PRINT("info", ("Found log file entry"));
3864 3865
      linfo->index_file_start_offset= offset;
      linfo->index_file_offset = my_b_tell(&index_file);
3866
      break;
unknown's avatar
unknown committed
3867
    }
3868
  }
3869

3870
end:
3871
  if (need_lock)
Marc Alff's avatar
Marc Alff committed
3872
    mysql_mutex_unlock(&LOCK_index);
3873
  DBUG_RETURN(error);
unknown's avatar
unknown committed
3874
}
unknown's avatar
unknown committed
3875

3876

unknown's avatar
unknown committed
3877 3878
/**
  Find the position in the log-index-file for the given log name.
3879

unknown's avatar
unknown committed
3880
  @param
3881 3882
    linfo		Store here the next log file name and position to
			the file name after that.
unknown's avatar
unknown committed
3883
  @param
3884 3885 3886
    need_lock		Set this to 1 if the parent doesn't already have a
			lock on LOCK_index

unknown's avatar
unknown committed
3887
  @note
3888
    - Before calling this function, one has to call find_log_pos()
unknown's avatar
unknown committed
3889
    to set up 'linfo'
3890
    - Mutex needed because we need to make sure the file pointer does not move
unknown's avatar
unknown committed
3891
    from under our feet
3892

unknown's avatar
unknown committed
3893
  @retval
3894
    0			ok
unknown's avatar
unknown committed
3895 3896 3897
  @retval
    LOG_INFO_EOF	        End of log-index-file found
  @retval
3898 3899 3900
    LOG_INFO_IO		Got IO error while reading file
*/

3901
int MYSQL_BIN_LOG::find_next_log(LOG_INFO* linfo, bool need_lock)
unknown's avatar
unknown committed
3902
{
3903
  int error= 0;
3904
  uint length;
3905 3906
  char fname[FN_REFLEN];
  char *full_fname= linfo->log_file_name;
3907

3908
  if (need_lock)
Marc Alff's avatar
Marc Alff committed
3909 3910
    mysql_mutex_lock(&LOCK_index);
  mysql_mutex_assert_owner(&LOCK_index);
3911 3912 3913 3914 3915 3916 3917

  /* As the file is flushed, we can't get an error here */
  (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0,
			 0);

  linfo->index_file_start_offset= linfo->index_file_offset;
  if ((length=my_b_gets(&index_file, fname, FN_REFLEN)) <= 1)
3918
  {
3919
    error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO;
3920 3921
    goto err;
  }
3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934

  if (fname[0] != 0)
  {
    if(normalize_binlog_name(full_fname, fname, is_relay_log))
    {
      error= LOG_INFO_EOF;
      goto err;
    }
    length= strlen(full_fname);
  }

  full_fname[length-1]= 0;			// kill \n
  linfo->index_file_offset= my_b_tell(&index_file);
3935

unknown's avatar
unknown committed
3936
err:
3937
  if (need_lock)
Marc Alff's avatar
Marc Alff committed
3938
    mysql_mutex_unlock(&LOCK_index);
unknown's avatar
unknown committed
3939 3940 3941
  return error;
}

unknown's avatar
unknown committed
3942

unknown's avatar
unknown committed
3943 3944
/**
  Delete all logs referred to in the index file.
3945

unknown's avatar
unknown committed
3946
  The new index file will only contain this file.
3947

3948 3949 3950 3951
  @param thd		  Thread id. This can be zero in case of resetting 
                          relay logs
  @param create_new_log   1 if we should start writing to a new log file
  @param next_log_number  min number of next log file to use, if possible.
3952

unknown's avatar
unknown committed
3953 3954
  @note
    If not called from slave thread, write start event to new log
3955

unknown's avatar
unknown committed
3956
  @retval
3957
    0	ok
unknown's avatar
unknown committed
3958
  @retval
3959 3960 3961
    1   error
*/

3962
bool MYSQL_BIN_LOG::reset_logs(THD *thd, bool create_new_log,
3963 3964
                               rpl_gtid *init_state, uint32 init_state_len,
                               ulong next_log_number)
3965 3966
{
  LOG_INFO linfo;
3967
  bool error=0;
3968
  int err;
3969
  const char* save_name;
3970
  DBUG_ENTER("reset_logs");
3971

3972 3973
  if (!is_relay_log)
  {
unknown's avatar
unknown committed
3974 3975 3976 3977 3978 3979
    if (init_state && !is_empty_state())
    {
      my_error(ER_BINLOG_MUST_BE_EMPTY, MYF(0));
      DBUG_RETURN(1);
    }

3980 3981 3982 3983 3984
    /*
      Mark that a RESET MASTER is in progress.
      This ensures that a binlog checkpoint will not try to write binlog
      checkpoint events, which would be useless (as we are deleting the binlog
      anyway) and could deadlock, as we are holding LOCK_log.
3985 3986 3987 3988

      Wait for any mark_xid_done() calls that might be already running to
      complete (mark_xid_done_waiting counter to drop to zero); we need to
      do this before we take the LOCK_log to not deadlock.
3989 3990
    */
    mysql_mutex_lock(&LOCK_xid_list);
3991
    reset_master_pending++;
3992 3993
    while (mark_xid_done_waiting > 0)
      mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
3994
    mysql_mutex_unlock(&LOCK_xid_list);
3995
  }
3996

3997
  DEBUG_SYNC_C_IF_THD(thd, "reset_logs_after_set_reset_master_pending");
3998 3999 4000 4001 4002 4003 4004 4005 4006
  /*
    We need to get both locks to be sure that no one is trying to
    write to the index log file.
  */
  mysql_mutex_lock(&LOCK_log);
  mysql_mutex_lock(&LOCK_index);

  if (!is_relay_log)
  {
4007 4008
    /*
      We are going to nuke all binary log files.
4009 4010 4011
      Without binlog, we cannot XA recover prepared-but-not-committed
      transactions in engines. So force a commit checkpoint first.

4012 4013
      Note that we take and immediately
      release LOCK_after_binlog_sync/LOCK_commit_ordered. This has
4014 4015 4016 4017 4018 4019 4020 4021 4022
      the effect to ensure that any on-going group commit (in
      trx_group_commit_leader()) has completed before we request the checkpoint,
      due to the chaining of LOCK_log and LOCK_commit_ordered in that function.
      (We are holding LOCK_log, so no new group commit can start).

      Without this, it is possible (though perhaps unlikely) that the RESET
      MASTER could run in-between the write to the binlog and the
      commit_ordered() in the engine of some transaction, and then a crash
      later would leave such transaction not recoverable.
4023
    */
4024 4025

    mysql_mutex_lock(&LOCK_after_binlog_sync);
4026
    mysql_mutex_lock(&LOCK_commit_ordered);
4027
    mysql_mutex_unlock(&LOCK_after_binlog_sync);
4028 4029 4030 4031 4032 4033
    mysql_mutex_unlock(&LOCK_commit_ordered);

    mark_xids_active(current_binlog_id, 1);
    do_checkpoint_request(current_binlog_id);

    /* Now wait for all checkpoint requests and pending unlog() to complete. */
4034 4035 4036
    mysql_mutex_lock(&LOCK_xid_list);
    for (;;)
    {
4037 4038
      if (is_xidlist_idle_nolock())
        break;
4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052
      /*
        Wait until signalled that one more binlog dropped to zero, then check
        again.
      */
      mysql_cond_wait(&COND_xid_list, &LOCK_xid_list);
    }

    /*
      Now all XIDs are fully flushed to disk, and we are holding LOCK_log so
      no new ones will be written. So we can proceed to delete the logs.
    */
    mysql_mutex_unlock(&LOCK_xid_list);
  }

4053 4054 4055 4056 4057 4058
  /*
    The following mutex is needed to ensure that no threads call
    'delete thd' as we would then risk missing a 'rollback' from this
    thread. If the transaction involved MyISAM tables, it should go
    into binlog even on rollback.
  */
Marc Alff's avatar
Marc Alff committed
4059
  mysql_mutex_lock(&LOCK_thread_count);
4060 4061 4062 4063

  /* Save variables so that we can reopen the log */
  save_name=name;
  name=0;					// Protect against free
4064
  close(LOG_CLOSE_TO_BE_OPENED);
4065

4066 4067 4068 4069 4070
  /*
    First delete all old log files and then update the index file.
    As we first delete the log files and do not use sort of logging,
    a crash may lead to an inconsistent state where the index has
    references to non-existent files.
4071

4072 4073 4074
    We need to invert the steps and use the purge_index_file methods
    in order to make the operation safe.
  */
4075

4076
  if ((err= find_log_pos(&linfo, NullS, 0)) != 0)
4077
  {
4078 4079
    uint errcode= purge_log_get_error_code(err);
    sql_print_error("Failed to locate old binlog or relay log files");
4080
    my_message(errcode, ER_THD_OR_DEFAULT(thd, errcode), MYF(0));
4081
    error= 1;
4082 4083
    goto err;
  }
4084

unknown's avatar
unknown committed
4085
  for (;;)
4086
  {
4087
    if ((error= my_delete(linfo.log_file_name, MYF(0))) != 0)
4088 4089 4090
    {
      if (my_errno == ENOENT) 
      {
4091 4092 4093 4094 4095 4096
        if (thd)
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                              ER_LOG_PURGE_NO_FILE,
                              ER_THD(thd, ER_LOG_PURGE_NO_FILE),
                              linfo.log_file_name);

4097 4098 4099 4100 4101 4102 4103
        sql_print_information("Failed to delete file '%s'",
                              linfo.log_file_name);
        my_errno= 0;
        error= 0;
      }
      else
      {
4104 4105 4106 4107 4108 4109 4110 4111
        if (thd)
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with deleting %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              linfo.log_file_name);
4112 4113 4114 4115
        error= 1;
        goto err;
      }
    }
4116
    if (find_next_log(&linfo, 0))
4117 4118
      break;
  }
4119

unknown's avatar
unknown committed
4120
  if (!is_relay_log)
unknown's avatar
unknown committed
4121
  {
unknown's avatar
unknown committed
4122 4123 4124 4125
    if (init_state)
      rpl_global_gtid_binlog_state.load(init_state, init_state_len);
    else
      rpl_global_gtid_binlog_state.reset();
unknown's avatar
unknown committed
4126
  }
unknown's avatar
unknown committed
4127

4128
  /* Start logging with a new file */
4129
  close(LOG_CLOSE_INDEX | LOG_CLOSE_TO_BE_OPENED);
4130
  if ((error= my_delete(index_file_name, MYF(0))))	// Reset (open will update)
4131 4132 4133
  {
    if (my_errno == ENOENT) 
    {
4134 4135 4136 4137 4138
      if (thd)
        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                            ER_LOG_PURGE_NO_FILE,
                            ER_THD(thd, ER_LOG_PURGE_NO_FILE),
                            index_file_name);
4139 4140 4141 4142 4143 4144 4145
      sql_print_information("Failed to delete file '%s'",
                            index_file_name);
      my_errno= 0;
      error= 0;
    }
    else
    {
4146 4147 4148 4149 4150 4151 4152 4153
      if (thd)
        push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
                            ER_BINLOG_PURGE_FATAL_ERR,
                            "a problem with deleting %s; "
                            "consider examining correspondence "
                            "of your binlog index file "
                            "to the actual binlog files",
                            index_file_name);
4154 4155 4156 4157
      error= 1;
      goto err;
    }
  }
4158
  if (create_new_log && !open_index_file(index_file_name, 0, FALSE))
4159 4160
    if ((error= open(save_name, log_type, 0, next_log_number,
                     io_cache_type, max_size, 0, FALSE)))
4161
      goto err;
4162
  my_free((void *) save_name);
4163

4164
err:
4165 4166
  if (error == 1)
    name= const_cast<char*>(save_name);
Marc Alff's avatar
Marc Alff committed
4167
  mysql_mutex_unlock(&LOCK_thread_count);
4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188

  if (!is_relay_log)
  {
    xid_count_per_binlog *b;
    /*
      Remove all entries in the xid_count list except the last.
      Normally we will just be deleting all the entries that we waited for to
      drop to zero above. But if we fail during RESET MASTER for some reason
      then we will not have created any new log file, and we may keep the last
      of the old entries.
    */
    mysql_mutex_lock(&LOCK_xid_list);
    for (;;)
    {
      b= binlog_xid_count_list.head();
      DBUG_ASSERT(b /* List can never become empty. */);
      if (b->binlog_id == current_binlog_id)
        break;
      DBUG_ASSERT(b->xid_count == 0);
      my_free(binlog_xid_count_list.get());
    }
4189
    reset_master_pending--;
4190 4191 4192
    mysql_mutex_unlock(&LOCK_xid_list);
  }

Marc Alff's avatar
Marc Alff committed
4193 4194
  mysql_mutex_unlock(&LOCK_index);
  mysql_mutex_unlock(&LOCK_log);
4195
  DBUG_RETURN(error);
4196 4197
}

unknown's avatar
unknown committed
4198

unknown's avatar
unknown committed
4199
/**
  Delete relay log files prior to rli->group_relay_log_name
  (i.e. all logs which are not involved in a non-finished group
  (transaction)), remove them from the index file and start on next
  relay log.

  IMPLEMENTATION
  - Protects index file with LOCK_index
  - Delete relevant relay log files
  - Copy all file names after these ones to the front of the index file
  - If the OS has truncate, truncate the file, else fill it with '\n'
  - Read the next file name from the index file and store in rli->linfo

  @param rli	       Relay log information
  @param included     If false, all relay logs that are strictly before
                      rli->group_relay_log_name are deleted ; if true, the
                      latter is deleted too (i.e. all relay logs
                      read by the SQL slave thread are deleted).

  @note
    - This is only called from the slave-execute thread when it has read
    all commands from a relay log and wants to switch to a new relay log.
    - When this happens, we can be in an active transaction as
    a transaction can span over two relay logs
    (although it is always written as a single block to the master's binary
    log, hence cannot span over two master's binary logs).

  @retval
    0			ok
  @retval
    LOG_INFO_EOF	        End of log-index-file found
  @retval
    LOG_INFO_SEEK	Could not allocate IO cache
  @retval
    LOG_INFO_IO		Got IO error while reading file
*/

#ifdef HAVE_REPLICATION

int MYSQL_BIN_LOG::purge_first_log(Relay_log_info* rli, bool included)
{
  int error;
  char *purge_target= NULL;
  inuse_relaylog *ir;
  ulonglong reclaimed_bytes= 0;
  DBUG_ENTER("purge_first_log");

  DBUG_ASSERT(is_open());
  DBUG_ASSERT(rli->slave_running == MYSQL_SLAVE_RUN_NOT_CONNECT);
  DBUG_ASSERT(!strcmp(rli->linfo.log_file_name,rli->event_relay_log_name));

  mysql_mutex_lock(&LOCK_index);

  /*
    Walk the list of in-use relay logs from the front and free the entries
    that are completed and fully dequeued. The walk stops at the first entry
    that is still in use (which also forces included to false), or at
    rli->group_relay_log_name when that log must not be purged.
  */
  for (ir= rli->inuse_relaylog_list; ir; )
  {
    inuse_relaylog *following= ir->next;
    if (!ir->completed || ir->dequeued_count < ir->queued_count)
    {
      included= false;
      break;
    }
    if (!included && !strcmp(ir->name, rli->group_relay_log_name))
      break;
    if (!following)
    {
      /* Freeing the last entry: remember its name and purge it as well */
      rli->last_inuse_relaylog= NULL;
      included= 1;
      purge_target= my_strdup(ir->name, MYF(0));
    }
    rli->free_inuse_relaylog(ir);
    ir= following;
  }
  rli->inuse_relaylog_list= ir;
  if (ir)
    purge_target= my_strdup(ir->name, MYF(0));

  /*
    Look up the next log file name in the index file and hand it back to
    the caller through rli->linfo.
  */
  error= find_log_pos(&rli->linfo, rli->event_relay_log_name, 0);
  if (!error)
    error= find_next_log(&rli->linfo, 0);
  if (error)
  {
    char buff[22];
    sql_print_error("next log error: %d  offset: %s  log: %s included: %d",
                    error,
                    llstr(rli->linfo.index_file_offset,buff),
                    rli->event_relay_log_name,
                    included);
    goto err;
  }

  /*
    Point rli's event coordinates at the start of the log just found.
  */
  rli->event_relay_log_pos= BIN_LOG_HEADER_SIZE;
  strmake_buf(rli->event_relay_log_name,rli->linfo.log_file_name);

  /*
    If the rli->group_relay_log_name file is being removed, the rli->group*
    coordinates must be updated too. Otherwise they are left alone, as the
    group's execution is not finished (e.g. COMMIT not executed).
  */
  if (included)
  {
    rli->group_relay_log_pos = BIN_LOG_HEADER_SIZE;
    strmake_buf(rli->group_relay_log_name,rli->linfo.log_file_name);
    rli->notify_group_relay_log_name_update();
  }

  /* Store where we are in the new file for the execution thread */
  flush_relay_log_info(rli);

  DBUG_EXECUTE_IF("crash_before_purge_logs", DBUG_SUICIDE(););

  rli->relay_log.purge_logs(purge_target, included,
                            0, 0, &reclaimed_bytes);

  mysql_mutex_lock(&rli->log_space_lock);
  rli->log_space_total-= reclaimed_bytes;
  mysql_cond_broadcast(&rli->log_space_cond);
  mysql_mutex_unlock(&rli->log_space_lock);

  /*
    The log position was fetched before purge_logs() was called, so it has
    to be looked up again now.
  */
  error= find_log_pos(&rli->linfo, rli->event_relay_log_name, 0);
  if (error)
  {
    char buff[22];
    sql_print_error("next log error: %d  offset: %s  log: %s included: %d",
                    error,
                    llstr(rli->linfo.index_file_offset,buff),
                    rli->group_relay_log_name,
                    included);
    goto err;
  }

  /* If included was passed, rli->linfo should be the first entry. */
  DBUG_ASSERT(!included || rli->linfo.index_file_start_offset == 0);

err:
  my_free(purge_target);
  mysql_mutex_unlock(&LOCK_index);
  DBUG_RETURN(error);
}

unknown's avatar
unknown committed
4347 4348
/**
  Update log index_file.
unknown's avatar
unknown committed
4349 4350
*/

4351
int MYSQL_BIN_LOG::update_log_index(LOG_INFO* log_info, bool need_update_threads)
unknown's avatar
unknown committed
4352 4353 4354 4355 4356
{
  if (copy_up_file_and_fill(&index_file, log_info->index_file_start_offset))
    return LOG_INFO_IO;

  // now update offsets in index file for running threads
4357 4358
  if (need_update_threads)
    adjust_linfo_offsets(log_info->index_file_start_offset);
unknown's avatar
unknown committed
4359 4360
  return 0;
}
unknown's avatar
unknown committed
4361

unknown's avatar
unknown committed
4362
/**
4363 4364
  Remove all logs before the given log from disk and from the index file.

unknown's avatar
unknown committed
4365 4366 4367 4368 4369
  @param to_log	      Delete all log file name before this file.
  @param included            If true, to_log is deleted too.
  @param need_mutex
  @param need_update_threads If we want to update the log coordinates of
                             all threads. False for relay logs, true otherwise.
4370 4371
  @param reclaimeed_log_space If not null, increment this variable to
                              the amount of log space freed
4372

unknown's avatar
unknown committed
4373
  @note
4374 4375 4376
    If any of the logs before the deleted one is in use,
    only purge logs up to this one.

unknown's avatar
unknown committed
4377 4378 4379
  @retval
    0			ok
  @retval
4380
    LOG_INFO_EOF		to_log not found
4381
    LOG_INFO_EMFILE             too many files opened
4382
    LOG_INFO_FATAL              if any other than ENOENT error from
Marc Alff's avatar
Marc Alff committed
4383
                                mysql_file_stat() or mysql_file_delete()
4384 4385
*/

4386
int MYSQL_BIN_LOG::purge_logs(const char *to_log, 
4387 4388 4389 4390
                              bool included,
                              bool need_mutex, 
                              bool need_update_threads, 
                              ulonglong *reclaimed_space)
unknown's avatar
unknown committed
4391
{
4392
  int error= 0;
4393
  bool exit_loop= 0;
4394
  LOG_INFO log_info;
4395
  THD *thd= current_thd;
4396
  DBUG_ENTER("purge_logs");
4397
  DBUG_PRINT("info",("to_log= %s",to_log));
4398

4399
  if (need_mutex)
Marc Alff's avatar
Marc Alff committed
4400
    mysql_mutex_lock(&LOCK_index);
4401 4402
  if ((error=find_log_pos(&log_info, to_log, 0 /*no mutex*/))) 
  {
4403
    sql_print_error("MYSQL_BIN_LOG::purge_logs was called with file %s not "
4404
                    "listed in the index.", to_log);
4405
    goto err;
4406 4407
  }

4408
  if ((error= open_purge_index_file(TRUE)))
4409
  {
4410 4411
    sql_print_error("MYSQL_BIN_LOG::purge_logs failed to sync the index file.");
    goto err;
4412
  }
4413

4414
  /*
4415
    File name exists in index file; delete until we find this file
4416 4417 4418
    or a file that is used.
  */
  if ((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/)))
4419
    goto err;
4420
  while ((strcmp(to_log,log_info.log_file_name) || (exit_loop=included)) &&
4421
         can_purge_log(log_info.log_file_name))
4422
  {
4423
    if ((error= register_purge_index_entry(log_info.log_file_name)))
4424
    {
4425
      sql_print_error("MYSQL_BIN_LOG::purge_logs failed to copy %s to register file.",
4426 4427 4428 4429 4430 4431
                      log_info.log_file_name);
      goto err;
    }

    if (find_next_log(&log_info, 0) || exit_loop)
      break;
4432 4433
  }

4434
  DBUG_EXECUTE_IF("crash_purge_before_update_index", DBUG_SUICIDE(););
4435 4436 4437 4438 4439 4440

  if ((error= sync_purge_index_file()))
  {
    sql_print_error("MSYQL_BIN_LOG::purge_logs failed to flush register file.");
    goto err;
  }
4441 4442

  /* We know how many files to delete. Update index file. */
4443
  if ((error=update_log_index(&log_info, need_update_threads)))
4444
  {
4445
    sql_print_error("MSYQL_BIN_LOG::purge_logs failed to update the index file");
4446 4447 4448
    goto err;
  }

4449
  DBUG_EXECUTE_IF("crash_purge_critical_after_update_index", DBUG_SUICIDE(););
4450 4451 4452 4453

err:
  /* Read each entry from purge_index_file and delete the file. */
  if (is_inited_purge_index_file() &&
4454
      (error= purge_index_entry(thd, reclaimed_space, FALSE)))
4455 4456 4457 4458
    sql_print_error("MSYQL_BIN_LOG::purge_logs failed to process registered files"
                    " that would be purged.");
  close_purge_index_file();

4459
  DBUG_EXECUTE_IF("crash_purge_non_critical_after_update_index", DBUG_SUICIDE(););
4460 4461

  if (need_mutex)
Marc Alff's avatar
Marc Alff committed
4462
    mysql_mutex_unlock(&LOCK_index);
4463 4464
  DBUG_RETURN(error);
}
4465

4466 4467 4468 4469 4470 4471 4472
int MYSQL_BIN_LOG::set_purge_index_file_name(const char *base_file_name)
{
  int error= 0;
  DBUG_ENTER("MYSQL_BIN_LOG::set_purge_index_file_name");
  if (fn_format(purge_index_file_name, base_file_name, mysql_data_home,
                ".~rec~", MYF(MY_UNPACK_FILENAME | MY_SAFE_PATH |
                              MY_REPLACE_EXT)) == NULL)
4473
  {
4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511
    error= 1;
    sql_print_error("MYSQL_BIN_LOG::set_purge_index_file_name failed to set "
                      "file name.");
  }
  DBUG_RETURN(error);
}

int MYSQL_BIN_LOG::open_purge_index_file(bool destroy)
{
  int error= 0;
  File file= -1;

  DBUG_ENTER("MYSQL_BIN_LOG::open_purge_index_file");

  if (destroy)
    close_purge_index_file();

  if (!my_b_inited(&purge_index_file))
  {
    if ((file= my_open(purge_index_file_name, O_RDWR | O_CREAT | O_BINARY,
                       MYF(MY_WME | ME_WAITTANG))) < 0  ||
        init_io_cache(&purge_index_file, file, IO_SIZE,
                      (destroy ? WRITE_CACHE : READ_CACHE),
                      0, 0, MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
    {
      error= 1;
      sql_print_error("MYSQL_BIN_LOG::open_purge_index_file failed to open register "
                      " file.");
    }
  }
  DBUG_RETURN(error);
}

int MYSQL_BIN_LOG::close_purge_index_file()
{
  int error= 0;

  DBUG_ENTER("MYSQL_BIN_LOG::close_purge_index_file");
4512

4513
  if (my_b_inited(&purge_index_file))
4514
  {
4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525
    end_io_cache(&purge_index_file);
    error= my_close(purge_index_file.file, MYF(0));
  }
  my_delete(purge_index_file_name, MYF(0));
  bzero((char*) &purge_index_file, sizeof(purge_index_file));

  DBUG_RETURN(error);
}

bool MYSQL_BIN_LOG::is_inited_purge_index_file()
{
4526
  return my_b_inited(&purge_index_file);
4527 4528 4529 4530 4531 4532 4533 4534
}

int MYSQL_BIN_LOG::sync_purge_index_file()
{
  int error= 0;
  DBUG_ENTER("MYSQL_BIN_LOG::sync_purge_index_file");

  if ((error= flush_io_cache(&purge_index_file)) ||
4535
      (error= my_sync(purge_index_file.file, MYF(MY_WME|MY_SYNC_FILESIZE))))
4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558
    DBUG_RETURN(error);

  DBUG_RETURN(error);
}

int MYSQL_BIN_LOG::register_purge_index_entry(const char *entry)
{
  int error= 0;
  DBUG_ENTER("MYSQL_BIN_LOG::register_purge_index_entry");

  if ((error=my_b_write(&purge_index_file, (const uchar*)entry, strlen(entry))) ||
      (error=my_b_write(&purge_index_file, (const uchar*)"\n", 1)))
    DBUG_RETURN (error);

  DBUG_RETURN(error);
}

int MYSQL_BIN_LOG::register_create_index_entry(const char *entry)
{
  DBUG_ENTER("MYSQL_BIN_LOG::register_create_index_entry");
  DBUG_RETURN(register_purge_index_entry(entry));
}

4559
int MYSQL_BIN_LOG::purge_index_entry(THD *thd, ulonglong *reclaimed_space,
4560 4561
                                     bool need_mutex)
{
4562
  DBUG_ENTER("MYSQL_BIN_LOG:purge_index_entry");
4563 4564 4565 4566 4567 4568 4569 4570 4571 4572
  MY_STAT s;
  int error= 0;
  LOG_INFO log_info;
  LOG_INFO check_log_info;

  DBUG_ASSERT(my_b_inited(&purge_index_file));

  if ((error=reinit_io_cache(&purge_index_file, READ_CACHE, 0, 0, 0)))
  {
    sql_print_error("MSYQL_BIN_LOG::purge_index_entry failed to reinit register file "
4573 4574 4575 4576 4577 4578 4579 4580
                    "for read");
    goto err;
  }

  for (;;)
  {
    uint length;

4581
    if ((length=my_b_gets(&purge_index_file, log_info.log_file_name,
4582 4583
                          FN_REFLEN)) <= 1)
    {
4584
      if (purge_index_file.error)
4585
      {
4586 4587 4588
        error= purge_index_file.error;
        sql_print_error("MSYQL_BIN_LOG::purge_index_entry error %d reading from "
                        "register file.", error);
4589 4590 4591 4592 4593 4594 4595 4596 4597 4598
        goto err;
      }

      /* Reached EOF */
      break;
    }

    /* Get rid of the trailing '\n' */
    log_info.log_file_name[length-1]= 0;

4599
    if (!mysql_file_stat(m_key_file_log, log_info.log_file_name, &s, MYF(0)))
4600
    {
4601 4602 4603 4604 4605
      if (my_errno == ENOENT) 
      {
        /*
          It's not fatal if we can't stat a log file that does not exist;
          If we could not stat, we won't delete.
4606
        */
4607 4608
        if (thd)
        {
4609
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4610
                              ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
4611 4612
                              log_info.log_file_name);
        }
Marc Alff's avatar
Marc Alff committed
4613
        sql_print_information("Failed to execute mysql_file_stat on file '%s'",
4614
			      log_info.log_file_name);
4615 4616 4617 4618 4619 4620 4621
        my_errno= 0;
      }
      else
      {
        /*
          Other than ENOENT are fatal
        */
4622 4623
        if (thd)
        {
4624
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with getting info on being purged %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              log_info.log_file_name);
        }
        else
        {
          sql_print_information("Failed to delete log file '%s'; "
                                "consider examining correspondence "
                                "of your binlog index file "
                                "to the actual binlog files",
                                log_info.log_file_name);
        }
4640 4641 4642 4643 4644 4645
        error= LOG_INFO_FATAL;
        goto err;
      }
    }
    else
    {
4646
      if ((error= find_log_pos(&check_log_info, log_info.log_file_name, need_mutex)))
4647
      {
4648
        if (error != LOG_INFO_EOF)
4649
        {
4650 4651
          if (thd)
          {
4652
            push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4653 4654 4655
                                ER_BINLOG_PURGE_FATAL_ERR,
                                "a problem with deleting %s and "
                                "reading the binlog index file",
4656 4657
                                log_info.log_file_name);
          }
4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671
          else
          {
            sql_print_information("Failed to delete file '%s' and "
                                  "read the binlog index file",
                                  log_info.log_file_name);
          }
          goto err;
        }
           
        error= 0;

        DBUG_PRINT("info",("purging %s",log_info.log_file_name));
        if (!my_delete(log_info.log_file_name, MYF(0)))
        {
4672 4673
          if (reclaimed_space)
            *reclaimed_space+= s.st_size;
4674 4675 4676
        }
        else
        {
4677
          if (my_errno == ENOENT)
4678
          {
4679 4680
            if (thd)
            {
4681
              push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4682
                                  ER_LOG_PURGE_NO_FILE, ER_THD(thd, ER_LOG_PURGE_NO_FILE),
4683 4684 4685 4686 4687
                                  log_info.log_file_name);
            }
            sql_print_information("Failed to delete file '%s'",
                                  log_info.log_file_name);
            my_errno= 0;
4688 4689 4690
          }
          else
          {
4691 4692
            if (thd)
            {
4693
              push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4694 4695
                                  ER_BINLOG_PURGE_FATAL_ERR,
                                  "a problem with deleting %s; "
4696 4697 4698 4699
                                  "consider examining correspondence "
                                  "of your binlog index file "
                                  "to the actual binlog files",
                                  log_info.log_file_name);
4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716
            }
            else
            {
              sql_print_information("Failed to delete file '%s'; "
                                    "consider examining correspondence "
                                    "of your binlog index file "
                                    "to the actual binlog files",
                                    log_info.log_file_name);
            }
            if (my_errno == EMFILE)
            {
              DBUG_PRINT("info",
                         ("my_errno: %d, set ret = LOG_INFO_EMFILE", my_errno));
              error= LOG_INFO_EMFILE;
              goto err;
            }
            error= LOG_INFO_FATAL;
Kristofer Pettersson's avatar
Kristofer Pettersson committed
4717
            goto err;
4718
          }
4719 4720
        }
      }
4721
    }
4722
  }
4723

unknown's avatar
unknown committed
4724
err:
4725
  DBUG_RETURN(error);
unknown's avatar
unknown committed
4726 4727
}

unknown's avatar
unknown committed
4728
/**
4729 4730 4731
  Remove all logs before the given file date from disk and from the
  index file.

unknown's avatar
unknown committed
4732
  @param thd		Thread pointer
4733
  @param purge_time	Delete all log files before given date.
4734

unknown's avatar
unknown committed
4735
  @note
4736 4737 4738
    If any of the logs before the deleted one is in use,
    only purge logs up to this one.

unknown's avatar
unknown committed
4739
  @retval
4740
    0				ok
unknown's avatar
unknown committed
4741
  @retval
4742
    LOG_INFO_PURGE_NO_ROTATE	Binary file that can't be rotated
4743
    LOG_INFO_FATAL              if any other than ENOENT error from
Marc Alff's avatar
Marc Alff committed
4744
                                mysql_file_stat() or mysql_file_delete()
4745 4746
*/

4747
int MYSQL_BIN_LOG::purge_logs_before_date(time_t purge_time)
4748 4749
{
  int error;
4750
  char to_log[FN_REFLEN];
4751 4752
  LOG_INFO log_info;
  MY_STAT stat_area;
4753
  THD *thd= current_thd;
4754 4755
  DBUG_ENTER("purge_logs_before_date");

Marc Alff's avatar
Marc Alff committed
4756
  mysql_mutex_lock(&LOCK_index);
4757
  to_log[0]= 0;
4758 4759

  if ((error=find_log_pos(&log_info, NullS, 0 /*no mutex*/)))
unknown's avatar
unknown committed
4760
    goto err;
4761 4762

  while (strcmp(log_file_name, log_info.log_file_name) &&
4763
	 can_purge_log(log_info.log_file_name))
4764
  {
4765
    if (!mysql_file_stat(m_key_file_log,
Marc Alff's avatar
Marc Alff committed
4766
                         log_info.log_file_name, &stat_area, MYF(0)))
4767 4768 4769 4770 4771
    {
      if (my_errno == ENOENT) 
      {
        /*
          It's not fatal if we can't stat a log file that does not exist.
4772
        */
4773 4774 4775 4776 4777 4778 4779
        my_errno= 0;
      }
      else
      {
        /*
          Other than ENOENT are fatal
        */
4780 4781
        if (thd)
        {
4782
          push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794
                              ER_BINLOG_PURGE_FATAL_ERR,
                              "a problem with getting info on being purged %s; "
                              "consider examining correspondence "
                              "of your binlog index file "
                              "to the actual binlog files",
                              log_info.log_file_name);
        }
        else
        {
          sql_print_information("Failed to delete log file '%s'",
                                log_info.log_file_name);
        }
4795 4796 4797 4798 4799 4800
        error= LOG_INFO_FATAL;
        goto err;
      }
    }
    else
    {
4801
      if (stat_area.st_mtime < purge_time) 
4802
        strmake_buf(to_log, log_info.log_file_name);
4803
      else
4804 4805
        break;
    }
4806 4807
    if (find_next_log(&log_info, 0))
      break;
unknown's avatar
unknown committed
4808
  }
4809

4810
  error= (to_log[0] ? purge_logs(to_log, 1, 0, 1, (ulonglong *) 0) : 0);
4811

unknown's avatar
unknown committed
4812
err:
Marc Alff's avatar
Marc Alff committed
4813
  mysql_mutex_unlock(&LOCK_index);
4814
  DBUG_RETURN(error);
unknown's avatar
unknown committed
4815
}
4816 4817 4818


bool
4819
MYSQL_BIN_LOG::can_purge_log(const char *log_file_name_arg)
4820 4821 4822
{
  xid_count_per_binlog *b;

4823
  if (is_active(log_file_name_arg))
4824 4825 4826 4827 4828
    return false;
  mysql_mutex_lock(&LOCK_xid_list);
  {
    I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
    while ((b= it++) &&
4829
           0 != strncmp(log_file_name_arg+dirname_length(log_file_name_arg),
4830 4831 4832 4833 4834 4835
                        b->binlog_name, b->binlog_name_len))
      ;
  }
  mysql_mutex_unlock(&LOCK_xid_list);
  if (b)
    return false;
4836
  return !log_in_use(log_file_name_arg);
4837
}
unknown's avatar
SCRUM  
unknown committed
4838
#endif /* HAVE_REPLICATION */
4839

4840

4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866
/**
  Locked wrapper around is_xidlist_idle_nolock(): holds LOCK_xid_list
  while the list is examined.

  @return the result of is_xidlist_idle_nolock().
*/
bool
MYSQL_BIN_LOG::is_xidlist_idle()
{
  mysql_mutex_lock(&LOCK_xid_list);
  const bool idle= is_xidlist_idle_nolock();
  mysql_mutex_unlock(&LOCK_xid_list);
  return idle;
}


bool
MYSQL_BIN_LOG::is_xidlist_idle_nolock()
{
  xid_count_per_binlog *b;

  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
  while ((b= it++))
  {
    if (b->xid_count > 0)
      return false;
  }
  return true;
}


unknown's avatar
unknown committed
4867 4868
/**
  Create a new log file name.
4869

unknown's avatar
unknown committed
4870
  @param buf		buf of at least FN_REFLEN where new name is stored
4871

unknown's avatar
unknown committed
4872
  @note
4873 4874 4875
    If file name will be longer then FN_REFLEN it will be truncated
*/

4876
void MYSQL_BIN_LOG::make_log_name(char* buf, const char* log_ident)
unknown's avatar
unknown committed
4877
{
unknown's avatar
unknown committed
4878
  uint dir_len = dirname_length(log_file_name); 
unknown's avatar
unknown committed
4879
  if (dir_len >= FN_REFLEN)
unknown's avatar
unknown committed
4880 4881
    dir_len=FN_REFLEN-1;
  strnmov(buf, log_file_name, dir_len);
unknown's avatar
unknown committed
4882
  strmake(buf+dir_len, log_ident, FN_REFLEN - dir_len -1);
unknown's avatar
unknown committed
4883 4884
}

4885

unknown's avatar
unknown committed
4886 4887
/**
  Check if we are writing/reading to the given log file.
4888 4889
*/

4890
bool MYSQL_BIN_LOG::is_active(const char *log_file_name_arg)
unknown's avatar
unknown committed
4891
{
4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905
  /**
   * there should/must be mysql_mutex_assert_owner(&LOCK_log) here...
   * but code violates this! (scary monsters and super creeps!)
   *
   * example stacktrace:
   * #8  MYSQL_BIN_LOG::is_active
   * #9  MYSQL_BIN_LOG::can_purge_log
   * #10 MYSQL_BIN_LOG::purge_logs
   * #11 MYSQL_BIN_LOG::purge_first_log
   * #12 next_event
   * #13 exec_relay_log_event
   *
   * I didn't investigate if this is ligit...(i.e if my comment is wrong)
   */
unknown's avatar
unknown committed
4906
  return !strcmp(log_file_name, log_file_name_arg);
unknown's avatar
unknown committed
4907 4908
}

4909

4910 4911 4912 4913 4914 4915
/*
  Wrappers around new_file_impl to avoid using argument
  to control locking. The argument 1) less readable 2) breaks
  incapsulation 3) allows external access to the class without
  a lock (which is not possible with private new_file_without_locking
  method).
4916 4917 4918

  @retval
    nonzero - error
4919 4920
*/

4921
int MYSQL_BIN_LOG::new_file()
4922
{
4923
  return new_file_impl(1);
4924 4925
}

4926 4927 4928 4929 4930
/*
  @retval
    nonzero - error
 */
int MYSQL_BIN_LOG::new_file_without_locking()
4931
{
4932
  return new_file_impl(0);
4933 4934 4935
}


unknown's avatar
unknown committed
4936 4937
/**
  Start writing to a new log file or reopen the old file.
4938

unknown's avatar
unknown committed
4939
  @param need_lock		Set to 1 if caller has not locked LOCK_log
4940

4941 4942 4943
  @retval
    nonzero - error

unknown's avatar
unknown committed
4944
  @note
4945 4946 4947
    The new file name is stored last in the index file
*/

4948
int MYSQL_BIN_LOG::new_file_impl(bool need_lock)
unknown's avatar
unknown committed
4949
{
4950 4951
  int error= 0, close_on_error= FALSE;
  char new_name[FN_REFLEN], *new_name_ptr, *old_name, *file_to_open;
4952 4953
  uint close_flag;
  bool delay_close= false;
4954
  File UNINIT_VAR(old_file);
4955
  DBUG_ENTER("MYSQL_BIN_LOG::new_file_impl");
4956

4957
  if (!is_open())
4958 4959
  {
    DBUG_PRINT("info",("log is closed"));
4960
    DBUG_RETURN(error);
4961
  }
4962 4963

  if (need_lock)
Marc Alff's avatar
Marc Alff committed
4964 4965
    mysql_mutex_lock(&LOCK_log);
  mysql_mutex_lock(&LOCK_index);
4966

Marc Alff's avatar
Marc Alff committed
4967 4968
  mysql_mutex_assert_owner(&LOCK_log);
  mysql_mutex_assert_owner(&LOCK_index);
4969

unknown's avatar
unknown committed
4970
  /* Reuse old name if not binlog and not update log */
4971
  new_name_ptr= name;
unknown's avatar
unknown committed
4972

4973
  /*
4974 4975 4976
    If user hasn't specified an extension, generate a new log name
    We have to do this here and not in open as we want to store the
    new file name in the current binary log file.
4977
  */
4978
  if ((error= generate_new_name(new_name, name, 0)))
4979 4980
    goto end;
  new_name_ptr=new_name;
4981

4982
  if (log_type == LOG_BIN)
4983
  {
unknown's avatar
unknown committed
4984
    {
4985
      /*
4986 4987
        We log the whole file name for log file as the user may decide
        to change base names at some point.
4988
      */
4989 4990 4991 4992 4993 4994 4995 4996 4997
      Rotate_log_event r(new_name+dirname_length(new_name), 0, LOG_EVENT_OFFSET,
                         is_relay_log ? Rotate_log_event::RELAY_LOG : 0);
      /* 
         The current relay-log's closing Rotate event must have checksum
         value computed with an algorithm of the last relay-logged FD event.
      */
      if (is_relay_log)
        r.checksum_alg= relay_log_checksum_alg;
      DBUG_ASSERT(!is_relay_log || relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
4998 4999 5000
      if(DBUG_EVALUATE_IF("fault_injection_new_file_rotate_event", (error=close_on_error=TRUE), FALSE) ||
         (error= r.write(&log_file)))
      {
Luis Soares's avatar
Luis Soares committed
5001
        DBUG_EXECUTE_IF("fault_injection_new_file_rotate_event", errno=2;);
5002
        close_on_error= TRUE;
5003 5004 5005
        my_printf_error(ER_ERROR_ON_WRITE,
                        ER_THD_OR_DEFAULT(current_thd, ER_CANT_OPEN_FILE),
                        MYF(ME_FATALERROR), name, errno);
5006 5007
        goto end;
      }
5008
      bytes_written += r.data_written;
unknown's avatar
unknown committed
5009
    }
5010 5011 5012 5013 5014
    /*
      Update needs to be signalled even if there is no rotate event
      log rotation should give the waiting thread a signal to
      discover EOF and move on to the next log.
    */
5015
    signal_update();
5016 5017 5018
  }
  old_name=name;
  name=0;				// Don't free name
5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032
  close_flag= LOG_CLOSE_TO_BE_OPENED | LOG_CLOSE_INDEX;
  if (!is_relay_log)
  {
    /*
      We need to keep the old binlog file open (and marked as in-use) until
      the new one is fully created and synced to disk and index. Otherwise we
      leave a window where if we crash, there is no binlog file marked as
      crashed for server restart to detect the need for recovery.
    */
    old_file= log_file.file;
    close_flag|= LOG_CLOSE_DELAYED_CLOSE;
    delay_close= true;
  }
  close(close_flag);
5033 5034 5035 5036 5037 5038
  if (log_type == LOG_BIN && checksum_alg_reset != BINLOG_CHECKSUM_ALG_UNDEF)
  {
    DBUG_ASSERT(!is_relay_log);
    DBUG_ASSERT(binlog_checksum_options != checksum_alg_reset);
    binlog_checksum_options= checksum_alg_reset;
  }
5039
  /*
5040 5041
     Note that at this point, log_state != LOG_CLOSED
     (important for is_open()).
unknown's avatar
unknown committed
5042
  */
unknown's avatar
unknown committed
5043

5044
  /*
5045
     new_file() is only used for rotation (in FLUSH LOGS or because size >
5046
     max_binlog_size or max_relay_log_size).
5047 5048
     If this is a binary log, the Format_description_log_event at the
     beginning of the new file should have created=0 (to distinguish with the
5049 5050
     Format_description_log_event written at server startup, which should
     trigger temp tables deletion on slaves.
5051
  */
5052

5053
  /* reopen index binlog file, BUG#34582 */
5054 5055 5056 5057 5058 5059
  file_to_open= index_file_name;
  error= open_index_file(index_file_name, 0, FALSE);
  if (!error)
  {
    /* reopen the binary log file. */
    file_to_open= new_name_ptr;
5060
    error= open(old_name, log_type, new_name_ptr, 0, io_cache_type,
5061
                max_size, 1, FALSE);
5062 5063 5064 5065 5066
  }

  /* handle reopening errors */
  if (error)
  {
5067 5068
    my_printf_error(ER_CANT_OPEN_FILE,
                    ER_THD_OR_DEFAULT(current_thd, ER_CANT_OPEN_FILE), 
5069 5070 5071 5072
                    MYF(ME_FATALERROR), file_to_open, error);
    close_on_error= TRUE;
  }

5073
  my_free(old_name);
5074

5075
end:
5076

5077 5078 5079 5080 5081 5082
  if (delay_close)
  {
    clear_inuse_flag_when_closing(old_file);
    mysql_file_close(old_file, MYF(MY_WME));
  }

5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105
  if (error && close_on_error /* rotate or reopen failed */)
  {
    /* 
      Close whatever was left opened.

      We are keeping the behavior as it exists today, ie,
      we disable logging and move on (see: BUG#51014).

      TODO: as part of WL#1790 consider other approaches:
       - kill mysql (safety);
       - try multiple locations for opening a log file;
       - switch server to protected/readonly mode
       - ...
    */
    close(LOG_CLOSE_INDEX);
    sql_print_error("Could not open %s for logging (error %d). "
                     "Turning logging off for the whole duration "
                     "of the MySQL server process. To turn it on "
                     "again: fix the cause, shutdown the MySQL "
                     "server and restart it.", 
                     new_name_ptr, errno);
  }

5106
  if (need_lock)
Marc Alff's avatar
Marc Alff committed
5107 5108
    mysql_mutex_unlock(&LOCK_log);
  mysql_mutex_unlock(&LOCK_index);
5109

5110
  DBUG_RETURN(error);
unknown's avatar
unknown committed
5111 5112
}

5113

5114 5115
/**
  Append an event to the log while holding LOCK_log.

  @param ev  Event to write.

  @return the result of append_no_lock() (nonzero on error).
*/
bool
MYSQL_BIN_LOG::append(Log_event *ev)
{
  mysql_mutex_lock(&LOCK_log);
  const bool failed= append_no_lock(ev);
  mysql_mutex_unlock(&LOCK_log);
  return failed;
}


/**
  Append an event to the log; the caller must already hold LOCK_log.

  The event is written, the log is flushed (and possibly synced), and the
  log is rotated if it has grown beyond max_size.

  @param ev  Event to write.

  @retval 0        ok
  @retval nonzero  the write, the flush or the rotation failed

  @note
    signal_update() is called on both the success and the error path.
*/
bool MYSQL_BIN_LOG::append_no_lock(Log_event* ev)
{
  bool error = 0;
  DBUG_ENTER("MYSQL_BIN_LOG::append");

  mysql_mutex_assert_owner(&LOCK_log);
  DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
  /*
    Log_event::write() is smart enough to use my_b_write() or
    my_b_append() depending on the kind of cache we have.
  */
  if (ev->write(&log_file))
  {
    error=1;
    goto err;
  }
  bytes_written+= ev->data_written;
  DBUG_PRINT("info",("max_size: %lu",max_size));
  if (flush_and_sync(0))
  {
    /* A failed flush must not be reported to the caller as success. */
    error= 1;
    goto err;
  }
  if (my_b_append_tell(&log_file) > max_size)
    error= new_file_without_locking();
err:
  signal_update();				// Safe as we don't call close
  DBUG_RETURN(error);
}

5152

5153
bool MYSQL_BIN_LOG::appendv(const char* buf, uint len,...)
5154
{
5155
  bool error= 0;
5156
  DBUG_ENTER("MYSQL_BIN_LOG::appendv");
5157 5158
  va_list(args);
  va_start(args,len);
5159

5160
  DBUG_ASSERT(log_file.type == SEQ_READ_APPEND);
5161

Marc Alff's avatar
Marc Alff committed
5162
  mysql_mutex_assert_owner(&LOCK_log);
5163 5164
  do
  {
5165
    if (my_b_append(&log_file,(uchar*) buf,len))
5166
    {
5167 5168
      error= 1;
      goto err;
5169
    }
unknown's avatar
unknown committed
5170
    bytes_written += len;
5171
  } while ((buf=va_arg(args,const char*)) && (len=va_arg(args,uint)));
unknown's avatar
unknown committed
5172
  DBUG_PRINT("info",("max_size: %lu",max_size));
5173 5174
  if (flush_and_sync(0))
    goto err;
5175
  if (my_b_append_tell(&log_file) > max_size)
5176
    error= new_file_without_locking();
5177
err:
5178 5179
  if (!error)
    signal_update();
5180
  DBUG_RETURN(error);
5181
}
unknown's avatar
unknown committed
5182

5183
bool MYSQL_BIN_LOG::flush_and_sync(bool *synced)
5184
{
5185
  int err=0, fd=log_file.file;
5186 5187
  if (synced)
    *synced= 0;
Marc Alff's avatar
Marc Alff committed
5188
  mysql_mutex_assert_owner(&LOCK_log);
5189 5190
  if (flush_io_cache(&log_file))
    return 1;
5191 5192
  uint sync_period= get_sync_period();
  if (sync_period && ++sync_counter >= sync_period)
5193
  {
5194
    sync_counter= 0;
Sergei Golubchik's avatar
Sergei Golubchik committed
5195
    err= mysql_file_sync(fd, MYF(MY_WME|MY_SYNC_FILESIZE));
5196 5197
    if (synced)
      *synced= 1;
5198
#ifndef DBUG_OFF
unknown's avatar
unknown committed
5199 5200
    if (opt_binlog_dbug_fsync_sleep > 0)
      my_sleep(opt_binlog_dbug_fsync_sleep);
5201
#endif
5202
  }
5203
  return err;
5204 5205
}

5206
/**
  Switch the thread into "union events" mode and reset its bookkeeping.

  @param thd             Thread whose binlog_evt_union state is set up.
  @param query_id_param  Stored as the first query id of the union.
*/
void MYSQL_BIN_LOG::start_union_events(THD *thd, query_id_t query_id_param)
{
  DBUG_ASSERT(!thd->binlog_evt_union.do_union);

  thd->binlog_evt_union.first_query_id= query_id_param;
  thd->binlog_evt_union.unioned_events= FALSE;
  thd->binlog_evt_union.unioned_events_trans= FALSE;
  thd->binlog_evt_union.do_union= TRUE;
}

5215
/**
  Leave "union events" mode; it must currently be active for the thread.

  @param thd  Thread whose binlog_evt_union.do_union flag is cleared.
*/
void MYSQL_BIN_LOG::stop_union_events(THD *thd)
{
  DBUG_ASSERT(thd->binlog_evt_union.do_union);

  thd->binlog_evt_union.do_union= FALSE;
}

5221
/**
  Check whether a query belongs to the thread's current event union.

  @param thd             Thread to inspect.
  @param query_id_param  Query id to test.

  @return true if union mode is active and query_id_param is not smaller
          than the first query id of the union.
*/
bool MYSQL_BIN_LOG::is_query_in_union(THD *thd, query_id_t query_id_param)
{
  if (!thd->binlog_evt_union.do_union)
    return false;
  return query_id_param >= thd->binlog_evt_union.first_query_id;
}

5227
/** 
5228
  This function checks if a transactional table was updated by the
5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240
  current transaction.

  @param thd The client thread that executed the current statement.
  @return
    @c true if a transactional table was updated, @c false otherwise.
*/
bool
trans_has_updated_trans_table(const THD* thd)
{
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

5241
  return (cache_mngr ? !cache_mngr->trx_cache.empty() : 0);
5242 5243 5244
}

/** 
5245
  This function checks if a transactional table was updated by the
5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256
  current statement.

  @param thd The client thread that executed the current statement.
  @return
    @c true if a transactional table was updated, @c false otherwise.
*/
bool
stmt_has_updated_trans_table(const THD *thd)
{
  Ha_trx_info *ha_info;

5257 5258
  for (ha_info= thd->transaction.stmt.ha_list; ha_info;
       ha_info= ha_info->next())
5259 5260 5261 5262 5263 5264 5265 5266 5267
  {
    if (ha_info->is_trx_read_write() && ha_info->ht() != binlog_hton)
      return (TRUE);
  }
  return (FALSE);
}

/** 
  This function checks if either a trx-cache or a non-trx-cache should
5268 5269 5270
  be used. If @c bin_log_direct_non_trans_update is active or the format
  is either MIXED or ROW, the cache to be used depends on the flag @c
  is_transactional. 
5271

5272 5273 5274 5275
  On the other hand, if binlog_format is STMT or direct option is
  OFF, the trx-cache should be used if and only if the statement is
  transactional or the trx-cache is not empty. Otherwise, the
  non-trx-cache should be used.
5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287

  @param thd              The client thread.
  @param is_transactional The changes are related to a trx-table.
  @return
    @c true if a trx-cache should be used, @c false otherwise.
*/
bool use_trans_cache(const THD* thd, bool is_transactional)
{
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  return
5288
    ((thd->is_current_stmt_binlog_format_row() ||
5289
     thd->variables.binlog_direct_non_trans_update) ? is_transactional :
5290
     (is_transactional || !cache_mngr->trx_cache.empty()));
5291
}
5292

5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304
/**
  This function checks if a transaction, either a multi-statement
  or a single statement transaction is about to commit or not.

  @param thd The client thread that executed the current statement.
  @param all Committing a transaction (i.e. TRUE) or a statement
             (i.e. FALSE).
  @return
    @c true if committing a transaction, otherwise @c false.
*/
bool ending_trans(THD* thd, const bool all)
{
  if (all)
    return true;
  return ending_single_stmt_trans(thd, all);
}

/**
  This function checks if a single statement transaction is about
  to commit or not.

  @param thd The client thread that executed the current statement.
  @param all Committing a transaction (i.e. TRUE) or a statement
             (i.e. FALSE).
  @return
    @c true if committing a single statement transaction, otherwise
    @c false.
*/
bool ending_single_stmt_trans(THD* thd, const bool all)
{
  return !(all || thd->in_multi_stmt_transaction_mode());
}

/**
  This function checks if a non-transactional table was updated by
  the current transaction, looking at both the transaction-level and the
  statement-level flag.

  @param thd The client thread that executed the current statement.
  @return
    @c true if a non-transactional table was updated, @c false
    otherwise.
*/
bool trans_has_updated_non_trans_table(const THD* thd)
{
  if (thd->transaction.all.modified_non_trans_table)
    return true;
  return thd->transaction.stmt.modified_non_trans_table;
}

/**
  This function checks if a non-transactional table was updated by the
  current statement (statement-level flag only).

  @param thd The client thread that executed the current statement.
  @return
    @c true if a non-transactional table was updated, @c false otherwise.
*/
bool stmt_has_updated_non_trans_table(const THD* thd)
{
  const bool modified= thd->transaction.stmt.modified_non_trans_table;
  return modified;
}

5352 5353 5354 5355 5356
/*
  These functions are placed in this file since they need access to
  binlog_hton, which has internal linkage.
*/

5357
/**
  Return the thread's binlog cache manager, creating it on first use.

  On first use the manager is allocated zero-filled, its statement and
  transaction caches are opened as cached temporary files, it is
  registered as the binlog handlerton data of the thread and finally
  constructed in place.

  @return the cache manager, or 0 if the allocation or the opening of one
          of the caches failed.
*/
binlog_cache_mngr *THD::binlog_setup_trx_data()
{
  DBUG_ENTER("THD::binlog_setup_trx_data");
  binlog_cache_mngr *cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);

  if (cache_mngr)
    DBUG_RETURN(cache_mngr);                             // Already set up

  cache_mngr= (binlog_cache_mngr*) my_malloc(sizeof(binlog_cache_mngr), MYF(MY_ZEROFILL));
  if (!cache_mngr)
    DBUG_RETURN(0);                      // Didn't manage to set it up

  if (open_cached_file(&cache_mngr->stmt_cache.cache_log, mysql_tmpdir,
                       LOG_PREFIX, binlog_stmt_cache_size, MYF(MY_WME)))
  {
    my_free(cache_mngr);
    DBUG_RETURN(0);                      // Didn't manage to set it up
  }

  if (open_cached_file(&cache_mngr->trx_cache.cache_log, mysql_tmpdir,
                       LOG_PREFIX, binlog_cache_size, MYF(MY_WME)))
  {
    /* Release the statement cache that was opened just above. */
    close_cached_file(&cache_mngr->stmt_cache.cache_log);
    my_free(cache_mngr);
    DBUG_RETURN(0);                      // Didn't manage to set it up
  }
  thd_set_ha_data(this, binlog_hton, cache_mngr);

  cache_mngr= new (cache_mngr)
              binlog_cache_mngr(max_binlog_stmt_cache_size,
                                max_binlog_cache_size,
                                &binlog_stmt_cache_use,
                                &binlog_stmt_cache_disk_use,
                                &binlog_cache_use,
                                &binlog_cache_disk_use);
  DBUG_RETURN(cache_mngr);
}

5388
/*
5389 5390
  Function to start a statement and optionally a transaction for the
  binary log.
5391

5392 5393
  SYNOPSIS
    binlog_start_trans_and_stmt()
5394

5395 5396 5397 5398 5399 5400
  DESCRIPTION

    This function does three things:
    - Start a transaction if not in autocommit mode or if a BEGIN
      statement has been seen.

5401
    - Start a statement transaction to allow us to truncate the cache.
5402 5403

    - Save the currrent binlog position so that we can roll back the
5404
      statement by truncating the cache.
5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417

      We only update the saved position if the old one was undefined,
      the reason is that there are some cases (e.g., for CREATE-SELECT)
      where the position is saved twice (e.g., both in
      select_create::prepare() and THD::binlog_write_table_map()) , but
      we should use the first. This means that calls to this function
      can be used to start the statement before the first table map
      event, to include some extra events.
 */

void
THD::binlog_start_trans_and_stmt()
{
5418
  binlog_cache_mngr *cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
unknown's avatar
unknown committed
5419
  DBUG_ENTER("binlog_start_trans_and_stmt");
5420 5421 5422
  DBUG_PRINT("enter", ("cache_mngr: %p  cache_mngr->trx_cache.get_prev_position(): %lu",
                       cache_mngr,
                       (cache_mngr ? (ulong) cache_mngr->trx_cache.get_prev_position() :
unknown's avatar
unknown committed
5423 5424
                        (ulong) 0)));

5425 5426
  if (cache_mngr == NULL ||
      cache_mngr->trx_cache.get_prev_position() == MY_OFF_T_UNDEF)
5427
  {
5428
    this->binlog_set_stmt_begin();
5429
    if (in_multi_stmt_transaction_mode())
unknown's avatar
unknown committed
5430 5431
      trans_register_ha(this, TRUE, binlog_hton);
    trans_register_ha(this, FALSE, binlog_hton);
5432 5433 5434 5435 5436 5437 5438 5439 5440 5441
    /*
      Mark statement transaction as read/write. We never start
      a binary log transaction and keep it read-only,
      therefore it's best to mark the transaction read/write just
      at the same time we start it.
      Not necessary to mark the normal transaction read/write
      since the statement-level flag will be propagated automatically
      inside ha_commit_trans.
    */
    ha_data[binlog_hton->slot].ha_info[0].set_trx_read_write();
5442 5443 5444 5445
  }
  DBUG_VOID_RETURN;
}

5446
void THD::binlog_set_stmt_begin() {
5447 5448
  binlog_cache_mngr *cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
5449 5450

  /*
5451
    The call to binlog_trans_log_savepos() might create the cache_mngr
5452 5453
    structure, if it didn't exist before, so we save the position
    into an auto variable and then write it into the transaction
5454
    data for the binary log (i.e., cache_mngr).
5455 5456 5457
  */
  my_off_t pos= 0;
  binlog_trans_log_savepos(this, &pos);
5458 5459
  cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);
  cache_mngr->trx_cache.set_prev_position(pos);
5460 5461
}

5462 5463 5464 5465 5466 5467
/**
  Copy the last commit position of the binary log into the thread's
  binlog cache manager and register the binlog in the transaction.

  @param hton  Handlerton to register with trans_register_ha().
  @param thd   Thread starting the snapshot.

  @retval 0  ok
  @retval 1  the binlog cache manager could not be set up
*/
static int
binlog_start_consistent_snapshot(handlerton *hton, THD *thd)
{
  DBUG_ENTER("binlog_start_consistent_snapshot");

  binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();

  /* Server layer calls us with LOCK_commit_ordered locked, so this is safe. */
  mysql_mutex_assert_owner(&LOCK_commit_ordered);

  /* binlog_setup_trx_data() returns 0 when it fails to set things up. */
  if (!cache_mngr)
    DBUG_RETURN(1);

  strmake_buf(cache_mngr->last_commit_pos_file, mysql_bin_log.last_commit_pos_file);
  cache_mngr->last_commit_pos_offset= mysql_bin_log.last_commit_pos_offset;

  trans_register_ha(thd, TRUE, hton);

  DBUG_RETURN(0);
}
5479

5480 5481 5482 5483 5484
/**
  This function writes a table map to the binary log. 
  Note that in order to keep the signature uniform with related methods,
  we use a redundant parameter to indicate whether a transactional table
  was changed or not.
5485

Sergei Golubchik's avatar
Sergei Golubchik committed
5486 5487
  If with_annotate != NULL and
  *with_annotate = TRUE write also Annotate_rows before the table map.
5488 5489 5490 5491 5492 5493 5494
 
  @param table             a pointer to the table.
  @param is_transactional  @c true indicates a transactional table,
                           otherwise @c false a non-transactional.
  @return
    nonzero if an error pops up when writing the table map event.
*/
Sergei Golubchik's avatar
Sergei Golubchik committed
5495
int THD::binlog_write_table_map(TABLE *table, bool is_transactional,
5496
                                my_bool *with_annotate)
5497
{
5498
  int error;
5499
  DBUG_ENTER("THD::binlog_write_table_map");
unknown's avatar
unknown committed
5500 5501
  DBUG_PRINT("enter", ("table: 0x%lx  (%s: #%lu)",
                       (long) table, table->s->table_name.str,
5502
                       table->s->table_map_id));
5503

5504 5505 5506 5507
  /* Ensure that all events in a GTID group are in the same cache */
  if (variables.option_bits & OPTION_GTID_BEGIN)
    is_transactional= 1;
  
5508
  /* Pre-conditions */
5509
  DBUG_ASSERT(is_current_stmt_binlog_format_row());
5510
  DBUG_ASSERT(WSREP_EMULATE_BINLOG(this) || mysql_bin_log.is_open());
5511 5512 5513
  DBUG_ASSERT(table->s->table_map_id != ULONG_MAX);

  Table_map_log_event
5514
    the_event(this, table, table->s->table_map_id, is_transactional);
5515

5516
  if (binlog_table_maps == 0)
5517
    binlog_start_trans_and_stmt();
5518

5519 5520 5521
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);

5522 5523
  IO_CACHE *file=
    cache_mngr->get_binlog_cache_log(use_trans_cache(this, is_transactional));
Sergei Golubchik's avatar
Sergei Golubchik committed
5524 5525
  if (with_annotate && *with_annotate)
  {
5526
    Annotate_rows_log_event anno(table->in_use, is_transactional, false);
Sergei Golubchik's avatar
Sergei Golubchik committed
5527 5528 5529 5530 5531
    /* Annotate event should be written not more than once */
    *with_annotate= 0;
    if ((error= anno.write(file)))
      DBUG_RETURN(error);
  }
5532
  if ((error= the_event.write(file)))
5533 5534
    DBUG_RETURN(error);

5535
  binlog_table_maps++;
5536 5537 5538
  DBUG_RETURN(0);
}

5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549
/**
  This function retrieves a pending row event from a cache which is
  specified through the parameter @c is_transactional. Respectively, when it
  is @c true, the pending event is returned from the transactional cache.
  Otherwise from the non-transactional cache.

  @param is_transactional  @c true indicates a transactional cache,
                           otherwise @c false a non-transactional.
  @return
    The row event if any. 
*/
5550
Rows_log_event*
5551
THD::binlog_get_pending_rows_event(bool is_transactional) const
5552
{
5553 5554 5555 5556
  Rows_log_event* rows= NULL;
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(this, binlog_hton);

5557
  /*
5558 5559 5560
    This is less than ideal, but here's the story: If there is no cache_mngr,
    prepare_pending_rows_event() has never been called (since the cache_mngr
    is set up there). In that case, we just return NULL.
5561
   */
5562 5563 5564
  if (cache_mngr)
  {
    binlog_cache_data *cache_data=
5565
      cache_mngr->get_binlog_cache_data(use_trans_cache(this, is_transactional));
5566 5567 5568 5569

    rows= cache_data->pending();
  }
  return (rows);
5570 5571
}

5572 5573 5574 5575 5576 5577 5578 5579 5580 5581
/**
  This function stores a pending row event into a cache which is specified
  through the parameter @c is_transactional. Respectively, when it is @c
  true, the pending event is stored into the transactional cache. Otherwise
  into the non-transactional cache.

  @param evt               a pointer to the row event.
  @param is_transactional  @c true indicates a transactional cache,
                           otherwise @c false a non-transactional.
*/
5582
void
5583
THD::binlog_set_pending_rows_event(Rows_log_event* ev, bool is_transactional)
5584
{
5585
  binlog_cache_mngr *const cache_mngr= binlog_setup_trx_data();
5586 5587

  DBUG_ASSERT(cache_mngr);
5588

5589
  binlog_cache_data *cache_data=
5590
    cache_mngr->get_binlog_cache_data(use_trans_cache(this, is_transactional));
5591

5592
  cache_data->set_pending(ev);
5593 5594 5595
}


5596
/**
5597 5598
  This function removes the pending rows event, discarding any outstanding
  rows. If there is no pending rows event available, this is effectively a
5599
  no-op.
5600 5601 5602 5603 5604

  @param thd               a pointer to the user thread.
  @param is_transactional  @c true indicates a transactional cache,
                           otherwise @c false a non-transactional.
*/
5605
int
5606
MYSQL_BIN_LOG::remove_pending_rows_event(THD *thd, bool is_transactional)
5607
{
5608
  DBUG_ENTER("MYSQL_BIN_LOG::remove_pending_rows_event");
5609

5610 5611 5612 5613
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  DBUG_ASSERT(cache_mngr);
5614

5615
  binlog_cache_data *cache_data=
5616
    cache_mngr->get_binlog_cache_data(use_trans_cache(thd, is_transactional));
5617

5618
  if (Rows_log_event* pending= cache_data->pending())
5619 5620
  {
    delete pending;
5621
    cache_data->set_pending(NULL);
5622 5623 5624 5625 5626
  }

  DBUG_RETURN(0);
}

5627
/*
5628 5629 5630 5631 5632 5633 5634 5635
  Moves the last bunch of rows from the pending Rows event to a cache (either
  transactional cache if is_transaction is @c true, or the non-transactional
  cache otherwise. Sets a new pending event.

  @param thd               a pointer to the user thread.
  @param evt               a pointer to the row event.
  @param is_transactional  @c true indicates a transactional cache,
                           otherwise @c false a non-transactional.
5636
*/
5637 5638
int
MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
5639 5640
                                                Rows_log_event* event,
                                                bool is_transactional)
5641
{
5642
  DBUG_ENTER("MYSQL_BIN_LOG::flush_and_set_pending_rows_event(event)");
5643
  DBUG_ASSERT(WSREP_EMULATE_BINLOG(thd) || mysql_bin_log.is_open());
unknown's avatar
unknown committed
5644
  DBUG_PRINT("enter", ("event: 0x%lx", (long) event));
5645 5646

  int error= 0;
5647 5648
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
5649

5650
  DBUG_ASSERT(cache_mngr);
5651

5652
  binlog_cache_data *cache_data=
5653
    cache_mngr->get_binlog_cache_data(use_trans_cache(thd, is_transactional));
5654

5655
  DBUG_PRINT("info", ("cache_mngr->pending(): 0x%lx", (long) cache_data->pending()));
5656

5657
  if (Rows_log_event* pending= cache_data->pending())
5658
  {
5659
    IO_CACHE *file= &cache_data->cache_log;
5660 5661

    /*
5662
      Write pending event to the cache.
5663
    */
Sergei Golubchik's avatar
Sergei Golubchik committed
5664 5665
    DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
                    {DBUG_SET("+d,simulate_file_write_error");});
5666 5667
    if (pending->write(file))
    {
5668
      set_write_error(thd, is_transactional);
5669
      if (check_write_error(thd) && cache_data &&
5670
          stmt_has_updated_non_trans_table(thd))
5671
        cache_data->set_incident();
Sergei Golubchik's avatar
Sergei Golubchik committed
5672 5673 5674 5675
      delete pending;
      cache_data->set_pending(NULL);
      DBUG_EXECUTE_IF("simulate_disk_full_at_flush_pending",
                      {DBUG_SET("-d,simulate_file_write_error");});
5676 5677 5678 5679 5680 5681
      DBUG_RETURN(1);
    }

    delete pending;
  }

5682
  thd->binlog_set_pending_rows_event(event, is_transactional);
5683 5684 5685 5686

  DBUG_RETURN(error);
}

5687 5688

/* Generate a new global transaction ID, and write it to the binlog */
5689

5690 5691
bool
MYSQL_BIN_LOG::write_gtid_event(THD *thd, bool standalone,
5692
                                bool is_transactional, uint64 commit_id)
5693 5694
{
  rpl_gtid gtid;
5695
  uint32 domain_id;
5696
  uint32 local_server_id;
5697
  uint64 seq_no;
5698
  int err;
5699 5700
  DBUG_ENTER("write_gtid_event");
  DBUG_PRINT("enter", ("standalone: %d", standalone));
5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711

#ifdef WITH_WSREP
  if (WSREP(thd) && thd->wsrep_trx_meta.gtid.seqno != -1 && wsrep_gtid_mode)
  {
    domain_id= wsrep_gtid_domain_id;
  } else {
#endif /* WITH_WSREP */
  domain_id= thd->variables.gtid_domain_id;
#ifdef WITH_WSREP
  }
#endif /* WITH_WSREP */
5712
  local_server_id= thd->variables.server_id;
5713 5714
  seq_no= thd->variables.gtid_seq_no;

5715 5716 5717 5718 5719 5720 5721
  if (thd->variables.option_bits & OPTION_GTID_BEGIN)
  {
    DBUG_PRINT("error", ("OPTION_GTID_BEGIN is set. "
                         "Master and slave will have different GTID values"));
    /* Reset the flag, as we will write out a GTID anyway */
    thd->variables.option_bits&= ~OPTION_GTID_BEGIN;
  }
5722 5723 5724 5725 5726 5727 5728 5729

  /*
    Reset the session variable gtid_seq_no, to reduce the risk of accidentally
    producing a duplicate GTID.
  */
  thd->variables.gtid_seq_no= 0;
  if (seq_no != 0)
  {
5730 5731
    /* Use the specified sequence number. */
    gtid.domain_id= domain_id;
5732
    gtid.server_id= local_server_id;
5733 5734
    gtid.seq_no= seq_no;
    err= rpl_global_gtid_binlog_state.update(&gtid, opt_gtid_strict_mode);
Sergei Golubchik's avatar
Sergei Golubchik committed
5735
    if (err && thd->get_stmt_da()->sql_errno()==ER_GTID_STRICT_OUT_OF_ORDER)
5736
      errno= ER_GTID_STRICT_OUT_OF_ORDER;
5737 5738 5739
  }
  else
  {
5740 5741
    /* Allocate the next sequence number for the GTID. */
    err= rpl_global_gtid_binlog_state.update_with_next_gtid(domain_id,
5742
                                                            local_server_id, &gtid);
5743
    seq_no= gtid.seq_no;
5744
  }
5745
  if (err)
5746
    DBUG_RETURN(true);
5747
  thd->last_commit_gtid= gtid;
5748

5749
  Gtid_log_event gtid_event(thd, seq_no, domain_id, standalone,
5750 5751
                            LOG_EVENT_SUPPRESS_USE_F, is_transactional,
                            commit_id);
5752 5753 5754

  /* Write the event to the binary log. */
  if (gtid_event.write(&mysql_bin_log.log_file))
5755
    DBUG_RETURN(true);
5756 5757
  status_var_add(thd->status_var.binlog_bytes_written, gtid_event.data_written);

5758
  DBUG_RETURN(false);
5759 5760 5761
}


5762 5763 5764 5765 5766 5767 5768 5769
int
MYSQL_BIN_LOG::write_state_to_file()
{
  File file_no;
  IO_CACHE cache;
  char buf[FN_REFLEN];
  int err;
  bool opened= false;
5770
  bool log_inited= false;
5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784

  fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
            MY_UNPACK_FILENAME);
  if ((file_no= mysql_file_open(key_file_binlog_state, buf,
                                O_RDWR|O_CREAT|O_TRUNC|O_BINARY,
                                MYF(MY_WME))) < 0)
  {
    err= 1;
    goto err;
  }
  opened= true;
  if ((err= init_io_cache(&cache, file_no, IO_SIZE, WRITE_CACHE, 0, 0,
                           MYF(MY_WME|MY_WAIT_IF_FULL))))
    goto err;
5785
  log_inited= true;
5786 5787
  if ((err= rpl_global_gtid_binlog_state.write_to_iocache(&cache)))
    goto err;
5788
  log_inited= false;
5789 5790 5791 5792 5793 5794 5795 5796
  if ((err= end_io_cache(&cache)))
    goto err;
  if ((err= mysql_file_sync(file_no, MYF(MY_WME|MY_SYNC_FILESIZE))))
    goto err;
  goto end;

err:
  sql_print_error("Error writing binlog state to file '%s'.\n", buf);
5797
  if (log_inited)
5798 5799 5800 5801 5802 5803 5804 5805 5806
    end_io_cache(&cache);
end:
  if (opened)
    mysql_file_close(file_no, MYF(0));

  return err;
}


5807 5808 5809 5810 5811 5812 5813 5814
/*
  Initialize the binlog state from the master-bin.state file, at server startup.

  Returns:
    0 for success.
    2 for when .state file did not exist.
    1 for other error.
*/
5815 5816 5817 5818 5819 5820 5821 5822
int
MYSQL_BIN_LOG::read_state_from_file()
{
  File file_no;
  IO_CACHE cache;
  char buf[FN_REFLEN];
  int err;
  bool opened= false;
5823
  bool log_inited= false;
5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840

  fn_format(buf, opt_bin_logname, mysql_data_home, ".state",
            MY_UNPACK_FILENAME);
  if ((file_no= mysql_file_open(key_file_binlog_state, buf,
                                O_RDONLY|O_BINARY, MYF(0))) < 0)
  {
    if (my_errno != ENOENT)
    {
      err= 1;
      goto err;
    }
    else
    {
      /*
        If the state file does not exist, this is the first server startup
        with GTID enabled. So initialize to empty state.
      */
unknown's avatar
unknown committed
5841
      rpl_global_gtid_binlog_state.reset();
5842
      err= 2;
5843 5844 5845 5846 5847 5848 5849
      goto end;
    }
  }
  opened= true;
  if ((err= init_io_cache(&cache, file_no, IO_SIZE, READ_CACHE, 0, 0,
                          MYF(MY_WME|MY_WAIT_IF_FULL))))
    goto err;
5850
  log_inited= true;
5851 5852 5853 5854 5855 5856 5857
  if ((err= rpl_global_gtid_binlog_state.read_from_iocache(&cache)))
    goto err;
  goto end;

err:
  sql_print_error("Error reading binlog GTID state from file '%s'.\n", buf);
end:
5858
  if (log_inited)
5859 5860 5861 5862 5863 5864 5865 5866
    end_io_cache(&cache);
  if (opened)
    mysql_file_close(file_no, MYF(0));

  return err;
}


unknown's avatar
unknown committed
5867 5868 5869 5870 5871 5872 5873
/*
  Thin wrapper: forwards both arguments unchanged to
  rpl_global_gtid_binlog_state.get_most_recent_gtid_list() and returns
  its result.
*/
int
MYSQL_BIN_LOG::get_most_recent_gtid_list(rpl_gtid **list, uint32 *size)
{
  const int res=
    rpl_global_gtid_binlog_state.get_most_recent_gtid_list(list, size);
  return res;
}


unknown's avatar
unknown committed
5874 5875 5876
/*
  Thin wrapper: forwards str to rpl_global_gtid_binlog_state.append_pos()
  and returns its result.
*/
bool
MYSQL_BIN_LOG::append_state_pos(String *str)
{
  const bool res= rpl_global_gtid_binlog_state.append_pos(str);
  return res;
}


unknown's avatar
unknown committed
5881 5882 5883
/*
  Thin wrapper: forwards str to rpl_global_gtid_binlog_state.append_state()
  and returns its result.
*/
bool
MYSQL_BIN_LOG::append_state(String *str)
{
  const bool res= rpl_global_gtid_binlog_state.append_state(str);
  return res;
}


bool
MYSQL_BIN_LOG::is_empty_state()
{
5891
  return (rpl_global_gtid_binlog_state.count() == 0);
unknown's avatar
unknown committed
5892 5893 5894
}


unknown's avatar
unknown committed
5895
bool
5896
MYSQL_BIN_LOG::find_in_binlog_state(uint32 domain_id, uint32 server_id_arg,
unknown's avatar
unknown committed
5897 5898 5899
                                    rpl_gtid *out_gtid)
{
  rpl_gtid *gtid;
5900
  if ((gtid= rpl_global_gtid_binlog_state.find(domain_id, server_id_arg)))
unknown's avatar
unknown committed
5901 5902 5903 5904 5905 5906 5907 5908 5909
    *out_gtid= *gtid;
  return gtid != NULL;
}


/*
  Look up the entry that rpl_global_gtid_binlog_state.find_most_recent()
  reports for domain_id.

  Returns true and copies that GTID into *out_gtid when an entry exists;
  returns false and leaves *out_gtid untouched otherwise.
*/
bool
MYSQL_BIN_LOG::lookup_domain_in_binlog_state(uint32 domain_id,
                                             rpl_gtid *out_gtid)
{
  rpl_gtid *latest= rpl_global_gtid_binlog_state.find_most_recent(domain_id);
  if (latest == NULL)
    return false;

  *out_gtid= *latest;
  return true;
}


5922 5923
/*
  Thin wrapper: forwards both arguments unchanged to
  rpl_global_gtid_binlog_state.bump_seq_no_if_needed() and returns its result.
*/
int
MYSQL_BIN_LOG::bump_seq_no_counter_if_needed(uint32 domain_id, uint64 seq_no)
{
  const int res=
    rpl_global_gtid_binlog_state.bump_seq_no_if_needed(domain_id, seq_no);
  return res;
}


bool
5930 5931
MYSQL_BIN_LOG::check_strict_gtid_sequence(uint32 domain_id,
                                          uint32 server_id_arg,
5932
                                          uint64 seq_no)
unknown's avatar
unknown committed
5933
{
5934
  return rpl_global_gtid_binlog_state.check_strict_sequence(domain_id,
5935 5936
                                                            server_id_arg,
                                                            seq_no);
unknown's avatar
unknown committed
5937 5938 5939
}


unknown's avatar
unknown committed
5940
/**
5941 5942 5943
  Write an event to the binary log. If with_annotate != NULL and
  *with_annotate = TRUE write also Annotate_rows before the event
  (this should happen only if the event is a Table_map).
5944 5945
*/

5946
bool MYSQL_BIN_LOG::write(Log_event *event_info, my_bool *with_annotate)
unknown's avatar
unknown committed
5947
{
5948 5949
  THD *thd= event_info->thd;
  bool error= 1;
5950
  binlog_cache_data *cache_data= 0;
5951
  bool is_trans_cache= FALSE;
unknown's avatar
unknown committed
5952
  bool using_trans= event_info->use_trans_cache();
5953
  bool direct= event_info->use_direct_logging();
5954
  ulong UNINIT_VAR(prev_binlog_id);
5955
  DBUG_ENTER("MYSQL_BIN_LOG::write(Log_event *)");
5956

5957 5958 5959
  /*
    When binary logging is not enabled (--log-bin=0), wsrep-patch partially
    enables it without opening the binlog file (MSQL_BIN_LOG::open().
5960
    So, avoid writing to binlog file.
5961
  */
5962 5963 5964 5965
  if (direct &&
      (wsrep_emulate_bin_log ||
       (WSREP(thd) && !(thd->variables.option_bits & OPTION_BIN_LOG))))
    DBUG_RETURN(0);
5966

5967 5968 5969 5970 5971 5972 5973 5974
  if (thd->variables.option_bits & OPTION_GTID_BEGIN)
  {
    DBUG_PRINT("info", ("OPTION_GTID_BEGIN was set"));
    /* Wait for commit from binary log before we commit */
    direct= 0;
    using_trans= 1;
  }

5975 5976
  if (thd->binlog_evt_union.do_union)
  {
5977 5978 5979 5980
    /*
      In Stored function; Remember that function call caused an update.
      We will log the function call to the binary log on function exit
    */
5981
    thd->binlog_evt_union.unioned_events= TRUE;
unknown's avatar
unknown committed
5982
    thd->binlog_evt_union.unioned_events_trans |= using_trans;
5983 5984
    DBUG_RETURN(0);
  }
5985 5986

  /*
5987 5988 5989
    We only end the statement if we are in a top-level statement.  If
    we are inside a stored function, we do not end the statement since
    this will close all tables on the slave.
5990
  */
5991
  bool const end_stmt=
Konstantin Osipov's avatar
Konstantin Osipov committed
5992
    thd->locked_tables_mode && thd->lex->requires_prelocking();
unknown's avatar
unknown committed
5993
  if (thd->binlog_flush_pending_rows_event(end_stmt, using_trans))
5994
    DBUG_RETURN(error);
5995

5996
  /*
unknown's avatar
unknown committed
5997 5998 5999 6000
     In most cases this is only called if 'is_open()' is true; in fact this is
     mostly called if is_open() *was* true a few instructions before, but it
     could have changed since.
  */
6001
  /* applier and replayer can skip writing binlog events */
6002 6003
  if ((WSREP_EMULATE_BINLOG(thd) &&
       IF_WSREP(thd->wsrep_exec_mode != REPL_RECV, 0)) || is_open())
unknown's avatar
unknown committed
6004
  {
6005
    my_off_t UNINIT_VAR(my_org_b_tell);
unknown's avatar
SCRUM  
unknown committed
6006
#ifdef HAVE_REPLICATION
6007
    /*
6008 6009 6010
      In the future we need to add to the following if tests like
      "do the involved tables match (to be implemented)
      binlog_[wild_]{do|ignore}_table?" (WL#1049)"
unknown's avatar
unknown committed
6011
    */
6012
    const char *local_db= event_info->get_db();
6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023

    bool option_bin_log_flag= (thd->variables.option_bits & OPTION_BIN_LOG);

    /*
      Log all updates to binlog cache so that they can get replicated to other
      nodes. A check has been added to stop them from getting logged into
      binary log files.
    */
    if (WSREP(thd)) option_bin_log_flag= true;

    if ((!(option_bin_log_flag)) ||
6024 6025 6026
	(thd->lex->sql_command != SQLCOM_ROLLBACK_TO_SAVEPOINT &&
         thd->lex->sql_command != SQLCOM_SAVEPOINT &&
         !binlog_filter->db_ok(local_db)))
6027
      DBUG_RETURN(0);
unknown's avatar
SCRUM  
unknown committed
6028
#endif /* HAVE_REPLICATION */
6029

6030 6031
    IO_CACHE *file= NULL;

unknown's avatar
unknown committed
6032
    if (direct)
6033
    {
6034
      int res;
6035
      uint64 commit_id= 0;
6036
      DBUG_PRINT("info", ("direct is set"));
6037 6038
      if ((res= thd->wait_for_prior_commit()))
        DBUG_RETURN(res);
6039
      file= &log_file;
6040
      my_org_b_tell= my_b_tell(file);
6041
      mysql_mutex_lock(&LOCK_log);
6042
      prev_binlog_id= current_binlog_id;
6043 6044
      DBUG_EXECUTE_IF("binlog_force_commit_id",
        {
6045
          const LEX_STRING commit_name= { C_STRING_WITH_LEN("commit_id") };
6046 6047 6048
          bool null_value;
          user_var_entry *entry=
            (user_var_entry*) my_hash_search(&thd->user_vars,
6049 6050
                                             (uchar*) commit_name.str,
                                             commit_name.length);
6051 6052 6053
          commit_id= entry->val_int(&null_value);
        });
      if (write_gtid_event(thd, true, using_trans, commit_id))
6054
        goto err;
6055 6056
    }
    else
6057
    {
6058 6059
      binlog_cache_mngr *const cache_mngr= thd->binlog_setup_trx_data();
      if (!cache_mngr)
6060
        goto err;
6061

unknown's avatar
unknown committed
6062
      is_trans_cache= use_trans_cache(thd, using_trans);
6063 6064
      file= cache_mngr->get_binlog_cache_log(is_trans_cache);
      cache_data= cache_mngr->get_binlog_cache_data(is_trans_cache);
Konstantin Osipov's avatar
Konstantin Osipov committed
6065

6066
      if (thd->lex->stmt_accessed_non_trans_temp_table())
6067 6068
        cache_data->set_changes_to_non_trans_temp_table();

6069
      thd->binlog_start_trans_and_stmt();
6070
    }
6071
    DBUG_PRINT("info",("event type: %d",event_info->get_type_code()));
6072

unknown's avatar
unknown committed
6073
    /*
6074 6075
       No check for auto events flag here - this write method should
       never be called if auto-events are enabled.
unknown's avatar
unknown committed
6076

6077 6078 6079
       Write first log events which describe the 'run environment'
       of the SQL command. If row-based binlogging, Insert_id, Rand
       and other kind of "setting context" events are not needed.
6080
    */
unknown's avatar
unknown committed
6081

6082 6083 6084
    if (with_annotate && *with_annotate)
    {
      DBUG_ASSERT(event_info->get_type_code() == TABLE_MAP_EVENT);
unknown's avatar
unknown committed
6085
      Annotate_rows_log_event anno(thd, using_trans, direct);
6086 6087 6088 6089 6090 6091
      /* Annotate event should be written not more than once */
      *with_annotate= 0;
      if (anno.write(file))
        goto err;
    }

Sergei Golubchik's avatar
Sergei Golubchik committed
6092
    if (thd)
6093
    {
6094
      if (!thd->is_current_stmt_binlog_format_row())
6095
      {
6096

6097
        if (thd->stmt_depends_on_first_successful_insert_id_in_prev_stmt)
6098 6099
        {
          Intvar_log_event e(thd,(uchar) LAST_INSERT_ID_EVENT,
unknown's avatar
unknown committed
6100 6101
                             thd->first_successful_insert_id_in_prev_stmt_for_binlog,
                             using_trans, direct);
6102 6103 6104
          if (e.write(file))
            goto err;
        }
6105
        if (thd->auto_inc_intervals_in_cur_stmt_for_binlog.nb_elements() > 0)
6106
        {
unknown's avatar
unknown committed
6107 6108 6109 6110 6111
          DBUG_PRINT("info",("number of auto_inc intervals: %u",
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
                             nb_elements()));
          Intvar_log_event e(thd, (uchar) INSERT_ID_EVENT,
                             thd->auto_inc_intervals_in_cur_stmt_for_binlog.
unknown's avatar
unknown committed
6112
                             minimum(), using_trans, direct);
6113 6114 6115 6116 6117
          if (e.write(file))
            goto err;
        }
        if (thd->rand_used)
        {
unknown's avatar
unknown committed
6118 6119
          Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2,
                           using_trans, direct);
6120 6121 6122 6123 6124 6125 6126 6127
          if (e.write(file))
            goto err;
        }
        if (thd->user_var_events.elements)
        {
          for (uint i= 0; i < thd->user_var_events.elements; i++)
          {
            BINLOG_USER_VAR_EVENT *user_var_event;
6128
            get_dynamic(&thd->user_var_events,(uchar*) &user_var_event, i);
6129 6130 6131

            /* setting flags for user var log event */
            uchar flags= User_var_log_event::UNDEF_F;
6132
            if (user_var_event->unsigned_flag)
6133 6134
              flags|= User_var_log_event::UNSIGNED_F;

6135 6136 6137 6138 6139
            User_var_log_event e(thd, user_var_event->user_var_event->name.str,
                                 user_var_event->user_var_event->name.length,
                                 user_var_event->value,
                                 user_var_event->length,
                                 user_var_event->type,
6140
                                 user_var_event->charset_number,
unknown's avatar
unknown committed
6141 6142 6143
                                 flags,
                                 using_trans,
                                 direct);
6144 6145 6146 6147
            if (e.write(file))
              goto err;
          }
        }
unknown's avatar
unknown committed
6148
      }
6149
    }
unknown's avatar
unknown committed
6150

6151
    /*
6152 6153 6154
      Write the event.
    */
    if (event_info->write(file) ||
6155
        DBUG_EVALUATE_IF("injecting_fault_writing", 1, 0))
6156
      goto err;
unknown's avatar
unknown committed
6157

6158 6159
    error= 0;
err:
unknown's avatar
unknown committed
6160
    if (direct)
unknown's avatar
unknown committed
6161
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
6162
      my_off_t offset= my_b_tell(file);
Sergei Golubchik's avatar
Sergei Golubchik committed
6163
      bool check_purge= false;
6164

6165 6166 6167
      if (!error)
      {
        bool synced;
6168

6169
        if ((error= flush_and_sync(&synced)))
6170 6171 6172 6173
        {
        }
        else
        {
6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188
          mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
          mysql_mutex_assert_owner(&LOCK_log);
          mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
          mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
          bool first= true;
          bool last= true;
          if ((error= RUN_HOOK(binlog_storage, after_flush,
                               (thd, log_file_name, file->pos_in_file,
                                synced, first, last))))
          {
            sql_print_error("Failed to run 'after_flush' hooks");
            error= 1;
          }
          else
          {
6189 6190 6191 6192 6193 6194 6195 6196
            /* update binlog_end_pos so it can be read by dump thread
             *
             * note: must be _after_ the RUN_HOOK(after_flush) or else
             * semi-sync-plugin might not have put the transaction into
             * it's list before dump-thread tries to send it
             */
            update_binlog_end_pos(offset);

6197 6198 6199 6200
            signal_update();
            if ((error= rotate(false, &check_purge)))
              check_purge= false;
          }
6201
        }
He Zhenxing's avatar
He Zhenxing committed
6202
      }
Sergei Golubchik's avatar
Sergei Golubchik committed
6203

Sergei Golubchik's avatar
Sergei Golubchik committed
6204 6205 6206
      status_var_add(thd->status_var.binlog_bytes_written,
                     offset - my_org_b_tell);

6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223
      mysql_mutex_lock(&LOCK_after_binlog_sync);
      mysql_mutex_unlock(&LOCK_log);

      mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
      mysql_mutex_assert_not_owner(&LOCK_log);
      mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
      mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
      bool first= true;
      bool last= true;
      if (RUN_HOOK(binlog_storage, after_sync,
                   (thd, log_file_name, file->pos_in_file,
                    first, last)))
      {
        error=1;
        /* error is already printed inside hook */
      }

unknown's avatar
unknown committed
6224 6225 6226 6227
      /*
        Take mutex to protect against a reader seeing partial writes of 64-bit
        offset on 32-bit CPUs.
      */
Sergei Golubchik's avatar
Sergei Golubchik committed
6228
      mysql_mutex_lock(&LOCK_commit_ordered);
6229
      mysql_mutex_unlock(&LOCK_after_binlog_sync);
unknown's avatar
unknown committed
6230
      last_commit_pos_offset= offset;
Sergei Golubchik's avatar
Sergei Golubchik committed
6231
      mysql_mutex_unlock(&LOCK_commit_ordered);
Sergei Golubchik's avatar
Sergei Golubchik committed
6232 6233

      if (check_purge)
6234
        checkpoint_and_purge(prev_binlog_id);
6235 6236
    }

6237
    if (error)
6238
    {
6239
      set_write_error(thd, is_trans_cache);
6240
      if (check_write_error(thd) && cache_data &&
6241
          stmt_has_updated_non_trans_table(thd))
6242 6243
        cache_data->set_incident();
    }
6244
  }
6245

6246
  DBUG_RETURN(error);
6247 6248
}

6249 6250 6251 6252 6253 6254 6255 6256 6257

/*
  Free-function entry point for error logging: forwards level, format and
  args unchanged to logger.error_log_print() and returns its result.
*/
int error_log_print(enum loglevel level, const char *format,
                    va_list args)
{
  const int rc= logger.error_log_print(level, format, args);
  return rc;
}


bool slow_log_print(THD *thd, const char *query, uint query_length,
6258
                    ulonglong current_utime)
6259
{
6260
  return logger.slow_log_print(thd, query, query_length, current_utime);
6261 6262 6263
}


6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274
/*
  Decide whether a command of the given type should go to the general log
  for this session.

  Returns TRUE when at least one general log handler is enabled, the command
  type is selected in what_to_log, and logging is not switched off for the
  session. With access checks compiled in, OPTION_LOG_OFF only suppresses
  logging when the session has SUPER_ACL. Returns FALSE otherwise.
*/
bool LOGGER::log_command(THD *thd, enum enum_server_command command)
{
  /*
    Log command if we have at least one log event handler enabled and want
    to log this kind of commands
  */
  if (!*general_log_handler_list || !(what_to_log & (1L << (uint) command)))
    return FALSE;

  /* Session did not ask for logging to be off: log it */
  if (!(thd->variables.option_bits & OPTION_LOG_OFF))
    return TRUE;

#ifndef NO_EMBEDDED_ACCESS_CHECKS
  /* OPTION_LOG_OFF is honoured only for sessions holding SUPER_ACL */
  if (!(thd->security_ctx->master_access & SUPER_ACL))
    return TRUE;
#endif

  /* No logging */
  return FALSE;
}


6292 6293 6294 6295 6296 6297
/*
  printf-style entry point for the general log.

  Returns FALSE without formatting anything when logger.log_command() says
  this kind of command is not to be logged; otherwise returns the result of
  logger.general_log_print().
*/
bool general_log_print(THD *thd, enum enum_server_command command,
                       const char *format, ...)
{
  /* Print the message to the buffer if we want to log this kind of commands */
  if (!logger.log_command(thd, command))
    return FALSE;

  va_list ap;
  va_start(ap, format);
  const uint rc= logger.general_log_print(thd, command, format, ap);
  va_end(ap);

  return rc;
}

6309 6310 6311 6312
/*
  Write a query to the general log.

  The write is attempted when logger.log_command() accepts the command or
  when mysql_audit_general_enabled() is true; otherwise FALSE is returned
  without calling the logger.
*/
bool general_log_write(THD *thd, enum enum_server_command command,
                       const char *query, uint query_length)
{
  /* Write the message to the log if we want to log this kind of commands */
  if (!logger.log_command(thd, command) && !mysql_audit_general_enabled())
    return FALSE;

  return logger.general_log_write(thd, command, query, query_length);
}

6319

6320
static void
6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372
binlog_checkpoint_callback(void *cookie)
{
  MYSQL_BIN_LOG::xid_count_per_binlog *entry=
    (MYSQL_BIN_LOG::xid_count_per_binlog *)cookie;
  /*
    For every supporting engine, we increment the xid_count and issue a
    commit_checkpoint_request(). Then we can count when all
    commit_checkpoint_notify() callbacks have occured, and then log a new
    binlog checkpoint event.
  */
  mysql_bin_log.mark_xids_active(entry->binlog_id, 1);
}


/*
  Request a commit checkpoint from each supporting engine.
  This must be called after each binlog rotate, and after LOCK_log has been
  released. The xid_count value in the xid_count_per_binlog entry was
  incremented by 1 and will be decremented in this function; this ensures
  that the entry will not go away early despite LOCK_log not being held.
*/
void
MYSQL_BIN_LOG::do_checkpoint_request(ulong binlog_id)
{
  xid_count_per_binlog *entry;

  /*
    Find the binlog entry, and invoke commit_checkpoint_request() on it in
    each supporting storage engine.
  */
  mysql_mutex_lock(&LOCK_xid_list);
  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
  for (;;)
  {
    entry= it++;
    DBUG_ASSERT(entry /* binlog_id is always somewhere in the list. */);
    if (entry->binlog_id == binlog_id)
      break;
  }
  mysql_mutex_unlock(&LOCK_xid_list);

  ha_commit_checkpoint_request(entry, binlog_checkpoint_callback);

  /*
    When we rotated the binlog, we incremented xid_count to make sure the
    entry would not go away until this point, where we have done all necessary
    commit_checkpoint_request() calls.
    So now we can (and must) decrease the count - when it reaches zero, we
    will know that both all pending unlog() and all pending
    commit_checkpoint_notify() calls are done, and we can log a new binlog
    checkpoint.
  */
  mark_xid_done(binlog_id, true);
}


6373
/**
6374 6375 6376 6377 6378 6379
  The method executes rotation when LOCK_log is already acquired
  by the caller.

  @param force_rotate  caller can request the log rotation
  @param check_purge   is set to true if rotation took place

6380 6381 6382 6383 6384 6385 6386 6387 6388
  @note
    Caller _must_ check the check_purge variable. If this is set, it means
    that the binlog was rotated, and caller _must_ ensure that
    do_checkpoint_request() is called later with the binlog_id of the rotated
    binlog file. The call to do_checkpoint_request() must happen after
    LOCK_log is released (which is why we cannot simply do it here).
    Usually, checkpoint_and_purge() is appropriate, as it will both handle
    the checkpointing and any needed purging of old logs.

6389 6390 6391 6392 6393 6394
  @note
    If rotation fails, for instance the server was unable 
    to create a new log file, we still try to write an 
    incident event to the current log.

  @retval
6395
    nonzero - error in rotating routine.
6396
*/
6397
int MYSQL_BIN_LOG::rotate(bool force_rotate, bool* check_purge)
6398
{
6399
  int error= 0;
6400 6401
  DBUG_ENTER("MYSQL_BIN_LOG::rotate");

6402 6403 6404 6405 6406 6407 6408 6409 6410
  if (wsrep_to_isolation)
  {
    DBUG_ASSERT(WSREP_ON);
    *check_purge= false;
    WSREP_DEBUG("avoiding binlog rotate due to TO isolation: %d", 
                wsrep_to_isolation);
    DBUG_RETURN(0);
  }

6411 6412 6413 6414
  //todo: fix the macro def and restore safe_mutex_assert_owner(&LOCK_log);
  *check_purge= false;

  if (force_rotate || (my_b_tell(&log_file) >= (my_off_t) max_size))
6415
  {
6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434
    ulong binlog_id= current_binlog_id;
    /*
      We rotate the binlog, so we need to start a commit checkpoint in all
      supporting engines - when it finishes, we can log a new binlog checkpoint
      event.

      But we cannot start the checkpoint here - there could be a group commit
      still in progress which needs to be included in the checkpoint, and
      besides we do not want to do the (possibly expensive) checkpoint while
      LOCK_log is held.

      On the other hand, we must be sure that the xid_count entry for the
      previous log does not go away until we start the checkpoint - which it
      could do as it is no longer the most recent. So we increment xid_count
      (to count the pending checkpoint request) - this will fix the entry in
      place until we decrement again in do_checkpoint_request().
    */
    mark_xids_active(binlog_id, 1);

6435
    if ((error= new_file_without_locking()))
6436
    {
6437 6438 6439 6440 6441 6442 6443 6444 6445
      /** 
         Be conservative... There are possible lost events (eg, 
         failing to log the Execute_load_query_log_event
         on a LOAD DATA while using a non-transactional
         table)!

         We give it a shot and try to write an incident event anyway
         to the current log. 
      */
6446
      if (!write_incident_already_locked(current_thd))
Luis Soares's avatar
Luis Soares committed
6447
        flush_and_sync(0);
6448

6449 6450 6451 6452 6453 6454 6455 6456
      /*
        We failed to rotate - so we have to decrement the xid_count back that
        we incremented before attempting the rotate.
      */
      mark_xid_done(binlog_id, false);
    }
    else
      *check_purge= true;
6457
  }
6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468
  DBUG_RETURN(error);
}

/**
  The method executes logs purging routine.

  @retval
    nonzero - error in rotating routine.
*/
void MYSQL_BIN_LOG::purge()
{
Sergei Golubchik's avatar
Sergei Golubchik committed
6469
  mysql_mutex_assert_not_owner(&LOCK_log);
6470
#ifdef HAVE_REPLICATION
6471
  if (expire_logs_days)
6472
  {
6473
    DEBUG_SYNC(current_thd, "at_purge_logs_before_date");
6474 6475
    time_t purge_time= my_time(0) - expire_logs_days*24*60*60;
    if (purge_time >= 0)
6476
    {
6477
      purge_logs_before_date(purge_time);
6478
    }
6479
    DEBUG_SYNC(current_thd, "after_purge_logs_before_date");
6480 6481
  }
#endif
6482 6483
}

6484 6485 6486 6487 6488 6489 6490

/*
  Run do_checkpoint_request() for the given binlog_id, then purge().
  The two calls are always made in that order.
*/
void MYSQL_BIN_LOG::checkpoint_and_purge(ulong binlog_id)
{
  /* First the checkpoint request for the rotated binlog ... */
  do_checkpoint_request(binlog_id);
  /* ... then remove expired logs */
  purge();
}

6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502
/**
  The method is a shortcut of @c rotate() and @c purge().
  LOCK_log is acquired prior to rotate and is released after it.

  @param force_rotate  caller can request the log rotation

  @retval
    nonzero - error in rotating routine.
*/
int MYSQL_BIN_LOG::rotate_and_purge(bool force_rotate)
{
  int error= 0;
6503
  ulong prev_binlog_id;
6504 6505 6506 6507 6508
  DBUG_ENTER("MYSQL_BIN_LOG::rotate_and_purge");
  bool check_purge= false;

  //todo: fix the macro def and restore safe_mutex_assert_not_owner(&LOCK_log);
  mysql_mutex_lock(&LOCK_log);
6509
  prev_binlog_id= current_binlog_id;
Sergei Golubchik's avatar
Sergei Golubchik committed
6510 6511
  if ((error= rotate(force_rotate, &check_purge)))
    check_purge= false;
6512 6513 6514 6515 6516 6517
  /*
    NOTE: Run purge_logs wo/ holding LOCK_log because it does not need
          the mutex. Otherwise causes various deadlocks.
  */
  mysql_mutex_unlock(&LOCK_log);

Sergei Golubchik's avatar
Sergei Golubchik committed
6518
  if (check_purge)
6519
    checkpoint_and_purge(prev_binlog_id);
6520

6521
  DBUG_RETURN(error);
6522
}
6523

6524
/*
  Return the current value of file_id and post-increment it, with the
  read-and-increment done while holding LOCK_log.
*/
uint MYSQL_BIN_LOG::next_file_id()
{
  mysql_mutex_lock(&LOCK_log);
  const uint allocated_id= file_id++;
  mysql_mutex_unlock(&LOCK_log);
  return allocated_id;
}

6533

6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560
/**
  Calculate checksum of possibly a part of an event containing at least
  the whole common header.

  Only the part of the event that lies inside the first @c length bytes of
  the buffer is folded into the checksum.

  @param    buf       the pointer to trans cache's buffer
  @param    off       the offset of the beginning of the event in the buffer
  @param    event_len no-checksum length of the event
  @param    length    the current size of the buffer

  @param    crc       [in-out] the checksum

  @return 0 or number of unprocessed yet bytes of the event excluding
            the checksum part.
*/
static ulong fix_log_event_crc(uchar *buf, uint off, uint event_len,
                               uint length, ha_checksum *crc)
{
  const uint event_end= off + event_len;
  /* Bytes of the event that extend past the end of the buffer, if any */
  const ulong unprocessed= (event_end <= length) ? 0 : event_end - length;

  *crc= my_checksum(*crc, buf + off, event_len - unprocessed);
  return unprocessed;
}

6561 6562 6563 6564 6565
/*
  Write the contents of a cache to the binary log.

  SYNOPSIS
    write_cache()
    thd      Current thread; its binlog_bytes_written status counter is
             increased as data is copied
    cache    Cache to write to the binary log

  DESCRIPTION
    Write the contents of the cache to the binary log. The cache will
    be reset as a READ_CACHE to be able to read the contents from it.
    The caller must hold LOCK_log (asserted on entry).

    The cache is read segment by segment (my_b_fill()). For every event
    header found, end_log_pos is increased by the write position that
    log_file had when the function was entered (variable group), so that
    it becomes absolute instead of relative to the start of the group.

    When binlog_checksum_options is not BINLOG_CHECKSUM_ALG_OFF, the
    length field of each event is additionally increased by
    BINLOG_CHECKSUM_LEN and a checksum is written after the last byte of
    the event. end_log_pos_inc grows by BINLOG_CHECKSUM_LEN for every
    processed header, so that the end_log_pos of later events is shifted
    accordingly. In this mode the data is written piece by piece inside
    the loop; without checksums each segment is written with a single
    my_b_write() call at the end of the iteration.

    State kept across segments:
      hdr_offs  offset of the next event header relative to the current
                segment (it may point beyond the segment for long events)
      carry     number of header bytes saved in header[] when a header
                is cut by a segment border (carry > 0); the header is
                completed and hdr_offs recomputed at the start of the
                next iteration
      remains   bytes of the current event (checksum excluded) that have
                not been checksummed and written yet
      crc       running checksum of the current event

  RETURN
    0                   All data was copied
    ER_ERROR_ON_WRITE   reinit_io_cache() or a write to log_file failed
*/
6577

6578
int MYSQL_BIN_LOG::write_cache(THD *thd, IO_CACHE *cache)
6579
{
Sergei Golubchik's avatar
Sergei Golubchik committed
6580
  mysql_mutex_assert_owner(&LOCK_log);
6581 6582
  if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
    return ER_ERROR_ON_WRITE;
6583
  uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
6584
  ulong remains= 0; // part of unprocessed yet netto length of the event
6585
  long val;
6586
  ulong end_log_pos_inc= 0; // each event processed adds BINLOG_CHECKSUM_LEN 2 t
6587
  uchar header[LOG_EVENT_HEADER_LEN];
Sergei Golubchik's avatar
Sergei Golubchik committed
6588
  ha_checksum crc= 0;
6589 6590
  my_bool do_checksum= (binlog_checksum_options != BINLOG_CHECKSUM_ALG_OFF);
  uchar buf[BINLOG_CHECKSUM_LEN];
6591
  DBUG_ENTER("MYSQL_BIN_LOG::write_cache");
6592 6593 6594 6595

  // while there is just one alg the following must hold:
  DBUG_ASSERT(!do_checksum ||
              binlog_checksum_options == BINLOG_CHECKSUM_ALG_CRC32);
6596

6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610
  /*
    The events in the buffer have incorrect end_log_pos data
    (relative to beginning of group rather than absolute),
    so we'll recalculate them in situ so the binlog is always
    correct, even in the middle of a group. This is possible
    because we now know the start position of the group (the
    offset of this cache in the log, if you will); all we need
    to do is to find all event-headers, and add the position of
    the group to the end_log_pos of each event.  This is pretty
    straight forward, except that we read the cache in segments,
    so an event-header might end up on the cache-border and get
    split.
  */

6611
  group= (uint)my_b_tell(&log_file);
6612
  hdr_offs= carry= 0;
Sergei Golubchik's avatar
Sergei Golubchik committed
6613

6614 6615
  do
  {
6616 6617 6618 6619 6620
    /*
      if we only got a partial header in the last iteration,
      get the other half now and process a full header.
    */
    if (unlikely(carry > 0))
6621 6622 6623
    {
      DBUG_ASSERT(carry < LOG_EVENT_HEADER_LEN);

6624
      /* assemble both halves */
6625 6626
      memcpy(&header[carry], (char *)cache->read_pos,
             LOG_EVENT_HEADER_LEN - carry);
6627

6628
      /* fix end_log_pos */
6629 6630
      val= uint4korr(&header[LOG_POS_OFFSET]) + group +
        (end_log_pos_inc+= (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
6631 6632
      int4store(&header[LOG_POS_OFFSET], val);

6633 6634 6635 6636 6637 6638 6639
      if (do_checksum)
      {
        ulong len= uint4korr(&header[EVENT_LEN_OFFSET]);
        /* fix len */
        int4store(&header[EVENT_LEN_OFFSET], len + BINLOG_CHECKSUM_LEN);
      }

6640
      /* write the first half of the split header */
6641
      if (my_b_write(&log_file, header, carry))
6642
        DBUG_RETURN(ER_ERROR_ON_WRITE);
6643
      status_var_add(thd->status_var.binlog_bytes_written, carry);
6644

6645 6646 6647 6648
      /*
        copy fixed second half of header to cache so the correct
        version will be written later.
      */
6649 6650
      memcpy((char *)cache->read_pos, &header[carry],
             LOG_EVENT_HEADER_LEN - carry);
6651

6652
      /* next event header at ... */
6653 6654
      hdr_offs= uint4korr(&header[EVENT_LEN_OFFSET]) - carry -
        (do_checksum ? BINLOG_CHECKSUM_LEN : 0);
6655

6656 6657
      if (do_checksum)
      {
Sergei Golubchik's avatar
Sergei Golubchik committed
6658
        DBUG_ASSERT(crc == 0 && remains == 0);
6659 6660 6661 6662
        crc= my_checksum(crc, header, carry);
        remains= uint4korr(header + EVENT_LEN_OFFSET) - carry -
          BINLOG_CHECKSUM_LEN;
      }
6663 6664 6665
      carry= 0;
    }

6666 6667
    /* if there is anything to write, process it. */

6668
    if (likely(length > 0))
6669
    {
6670
      /*
6671 6672 6673 6674
        process all event-headers in this (partial) cache.
        if next header is beyond current read-buffer,
        we'll get it later (though not necessarily in the
        very next iteration, just "eventually").
6675
      */
6676

6677 6678 6679 6680
      /* crc-calc the whole buffer */
      if (do_checksum && hdr_offs >= length)
      {

Sergei Golubchik's avatar
Sergei Golubchik committed
6681
        DBUG_ASSERT(remains != 0 && crc != 0);
6682 6683 6684 6685

        crc= my_checksum(crc, cache->read_pos, length); 
        remains -= length;
        if (my_b_write(&log_file, cache->read_pos, length))
6686
          DBUG_RETURN(ER_ERROR_ON_WRITE);
6687 6688 6689 6690
        if (remains == 0)
        {
          int4store(buf, crc);
          if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
6691
            DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
6692
          crc= 0;
6693 6694 6695
        }
      }

6696
      while (hdr_offs < length)
6697
      {
6698 6699 6700 6701
        /*
          partial header only? save what we can get, process once
          we get the rest.
        */
6702

6703 6704 6705 6706 6707 6708 6709 6710
        if (do_checksum)
        {
          if (remains != 0)
          {
            /*
              finish off with remains of the last event that crawls
              from previous into the current buffer
            */
Sergei Golubchik's avatar
Sergei Golubchik committed
6711
            DBUG_ASSERT(crc != 0);
6712 6713 6714 6715 6716 6717
            crc= my_checksum(crc, cache->read_pos, hdr_offs);
            int4store(buf, crc);
            remains -= hdr_offs;
            DBUG_ASSERT(remains == 0);
            if (my_b_write(&log_file, cache->read_pos, hdr_offs) ||
                my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
6718
              DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
6719
            crc= 0;
6720 6721 6722
          }
        }

6723
        if (hdr_offs + LOG_EVENT_HEADER_LEN > length)
6724
        {
6725
          carry= length - hdr_offs;
6726
          memcpy(header, (char *)cache->read_pos + hdr_offs, carry);
6727
          length= hdr_offs;
6728 6729 6730 6731
        }
        else
        {
          /* we've got a full event-header, and it came in one piece */
6732 6733 6734
          uchar *ev= (uchar *)cache->read_pos + hdr_offs;
          uint event_len= uint4korr(ev + EVENT_LEN_OFFSET); // netto len
          uchar *log_pos= ev + LOG_POS_OFFSET;
6735

6736
          /* fix end_log_pos */
6737 6738
          val= uint4korr(log_pos) + group +
            (end_log_pos_inc += (do_checksum ? BINLOG_CHECKSUM_LEN : 0));
6739
          int4store(log_pos, val);
6740

6741 6742 6743 6744 6745 6746 6747 6748 6749
	  /* fix CRC */
	  if (do_checksum)
          {
            /* fix length */
            int4store(ev + EVENT_LEN_OFFSET, event_len + BINLOG_CHECKSUM_LEN);
            remains= fix_log_event_crc(cache->read_pos, hdr_offs, event_len,
                                       length, &crc);
            if (my_b_write(&log_file, ev, 
                           remains == 0 ? event_len : length - hdr_offs))
6750
              DBUG_RETURN(ER_ERROR_ON_WRITE);
6751 6752 6753 6754
            if (remains == 0)
            {
              int4store(buf, crc);
              if (my_b_write(&log_file, buf, BINLOG_CHECKSUM_LEN))
6755
                DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
6756
              crc= 0; // crc is complete
6757 6758 6759
            }
          }

6760
          /* next event header at ... */
6761
          hdr_offs += event_len; // incr by the netto len
6762

6763
          DBUG_ASSERT(!do_checksum || remains == 0 || hdr_offs >= length);
6764
        }
6765
      }
6766 6767

      /*
6768 6769 6770 6771 6772 6773
        Adjust hdr_offs. Note that it may still point beyond the segment
        read in the next iteration; if the current event is very long,
        it may take a couple of read-iterations (and subsequent adjustments
        of hdr_offs) for it to point into the then-current segment.
        If we have a split header (!carry), hdr_offs will be set at the
        beginning of the next iteration, overwriting the value we set here:
6774
      */
6775
      hdr_offs -= length;
6776 6777 6778
    }

    /* Write data to the binary log file */
6779
    DBUG_EXECUTE_IF("fail_binlog_write_1",
6780
                    errno= 28; DBUG_RETURN(ER_ERROR_ON_WRITE););
6781 6782
    if (!do_checksum)
      if (my_b_write(&log_file, cache->read_pos, length))
6783
        DBUG_RETURN(ER_ERROR_ON_WRITE);
6784 6785
    status_var_add(thd->status_var.binlog_bytes_written, length);

6786
  } while ((length= my_b_fill(cache)));
6787

6788
  DBUG_ASSERT(carry == 0);
6789
  DBUG_ASSERT(!do_checksum || remains == 0);
Sergei Golubchik's avatar
Sergei Golubchik committed
6790
  DBUG_ASSERT(!do_checksum || crc == 0);
6791

6792
  DBUG_RETURN(0);                               // All OK
6793 6794
}

6795 6796 6797 6798 6799 6800 6801
/*
  Helper function to get the error code of the query to be binlogged.

  When the caller states that the statement was not killed (or the kill
  reason is KILL_BAD_DATA), the code comes from the statement diagnostics
  area, with kill/shutdown related codes mapped to 0. Otherwise the code
  derived from the kill state of the thread is returned.
 */
int query_error_code(THD *thd, bool not_killed)
{
  if (!not_killed && killed_mask_hard(thd->killed) != KILL_BAD_DATA)
  {
    /* killed status for DELAYED INSERT thread should never be used */
    DBUG_ASSERT(!(thd->system_thread & SYSTEM_THREAD_DELAYED_INSERT));
    return thd->killed_errno();
  }

  if (!thd->is_error())
    return 0;

  const int da_errno= thd->get_stmt_da()->sql_errno();

  /*
    The diagnostics area may hold an error caused by a kill or a shutdown.
    Such errors must not be reported when the caller specified not_killed.
  */
  switch (da_errno)
  {
  case ER_SERVER_SHUTDOWN:
  case ER_QUERY_INTERRUPTED:
  case ER_NEW_ABORTING_CONNECTION:
  case ER_CONNECTION_KILLED:
    return 0;
  default:
    return da_errno;
  }
}

6825 6826

/**
  Write an INCIDENT_LOST_EVENTS incident event to the binary log file.

  No mutex is taken here; write_incident() calls this function while
  holding LOCK_log. Nothing is written when the log is not open.

  @param thd  Thread on whose behalf the incident is logged; its
              binlog_bytes_written status counter is increased.

  @return the result of writing the event (false when the log is not open)
*/
bool MYSQL_BIN_LOG::write_incident_already_locked(THD *thd)
{
  uint error= 0;
  DBUG_ENTER("MYSQL_BIN_LOG::write_incident_already_locked");
  Incident incident= INCIDENT_LOST_EVENTS;
  Incident_log_event ev(thd, incident, write_error_msg);

  if (unlikely(!is_open()))
    DBUG_RETURN(error);

  error= ev.write(&log_file);
  status_var_add(thd->status_var.binlog_bytes_written, ev.data_written);

  DBUG_RETURN(error);
}


/**
  Write an incident event to the binary log, sync it and rotate the log.

  Takes LOCK_log for the duration of the write. When the incident event
  was written and synced, waiting readers are signalled and the log is
  rotated. In all cases where the log is open, the binlog end position
  and last_commit_pos_offset are updated to the current write position.

  @param thd  Thread on whose behalf the incident is logged.

  @return false when the log is not open or everything succeeded,
          true when writing, syncing or rotating failed
*/
bool MYSQL_BIN_LOG::write_incident(THD *thd)
{
  uint error= 0;
  bool check_purge= false;
  DBUG_ENTER("MYSQL_BIN_LOG::write_incident");

  mysql_mutex_lock(&LOCK_log);
  if (unlikely(!is_open()))
  {
    mysql_mutex_unlock(&LOCK_log);
    DBUG_RETURN(error);
  }

  /* Remember which binlog was current before a possible rotation. */
  const ulong prev_binlog_id= current_binlog_id;

  error= write_incident_already_locked(thd);
  if (!error)
    error= flush_and_sync(0);
  if (!error)
  {
    signal_update();
    error= rotate(false, &check_purge);
    if (error)
      check_purge= false;
  }

  const my_off_t offset= my_b_tell(&log_file);
  update_binlog_end_pos(offset);

  /*
    Take mutex to protect against a reader seeing partial writes of 64-bit
    offset on 32-bit CPUs.
  */
  mysql_mutex_lock(&LOCK_commit_ordered);
  last_commit_pos_offset= offset;
  mysql_mutex_unlock(&LOCK_commit_ordered);
  mysql_mutex_unlock(&LOCK_log);

  /* Purging is done only after LOCK_log has been released. */
  if (check_purge)
    checkpoint_and_purge(prev_binlog_id);

  DBUG_RETURN(error);
}

6887
void
6888
MYSQL_BIN_LOG::write_binlog_checkpoint_event_already_locked(const char *name_arg, uint len)
6889
{
6890
  my_off_t offset;
6891
  Binlog_checkpoint_log_event ev(name_arg, len);
6892 6893 6894 6895 6896 6897 6898 6899
  /*
    Note that we must sync the binlog checkpoint to disk.
    Otherwise a subsequent log purge could delete binlogs that XA recovery
    thinks are needed (even though they are not really).
  */
  if (!ev.write(&log_file) && !flush_and_sync(0))
  {
    signal_update();
6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911
  }
  else
  {
    /*
      If we fail to write the checkpoint event, something is probably really
      bad with the binlog. We complain in the error log.

      Note that failure to write binlog checkpoint does not compromise the
      ability to do crash recovery - crash recovery will just have to scan a
      bit more of the binlog than strictly necessary.
    */
    sql_print_error("Failed to write binlog checkpoint event to binary log\n");
6912 6913
  }

6914
  offset= my_b_tell(&log_file);
6915 6916 6917

  update_binlog_end_pos(offset);

6918
  /*
6919 6920
    Take mutex to protect against a reader seeing partial writes of 64-bit
    offset on 32-bit CPUs.
6921
  */
6922 6923 6924
  mysql_mutex_lock(&LOCK_commit_ordered);
  last_commit_pos_offset= offset;
  mysql_mutex_unlock(&LOCK_commit_ordered);
6925 6926 6927
}


unknown's avatar
unknown committed
6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938
/**
  Write a cached log entry to the binary log.
  - To support transaction over replication, we wrap the transaction
  with BEGIN/COMMIT or BEGIN/ROLLBACK in the binary log.
  We want to write a BEGIN/ROLLBACK block when a non-transactional table
  was updated in a transaction which was rolled back. This is to ensure
  that the same updates are run on the slave.

  @param thd
  @param cache		The cache to copy to the binlog
  @param commit_event   The commit event to print after writing the
6939
                        contents of the cache.
6940 6941 6942
  @param incident       Defines if an incident event should be created to
                        notify that some non-transactional changes did
                        not get into the binlog.
unknown's avatar
unknown committed
6943

unknown's avatar
unknown committed
6944 6945 6946 6947 6948 6949
  @note
    We only come here if there is something in the cache.
  @note
    The thing in the cache is always a complete transaction.
  @note
    'cache' needs to be reinitialized after this functions returns.
6950 6951
*/

6952
bool
Sergei Golubchik's avatar
Sergei Golubchik committed
6953 6954 6955 6956 6957
MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd,
                                           binlog_cache_mngr *cache_mngr,
                                           Log_event *end_ev, bool all,
                                           bool using_stmt_cache,
                                           bool using_trx_cache)
6958
{
6959
  group_commit_entry entry;
6960
  Ha_trx_info *ha_info;
6961 6962
  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");

6963 6964
  /*
    Control should not be allowed beyond this point in wsrep_emulate_bin_log
6965 6966
    mode. Also, do not write the cached updates to binlog if binary logging is
    disabled (log-bin/sql_log_bin).
6967
  */
6968 6969
  if (wsrep_emulate_bin_log || !(thd->variables.option_bits & OPTION_BIN_LOG))
    DBUG_RETURN(0);
6970

6971
  entry.thd= thd;
Sergei Golubchik's avatar
Sergei Golubchik committed
6972
  entry.cache_mngr= cache_mngr;
6973 6974
  entry.error= 0;
  entry.all= all;
Sergei Golubchik's avatar
Sergei Golubchik committed
6975 6976
  entry.using_stmt_cache= using_stmt_cache;
  entry.using_trx_cache= using_trx_cache;
6977 6978
  entry.need_unlog= false;
  ha_info= all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
6979

6980 6981 6982 6983 6984 6985 6986
  for (; ha_info; ha_info= ha_info->next())
  {
    if (ha_info->is_started() && ha_info->ht() != binlog_hton &&
        !ha_info->ht()->commit_checkpoint_request)
      entry.need_unlog= true;
    break;
  }
6987

6988
  entry.end_event= end_ev;
Sergei Golubchik's avatar
Sergei Golubchik committed
6989 6990
  if (cache_mngr->stmt_cache.has_incident() ||
      cache_mngr->trx_cache.has_incident())
6991 6992
  {
    Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
6993 6994
    entry.incident_event= &inc_ev;
    DBUG_RETURN(write_transaction_to_binlog_events(&entry));
6995 6996 6997
  }
  else
  {
6998 6999
    entry.incident_event= NULL;
    DBUG_RETURN(write_transaction_to_binlog_events(&entry));
7000 7001
  }
}
7002

unknown's avatar
unknown committed
7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027

/*
  Put a transaction that is ready to commit in the group commit queue.
  The transaction is identified by the ENTRY object passed into this function.

  To facilitate group commit for the binlog, we first queue up ourselves in
  this function. Then later the first thread to enter the queue waits for
  the LOCK_log mutex, and commits for everyone in the queue once it gets the
  lock. Any other threads in the queue just wait for the first one to finish
  the commit and wake them up. This way, all transactions in the queue get
  committed in a single disk operation.

  The main work in this function is when the commit in one transaction has
  been marked to wait for the commit of another transaction to happen
  first. This is used to support in-order parallel replication, where
  transactions can execute out-of-order but need to be committed in-order with
  how they happened on the master. The waiting of one commit on another needs
  to be integrated with the group commit queue, to ensure that the waiting
  transaction can participate in the same group commit as the waited-for
  transaction.

  So when we put a transaction in the queue, we check if there were other
  transactions already prepared to commit but just waiting for the first one
  to commit. If so, we add those to the queue as well, transitively for all
  waiters.

  And if a transaction is marked to wait for a prior transaction, but that
  prior transaction is already queued for group commit, then we can queue the
  new transaction directly to participate in the group commit.

  @param orig_entry  Group commit entry of the calling transaction. Its
                     queued_by_other flag is cleared on entry; it is set to
                     true by the thread that queues the entry on behalf of
                     its owner.

  @retval < 0   Error: the wait for the prior commit was interrupted by a
                kill, or the prior commit reported an error (wakeup_error)
  @retval > 0   If queued as the first entry in the queue (meaning this
                is the leader)
  @retval   0   Otherwise (queued as participant, leader handles the commit)
*/
7037 7038

int
unknown's avatar
unknown committed
7040
MYSQL_BIN_LOG::queue_for_group_commit(group_commit_entry *orig_entry)
7041
{
7042 7043
  group_commit_entry *entry, *orig_queue, *last;
  wait_for_commit *cur;
unknown's avatar
unknown committed
7044
  wait_for_commit *wfc;
unknown's avatar
unknown committed
7045
  DBUG_ENTER("MYSQL_BIN_LOG::queue_for_group_commit");
7046

7047
  /*
unknown's avatar
unknown committed
7048 7049 7050 7051 7052 7053
    Check if we need to wait for another transaction to commit before us.

    It is safe to do a quick check without lock first in the case where we do
    not have to wait. But if the quick check shows we need to wait, we must do
    another safe check under lock, to avoid the race where the other
    transaction wakes us up between the check and the wait.
7054
  */
unknown's avatar
unknown committed
7055 7056
  wfc= orig_entry->thd->wait_for_commit_ptr;
  orig_entry->queued_by_other= false;
unknown's avatar
unknown committed
7057
  if (wfc && wfc->waitee)
unknown's avatar
unknown committed
7058 7059
  {
    mysql_mutex_lock(&wfc->LOCK_wait_commit);
7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070
    /*
      Do an extra check here, this time safely under lock.

      If waitee->commit_started is set, it means that the transaction we need
      to wait for has already queued up for group commit. In this case it is
      safe for us to queue up immediately as well, increasing the opprtunities
      for group commit. Because waitee has taken the LOCK_prepare_ordered
      before setting the flag, so there is no risk that we can queue ahead of
      it.
    */
    if (wfc->waitee && !wfc->waitee->commit_started)
unknown's avatar
unknown committed
7071
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
7072
      PSI_stage_info old_stage;
unknown's avatar
unknown committed
7073 7074
      wait_for_commit *loc_waitee;

unknown's avatar
unknown committed
7075 7076 7077 7078 7079 7080 7081 7082
      /*
        By setting wfc->opaque_pointer to our own entry, we mark that we are
        ready to commit, but waiting for another transaction to commit before
        us.

        This other transaction may then take over the commit process for us to
        get us included in its own group commit. If this happens, the
        queued_by_other flag is set.
7083 7084 7085 7086 7087

        Setting this flag may or may not be seen by the other thread, but we
        are safe in any case: The other thread will set queued_by_other under
        its LOCK_wait_commit, and we will not check queued_by_other only after
        we have been woken up.
unknown's avatar
unknown committed
7088
      */
unknown's avatar
unknown committed
7089
      wfc->opaque_pointer= orig_entry;
unknown's avatar
unknown committed
7090
      DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior");
Sergei Golubchik's avatar
Sergei Golubchik committed
7091 7092 7093 7094
      orig_entry->thd->ENTER_COND(&wfc->COND_wait_commit,
                                  &wfc->LOCK_wait_commit,
                                  &stage_waiting_for_prior_transaction_to_commit,
                                  &old_stage);
unknown's avatar
unknown committed
7095
      while ((loc_waitee= wfc->waitee) && !orig_entry->thd->check_killed())
unknown's avatar
unknown committed
7096 7097
        mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
      wfc->opaque_pointer= NULL;
unknown's avatar
unknown committed
7098 7099
      DBUG_PRINT("info", ("After waiting for prior commit, queued_by_other=%d",
                 orig_entry->queued_by_other));
7100

unknown's avatar
unknown committed
7101
      if (loc_waitee)
7102
      {
7103 7104 7105 7106 7107 7108 7109 7110 7111 7112
        /* Wait terminated due to kill. */
        mysql_mutex_lock(&loc_waitee->LOCK_wait_commit);
        if (loc_waitee->wakeup_subsequent_commits_running ||
            orig_entry->queued_by_other)
        {
          /* Our waitee is already waking us up, so ignore the kill. */
          mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
          do
          {
            mysql_cond_wait(&wfc->COND_wait_commit, &wfc->LOCK_wait_commit);
unknown's avatar
unknown committed
7113
          } while (wfc->waitee);
7114 7115 7116 7117 7118 7119
        }
        else
        {
          /* We were killed, so remove us from the list of waitee. */
          wfc->remove_from_list(&loc_waitee->subsequent_commits_list);
          mysql_mutex_unlock(&loc_waitee->LOCK_wait_commit);
unknown's avatar
unknown committed
7120
          wfc->waitee= NULL;
7121

Sergei Golubchik's avatar
Sergei Golubchik committed
7122
          orig_entry->thd->EXIT_COND(&old_stage);
7123 7124 7125
          /* Interrupted by kill. */
          DEBUG_SYNC(orig_entry->thd, "group_commit_waiting_for_prior_killed");
          wfc->wakeup_error= orig_entry->thd->killed_errno();
7126
          if (!wfc->wakeup_error)
7127
            wfc->wakeup_error= ER_QUERY_INTERRUPTED;
7128 7129
          my_message(wfc->wakeup_error,
                     ER_THD(orig_entry->thd, wfc->wakeup_error), MYF(0));
7130 7131
          DBUG_RETURN(-1);
        }
7132
      }
Sergei Golubchik's avatar
Sergei Golubchik committed
7133
      orig_entry->thd->EXIT_COND(&old_stage);
7134 7135 7136
    }
    else
      mysql_mutex_unlock(&wfc->LOCK_wait_commit);
unknown's avatar
unknown committed
7137
  }
unknown's avatar
unknown committed
7138 7139 7140 7141 7142
  /*
    If the transaction we were waiting for has already put us into the group
    commit queue (and possibly already done the entire binlog commit for us),
    then there is nothing else to do.
  */
unknown's avatar
unknown committed
7143
  if (orig_entry->queued_by_other)
7144
    DBUG_RETURN(0);
unknown's avatar
unknown committed
7145

7146 7147 7148 7149 7150 7151
  if (wfc && wfc->wakeup_error)
  {
    my_error(ER_PRIOR_COMMIT_FAILED, MYF(0));
    DBUG_RETURN(-1);
  }

unknown's avatar
unknown committed
7152
  /* Now enqueue ourselves in the group commit queue. */
unknown's avatar
unknown committed
7153 7154
  DEBUG_SYNC(orig_entry->thd, "commit_before_enqueue");
  orig_entry->thd->clear_wakeup_ready();
Sergei Golubchik's avatar
Sergei Golubchik committed
7155
  mysql_mutex_lock(&LOCK_prepare_ordered);
7156 7157 7158 7159 7160
  orig_queue= group_commit_queue;

  /*
    Iteratively process everything added to the queue, looking for waiters,
    and their waiters, and so on. If a waiter is ready to commit, we
    immediately add it to the queue, and mark it as queued_by_other.

    This would be natural to do with recursion, but we want to avoid
    potentially unbounded recursion blowing the C stack, so we use the list
    approach instead.

    We keep a list of the group_commit_entry of all the waiters that need to
    be processed. Initially this list contains only the entry passed into this
    function.

    We process entries in the list one by one. The element currently being
    processed is pointed to by `entry`, and the element at the end of the list
    is pointed to by `last` (we do not use NULL to terminate the list).

    As we process an entry, any waiters for that entry are added at the end of
    the list, to be processed in subsequent iterations. Then the entry is
    added to the group_commit_queue. This continues until the list is
    exhausted, with all entries ever added eventually processed.

    The end result is a breadth-first traversal of the tree of waiters,
    re-using the `next` pointers of the group_commit_entry objects in place of
    extra stack space in a recursive traversal.

    The temporary list linked through these `next` pointers is not used by the
    caller or any other function; it only exists while doing the iterative
    tree traversal. After, all the processed entries are linked into the
    group_commit_queue.
  */
Michael Widenius's avatar
Michael Widenius committed
7189

7190
  cur= wfc;
7191
  last= orig_entry;
unknown's avatar
unknown committed
7192
  entry= orig_entry;
7193
  for (;;)
7194
  {
7195
    group_commit_entry *next_entry;
7196 7197 7198 7199 7200 7201 7202 7203

    if (entry->cache_mngr->using_xa)
    {
      DEBUG_SYNC(entry->thd, "commit_before_prepare_ordered");
      run_prepare_ordered(entry->thd, entry->all);
      DEBUG_SYNC(entry->thd, "commit_after_prepare_ordered");
    }

7204
    if (cur)
7205
    {
unknown's avatar
unknown committed
7206
      /*
7207 7208 7209 7210 7211 7212 7213
        Now that we have taken LOCK_prepare_ordered and will queue up in the
        group commit queue, it is safe for following transactions to queue
        themselves. We will grab here any transaction that is now ready to
        queue up, but after that, more transactions may become ready while the
        leader is waiting to start the group commit. So set the flag
        `commit_started', so that later transactions can still participate in
        the group commit..
unknown's avatar
unknown committed
7214
      */
7215
      cur->commit_started= true;
7216

unknown's avatar
unknown committed
7217
      /*
7218 7219
        Check if this transaction has other transaction waiting for it to
        commit.
7220

7221 7222
        If so, process the waiting transactions, and their waiters and so on,
        transitively.
unknown's avatar
unknown committed
7223
      */
7224
      if (cur->subsequent_commits_list)
7225
      {
7226
        wait_for_commit *waiter, **waiter_ptr;
7227

7228 7229 7230 7231 7232 7233 7234 7235
        mysql_mutex_lock(&cur->LOCK_wait_commit);
        /*
          Grab the list, now safely under lock, and process it if still
          non-empty.
        */
        waiter= cur->subsequent_commits_list;
        waiter_ptr= &cur->subsequent_commits_list;
        while (waiter)
7236
        {
7237 7238 7239 7240
          wait_for_commit *next_waiter= waiter->next_subsequent_commit;
          group_commit_entry *entry2=
            (group_commit_entry *)waiter->opaque_pointer;
          if (entry2)
unknown's avatar
unknown committed
7241
          {
7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261
            /*
              This is another transaction ready to be written to the binary
              log. We can put it into the queue directly, without needing a
              separate context switch to the other thread. We just set a flag
              so that the other thread will know when it wakes up that it was
              already processed.

              So remove it from the list of our waiters, and instead put it at
              the end of the list to be processed in a subsequent iteration of
              the outer loop.
            */
            *waiter_ptr= next_waiter;
            entry2->queued_by_other= true;
            last->next= entry2;
            last= entry2;
            /*
              As a small optimisation, we do not actually need to set
              entry2->next to NULL, as we can use the pointer `last' to check
              for end-of-list.
            */
7262
          }
7263
          else
unknown's avatar
unknown committed
7264
          {
7265 7266 7267 7268 7269 7270 7271 7272
            /*
              This transaction is not ready to participate in the group commit
              yet, so leave it in the waiter list. It might join the group
              commit later, if it completes soon enough to do so (it will see
              our wfc->commit_started flag set), or it might commit later in a
              later group commit.
            */
            waiter_ptr= &waiter->next_subsequent_commit;
7273
          }
7274
          waiter= next_waiter;
unknown's avatar
unknown committed
7275
        }
7276
        mysql_mutex_unlock(&cur->LOCK_wait_commit);
7277
      }
7278
    }
7279

7280 7281 7282 7283 7284 7285 7286 7287
    /*
      Handle the heuristics that if another transaction is waiting for this
      transaction (or if it does so later), then we want to trigger group
      commit immediately, without waiting for the binlog_commit_wait_usec
      timeout to expire.
    */
    entry->thd->waiting_on_group_commit= true;

7288 7289 7290 7291 7292
    /* Add the entry to the group commit queue. */
    next_entry= entry->next;
    entry->next= group_commit_queue;
    group_commit_queue= entry;
    if (entry == last)
7293
      break;
unknown's avatar
unknown committed
7294 7295 7296 7297
    /*
      Move to the next entry in the flattened list of waiting transactions
      that still need to be processed transitively.
    */
7298
    entry= next_entry;
7299
    DBUG_ASSERT(entry != NULL);
7300
    cur= entry->thd->wait_for_commit_ptr;
7301
  }
7302

7303
  if (opt_binlog_commit_wait_count > 0 && orig_queue != NULL)
7304
    mysql_cond_signal(&COND_prepare_ordered);
Sergei Golubchik's avatar
Sergei Golubchik committed
7305
  mysql_mutex_unlock(&LOCK_prepare_ordered);
unknown's avatar
unknown committed
7306
  DEBUG_SYNC(orig_entry->thd, "commit_after_release_LOCK_prepare_ordered");
7307

unknown's avatar
unknown committed
7308 7309 7310
  DBUG_PRINT("info", ("Queued for group commit as %s\n",
                      (orig_queue == NULL) ? "leader" : "participant"));
  DBUG_RETURN(orig_queue == NULL);
7311 7312 7313 7314 7315
}

/*
  Write one transaction/statement to the binlog through group commit.

  The entry is first queued with queue_for_group_commit(). If this thread
  becomes the leader it runs trx_group_commit_leader() for the whole queue;
  otherwise it waits to be woken up, unless it was queued by another thread
  (entry->queued_by_other), in which case no wait is done here.

  When opt_optimize_thread_scheduling is off, every thread additionally runs
  its own commit_ordered() step here under LOCK_commit_ordered and then wakes
  up the next entry in the queue; the last entry runs checkpoint_and_purge()
  if requested through entry->check_purge.

  @param entry  Group commit entry for the transaction to write.

  @return  true if queueing failed or if entry->error was set (the error is
           then reported with my_error()/my_printf_error() in this thread);
           otherwise the result of entry->thd->wait_for_prior_commit().
*/
bool
MYSQL_BIN_LOG::write_transaction_to_binlog_events(group_commit_entry *entry)
{
  int is_leader= queue_for_group_commit(entry);

  /*
    The first in the queue handles group commit for all; the others just wait
    to be signalled when group commit is done.
  */
  if (is_leader < 0)
    return true;                                /* Error */
  else if (is_leader)
    trx_group_commit_leader(entry);
  else if (!entry->queued_by_other)
    entry->thd->wait_for_wakeup_ready();
  else
  {
    /*
      If we were queued by another prior commit, then we are woken up
      only when the leader has already completed the commit for us.
      So nothing to do here then.
    */
  }

  if (!opt_optimize_thread_scheduling)
  {
    /* For the leader, trx_group_commit_leader() already took the lock. */
    if (!is_leader)
      mysql_mutex_lock(&LOCK_commit_ordered);

    DEBUG_SYNC(entry->thd, "commit_loop_entry_commit_ordered");
    ++num_commits;
    if (entry->cache_mngr->using_xa && !entry->error)
      run_commit_ordered(entry->thd, entry->all);

    /* Read entry->next before the lock is released and others are woken. */
    group_commit_entry *next= entry->next;
    if (!next)
    {
      /* Last entry of this group: let the next group commit proceed. */
      group_commit_queue_busy= FALSE;
      mysql_cond_signal(&COND_queue_busy);
      DEBUG_SYNC(entry->thd, "commit_after_group_run_commit_ordered");
    }
    mysql_mutex_unlock(&LOCK_commit_ordered);
    entry->thd->wakeup_subsequent_commits(entry->error);

    if (next)
    {
      /*
        Wake up the next thread in the group commit.

        The next thread can be waiting in two different ways, depending on
        whether it put itself in the queue, or if it was put in queue by us
        because it had to wait for us to commit first.

        So execute the appropriate wakeup, identified by the queued_by_other
        field.
      */
      if (next->queued_by_other)
        next->thd->wait_for_commit_ptr->wakeup(entry->error);
      else
        next->thd->signal_wakeup_ready();
    }
    else
    {
      /*
        If we rotated the binlog, and if we are using the unoptimized thread
        scheduling where every thread runs its own commit_ordered(), then we
        must do the commit checkpoint and log purge here, after all
        commit_ordered() calls have finished, and locks have been released.
      */
      if (entry->check_purge)
        checkpoint_and_purge(entry->binlog_id);
    }

  }

  if (likely(!entry->error))
    return entry->thd->wait_for_prior_commit();

  /*
    The leader cannot call my_error() on behalf of another thread, so the
    error recorded in the entry is reported here, in the owning thread.
  */
  switch (entry->error)
  {
  case ER_ERROR_ON_WRITE:
    my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, entry->commit_errno);
    break;
  case ER_ERROR_ON_READ:
    my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
             entry->error_cache->file_name, entry->commit_errno);
    break;
  default:
    /*
      There are not (and should not be) any errors thrown not covered above.
      But just in case one is added later without updating the above switch
      statement, include a catch-all.
    */
    my_printf_error(entry->error,
                    "Error writing transaction to binary log: %d",
                    MYF(ME_NOREFRESH), entry->error);
  }

  /*
    Since we return error, this transaction XID will not be committed, so
    we need to mark it as not needed for recovery (unlog() is not called
    for a transaction if log_xid() fails).
  */
  if (entry->cache_mngr->using_xa && entry->cache_mngr->xa_xid &&
      entry->cache_mngr->need_unlog)
    mark_xid_done(entry->cache_mngr->binlog_id, true);

  return true;
}

/*
  Do binlog group commit as the lead thread.

Sergei Golubchik's avatar
Sergei Golubchik committed
7427
  This must be called when this statement/transaction is queued at the start of
7428 7429 7430 7431 7432 7433 7434
  the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
  commit all the transactions in the queue (more may have entered while waiting
  for LOCK_log). After commit is done, all other threads in the queue will be
  signalled.

 */
void
7435
MYSQL_BIN_LOG::trx_group_commit_leader(group_commit_entry *leader)
7436
{
7437
  uint xid_count= 0;
Sergei Golubchik's avatar
Sergei Golubchik committed
7438
  my_off_t UNINIT_VAR(commit_offset);
7439
  group_commit_entry *current, *last_in_queue;
7440
  group_commit_entry *queue= NULL;
Sergei Golubchik's avatar
Sergei Golubchik committed
7441
  bool check_purge= false;
7442
  ulong UNINIT_VAR(binlog_id);
7443
  uint64 commit_id;
Sergei Golubchik's avatar
Sergei Golubchik committed
7444
  DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_leader");
7445

7446
  {
7447 7448 7449 7450
    DBUG_EXECUTE_IF("inject_binlog_commit_before_get_LOCK_log",
      DBUG_ASSERT(!debug_sync_set_action(leader->thd, STRING_WITH_LEN
        ("commit_before_get_LOCK_log SIGNAL waiting WAIT_FOR cont TIMEOUT 1")));
    );
Sergei Golubchik's avatar
Sergei Golubchik committed
7451 7452 7453 7454
    /*
      Lock the LOCK_log(), and once we get it, collect any additional writes
      that queued up while we were waiting.
    */
7455
    DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_log");
Sergei Golubchik's avatar
Sergei Golubchik committed
7456 7457 7458 7459
    mysql_mutex_lock(&LOCK_log);
    DEBUG_SYNC(leader->thd, "commit_after_get_LOCK_log");

    mysql_mutex_lock(&LOCK_prepare_ordered);
7460 7461
    if (opt_binlog_commit_wait_count)
      wait_for_sufficient_commits();
unknown's avatar
unknown committed
7462 7463 7464 7465
    /*
      Note that wait_for_sufficient_commits() may have released and
      re-acquired the LOCK_log and LOCK_prepare_ordered if it needed to wait.
    */
Sergei Golubchik's avatar
Sergei Golubchik committed
7466 7467 7468
    current= group_commit_queue;
    group_commit_queue= NULL;
    mysql_mutex_unlock(&LOCK_prepare_ordered);
unknown's avatar
unknown committed
7469
    binlog_id= current_binlog_id;
Sergei Golubchik's avatar
Sergei Golubchik committed
7470 7471 7472 7473 7474 7475

    /* As the queue is in reverse order of entering, reverse it. */
    last_in_queue= current;
    while (current)
    {
      group_commit_entry *next= current->next;
7476 7477 7478 7479 7480
      /*
        Now that group commit is started, we can clear the flag; there is no
        longer any use in waiters on this commit trying to trigger it early.
      */
      current->thd->waiting_on_group_commit= false;
Sergei Golubchik's avatar
Sergei Golubchik committed
7481 7482 7483 7484 7485 7486 7487
      current->next= queue;
      queue= current;
      current= next;
    }
    DBUG_ASSERT(leader == queue /* the leader should be first in queue */);

    /* Now we have in queue the list of transactions to be committed in order. */
7488
  }
Sergei Golubchik's avatar
Sergei Golubchik committed
7489
    
7490 7491 7492
  DBUG_ASSERT(is_open());
  if (likely(is_open()))                       // Should always be true
  {
7493
    commit_id= (last_in_queue == leader ? 0 : (uint64)leader->thd->query_id);
7494 7495
    DBUG_EXECUTE_IF("binlog_force_commit_id",
      {
7496
        const LEX_STRING commit_name= { C_STRING_WITH_LEN("commit_id") };
7497 7498 7499
        bool null_value;
        user_var_entry *entry=
          (user_var_entry*) my_hash_search(&leader->thd->user_vars,
7500 7501
                                           (uchar*) commit_name.str,
                                           commit_name.length);
7502 7503
        commit_id= entry->val_int(&null_value);
      });
unknown's avatar
unknown committed
7504
    /*
7505 7506 7507 7508 7509 7510 7511 7512 7513
      Commit every transaction in the queue.

      Note that we are doing this in a different thread than the one running
      the transaction! So we are limited in the operations we can do. In
      particular, we cannot call my_error() on behalf of a transaction, as
      that obtains the THD from thread local storage. Instead, we must set
      current->error and let the thread do the error reporting itself once
      we wake it up.
    */
7514
    for (current= queue; current != NULL; current= current->next)
7515
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
7516
      binlog_cache_mngr *cache_mngr= current->cache_mngr;
7517

7518
      /*
Sergei Golubchik's avatar
Sergei Golubchik committed
7519 7520
        We already checked before that at least one cache is non-empty; if both
        are empty we would have skipped calling into here.
7521
      */
Sergei Golubchik's avatar
Sergei Golubchik committed
7522
      DBUG_ASSERT(!cache_mngr->stmt_cache.empty() || !cache_mngr->trx_cache.empty());
7523

7524
      if ((current->error= write_transaction_or_stmt(current, commit_id)))
7525
        current->commit_errno= errno;
7526

7527
      strmake_buf(cache_mngr->last_commit_pos_file, log_file_name);
unknown's avatar
unknown committed
7528
      commit_offset= my_b_write_tell(&log_file);
Sergei Golubchik's avatar
Sergei Golubchik committed
7529 7530
      cache_mngr->last_commit_pos_offset= commit_offset;
      if (cache_mngr->using_xa && cache_mngr->xa_xid)
7531
      {
7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547
        /*
          If all storage engines support commit_checkpoint_request(), then we
          do not need to keep track of when this XID is durably committed.
          Instead we will just ask the storage engine to durably commit all its
          XIDs when we rotate a binlog file.
        */
        if (current->need_unlog)
        {
          xid_count++;
          cache_mngr->need_unlog= true;
          cache_mngr->binlog_id= binlog_id;
        }
        else
          cache_mngr->need_unlog= false;

        cache_mngr->delayed_error= false;
7548
      }
7549
    }
7550

Sergei Golubchik's avatar
Sergei Golubchik committed
7551 7552
    bool synced= 0;
    if (flush_and_sync(&synced))
7553
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
7554
      for (current= queue; current != NULL; current= current->next)
7555
      {
Sergei Golubchik's avatar
Sergei Golubchik committed
7556
        if (!current->error)
7557
        {
Sergei Golubchik's avatar
Sergei Golubchik committed
7558 7559 7560
          current->error= ER_ERROR_ON_WRITE;
          current->commit_errno= errno;
          current->error_cache= NULL;
7561 7562
        }
      }
Sergei Golubchik's avatar
Sergei Golubchik committed
7563 7564 7565 7566 7567
    }
    else
    {
      bool any_error= false;
      bool all_error= true;
7568 7569 7570 7571 7572 7573

      mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
      mysql_mutex_assert_owner(&LOCK_log);
      mysql_mutex_assert_not_owner(&LOCK_after_binlog_sync);
      mysql_mutex_assert_not_owner(&LOCK_commit_ordered);
      bool first= true, last;
Sergei Golubchik's avatar
Sergei Golubchik committed
7574
      for (current= queue; current != NULL; current= current->next)
7575
      {
7576
        last= current->next == NULL;
Sergei Golubchik's avatar
Sergei Golubchik committed
7577 7578
        if (!current->error &&
            RUN_HOOK(binlog_storage, after_flush,
7579 7580
                (current->thd,
                 current->cache_mngr->last_commit_pos_file,
7581 7582
                 current->cache_mngr->last_commit_pos_offset, synced,
                 first, last)))
Sergei Golubchik's avatar
Sergei Golubchik committed
7583 7584 7585 7586 7587 7588 7589 7590
        {
          current->error= ER_ERROR_ON_WRITE;
          current->commit_errno= -1;
          current->error_cache= NULL;
          any_error= true;
        }
        else
          all_error= false;
7591
        first= false;
7592
      }
Sergei Golubchik's avatar
Sergei Golubchik committed
7593

7594 7595 7596 7597 7598 7599 7600 7601
      /* update binlog_end_pos so it can be read by dump thread
       *
       * note: must be _after_ the RUN_HOOK(after_flush) or else
       * semi-sync-plugin might not have put the transaction into
       * it's list before dump-thread tries to send it
       */
      update_binlog_end_pos(commit_offset);

Sergei Golubchik's avatar
Sergei Golubchik committed
7602 7603 7604 7605
      if (any_error)
        sql_print_error("Failed to run 'after_flush' hooks");
      if (!all_error)
        signal_update();
unknown's avatar
unknown committed
7606
    }
7607

7608
    /*
7609 7610 7611 7612
      If any commit_events are Xid_log_event, increase the number of pending
      XIDs in current binlog (it's decreased in ::unlog()). When the count in
      a (not active) binlog file reaches zero, we know that it is no longer
      needed in XA recovery, and we can log a new binlog checkpoint event.
7613
    */
7614
    if (xid_count > 0)
7615
    {
7616
      mark_xids_active(binlog_id, xid_count);
7617
    }
7618 7619

    if (rotate(false, &check_purge))
7620
    {
7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636
      /*
        If we fail to rotate, which thread should get the error?
        We give the error to the leader, as any my_error() thrown inside
        rotate() will have been registered for the leader THD.

        However we must not return error from here - that would cause
        ha_commit_trans() to abort and rollback the transaction, which would
        leave an inconsistent state with the transaction committed in the
        binlog but rolled back in the engine.

        Instead set a flag so that we can return error later, from unlog(),
        when the transaction has been safely committed in the engine.
      */
      leader->cache_mngr->delayed_error= true;
      my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, errno);
      check_purge= false;
7637
    }
7638 7639
    /* In case of binlog rotate, update the correct current binlog offset. */
    commit_offset= my_b_write_tell(&log_file);
unknown's avatar
unknown committed
7640
  }
unknown's avatar
unknown committed
7641

7642 7643
  DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_after_binlog_sync");
  mysql_mutex_lock(&LOCK_after_binlog_sync);
7644
  /*
7645
    We cannot unlock LOCK_log until we have locked LOCK_after_binlog_sync;
7646 7647
    otherwise scheduling could allow the next group commit to run ahead of us,
    messing up the order of commit_ordered() calls. But as soon as
7648
    LOCK_after_binlog_sync is obtained, we can let the next group commit start.
7649
  */
Marc Alff's avatar
Marc Alff committed
7650
  mysql_mutex_unlock(&LOCK_log);
7651

7652
  DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_log");
7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689

  /*
    Loop through threads and run the binlog_sync hook
  */
  {
    mysql_mutex_assert_not_owner(&LOCK_prepare_ordered);
    mysql_mutex_assert_not_owner(&LOCK_log);
    mysql_mutex_assert_owner(&LOCK_after_binlog_sync);
    mysql_mutex_assert_not_owner(&LOCK_commit_ordered);

    bool first= true, last;
    for (current= queue; current != NULL; current= current->next)
    {
      last= current->next == NULL;
      if (!current->error &&
          RUN_HOOK(binlog_storage, after_sync,
                   (current->thd, log_file_name,
                    current->cache_mngr->last_commit_pos_offset,
                    first, last)))
      {
      /* error is already printed inside hook */
      }
      first= false;
    }
  }

  DEBUG_SYNC(leader->thd, "commit_before_get_LOCK_commit_ordered");
  mysql_mutex_lock(&LOCK_commit_ordered);
  last_commit_pos_offset= commit_offset;

  /*
    Unlock LOCK_after_binlog_sync only *after* LOCK_commit_ordered has been
    acquired so that groups can not reorder for the different stages of
    the group commit procedure.
  */
  mysql_mutex_unlock(&LOCK_after_binlog_sync);
  DEBUG_SYNC(leader->thd, "commit_after_release_LOCK_after_binlog_sync");
7690
  ++num_group_commits;
7691

unknown's avatar
unknown committed
7692
  if (!opt_optimize_thread_scheduling)
7693
  {
unknown's avatar
unknown committed
7694 7695 7696 7697 7698 7699 7700 7701 7702
    /*
      If we want to run commit_ordered() each in the transaction's own thread
      context, then we need to mark the queue reserved; we need to finish all
      threads in one group commit before the next group commit can be allowed
      to proceed, and we cannot unlock a simple pthreads mutex in a different
      thread from the one that locked it.
    */

    while (group_commit_queue_busy)
Sergei Golubchik's avatar
Sergei Golubchik committed
7703
      mysql_cond_wait(&COND_queue_busy, &LOCK_commit_ordered);
unknown's avatar
unknown committed
7704 7705
    group_commit_queue_busy= TRUE;

7706 7707 7708 7709 7710 7711 7712 7713 7714
    /*
      Set these so parent can run checkpoint_and_purge() in last thread.
      (When using optimized thread scheduling, we run checkpoint_and_purge()
      in this function, so parent does not need to and we need not set these
      values).
    */
    last_in_queue->check_purge= check_purge;
    last_in_queue->binlog_id= binlog_id;

unknown's avatar
unknown committed
7715 7716
    /* Note that we return with LOCK_commit_ordered locked! */
    DBUG_VOID_RETURN;
7717
  }
unknown's avatar
unknown committed
7718

7719 7720 7721
  /*
    Wakeup each participant waiting for our group commit, first calling the
    commit_ordered() methods for any transactions doing 2-phase commit.
7722
  */
7723 7724
  current= queue;
  while (current != NULL)
7725
  {
unknown's avatar
unknown committed
7726 7727
    group_commit_entry *next;

7728 7729
    DEBUG_SYNC(leader->thd, "commit_loop_entry_commit_ordered");
    ++num_commits;
7730 7731
    if (current->cache_mngr->using_xa && !current->error &&
        DBUG_EVALUATE_IF("skip_commit_ordered", 0, 1))
7732
      run_commit_ordered(current->thd, current->all);
7733
    current->thd->wakeup_subsequent_commits(current->error);
7734

7735 7736 7737
    /*
      Careful not to access current->next after waking up the other thread! As
      it may change immediately after wakeup.
7738
    */
unknown's avatar
unknown committed
7739
    next= current->next;
7740
    if (current != leader)                      // Don't wake up ourself
7741 7742
    {
      if (current->queued_by_other)
7743
        current->thd->wait_for_commit_ptr->wakeup(current->error);
7744 7745 7746
      else
        current->thd->signal_wakeup_ready();
    }
7747
    current= next;
7748
  }
7749
  DEBUG_SYNC(leader->thd, "commit_after_group_run_commit_ordered");
Sergei Golubchik's avatar
Sergei Golubchik committed
7750
  mysql_mutex_unlock(&LOCK_commit_ordered);
7751
  DEBUG_SYNC(leader->thd, "commit_after_group_release_commit_ordered");
7752

7753
  if (check_purge)
7754
    checkpoint_and_purge(binlog_id);
7755 7756

  DBUG_VOID_RETURN;
unknown's avatar
unknown committed
7757 7758
}

7759

7760
int
7761 7762
MYSQL_BIN_LOG::write_transaction_or_stmt(group_commit_entry *entry,
                                         uint64 commit_id)
7763
{
Sergei Golubchik's avatar
Sergei Golubchik committed
7764
  binlog_cache_mngr *mngr= entry->cache_mngr;
7765
  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_or_stmt");
unknown's avatar
unknown committed
7766

7767
  if (write_gtid_event(entry->thd, false, entry->using_trx_cache, commit_id))
7768
    DBUG_RETURN(ER_ERROR_ON_WRITE);
7769

Sergei Golubchik's avatar
Sergei Golubchik committed
7770 7771 7772 7773
  if (entry->using_stmt_cache && !mngr->stmt_cache.empty() &&
      write_cache(entry->thd, mngr->get_binlog_cache_log(FALSE)))
  {
    entry->error_cache= &mngr->stmt_cache.cache_log;
7774
    DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785
  }

  if (entry->using_trx_cache && !mngr->trx_cache.empty())
  {
    DBUG_EXECUTE_IF("crash_before_writing_xid",
                    {
                      if ((write_cache(entry->thd,
                                       mngr->get_binlog_cache_log(TRUE))))
                        DBUG_PRINT("info", ("error writing binlog cache"));
                      else
                        flush_and_sync(0);
7786

Sergei Golubchik's avatar
Sergei Golubchik committed
7787 7788 7789
                      DBUG_PRINT("info", ("crashing before writing xid"));
                      DBUG_SUICIDE();
                    });
7790

Sergei Golubchik's avatar
Sergei Golubchik committed
7791 7792 7793
    if (write_cache(entry->thd, mngr->get_binlog_cache_log(TRUE)))
    {
      entry->error_cache= &mngr->trx_cache.cache_log;
7794
      DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7795 7796
    }
  }
7797

7798 7799 7800
  DBUG_EXECUTE_IF("inject_error_writing_xid",
                  {
                    entry->error_cache= NULL;
7801
                    errno= 28;
7802 7803 7804
                    DBUG_RETURN(ER_ERROR_ON_WRITE);
                  });

7805
  if (entry->end_event->write(&log_file))
Sergei Golubchik's avatar
Sergei Golubchik committed
7806 7807
  {
    entry->error_cache= NULL;
7808
    DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7809
  }
7810 7811
  status_var_add(entry->thd->status_var.binlog_bytes_written,
                 entry->end_event->data_written);
7812

7813 7814 7815
  if (entry->incident_event)
  {
    if (entry->incident_event->write(&log_file))
Sergei Golubchik's avatar
Sergei Golubchik committed
7816 7817
    {
      entry->error_cache= NULL;
7818
      DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7819
    }
7820
  }
7821

Sergei Golubchik's avatar
Sergei Golubchik committed
7822 7823 7824
  if (mngr->get_binlog_cache_log(FALSE)->error) // Error on read
  {
    entry->error_cache= &mngr->stmt_cache.cache_log;
7825
    DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7826 7827 7828 7829
  }
  if (mngr->get_binlog_cache_log(TRUE)->error)  // Error on read
  {
    entry->error_cache= &mngr->trx_cache.cache_log;
7830
    DBUG_RETURN(ER_ERROR_ON_WRITE);
Sergei Golubchik's avatar
Sergei Golubchik committed
7831
  }
7832

7833
  DBUG_RETURN(0);
7834
}
7835

7836

unknown's avatar
unknown committed
7837 7838 7839 7840 7841 7842 7843
/*
  Wait for sufficient commits to queue up for group commit, according to the
  values of binlog_commit_wait_count and binlog_commit_wait_usec.

  Note that this function may release and re-acquire LOCK_log and
  LOCK_prepare_ordered if it needs to wait.
*/
Michael Widenius's avatar
Michael Widenius committed
7844

7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855
void
MYSQL_BIN_LOG::wait_for_sufficient_commits()
{
  size_t count;
  group_commit_entry *e;
  group_commit_entry *last_head;
  struct timespec wait_until;

  mysql_mutex_assert_owner(&LOCK_log);
  mysql_mutex_assert_owner(&LOCK_prepare_ordered);

unknown's avatar
unknown committed
7856
  for (e= last_head= group_commit_queue, count= 0; e; e= e->next)
7857 7858 7859
  {
    if (++count >= opt_binlog_commit_wait_count)
    {
7860
      group_commit_trigger_count++;
7861 7862 7863 7864
      return;
    }
    if (unlikely(e->thd->has_waiter))
    {
7865
      group_commit_trigger_lock_wait++;
unknown's avatar
unknown committed
7866
      return;
7867 7868
    }
  }
7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880

  mysql_mutex_unlock(&LOCK_log);
  set_timespec_nsec(wait_until, (ulonglong)1000*opt_binlog_commit_wait_usec);

  for (;;)
  {
    int err;
    group_commit_entry *head;

    err= mysql_cond_timedwait(&COND_prepare_ordered, &LOCK_prepare_ordered,
                              &wait_until);
    if (err == ETIMEDOUT)
7881
    {
7882
      group_commit_trigger_timeout++;
7883
      break;
7884
    }
7885
    if (unlikely(last_head->thd->has_waiter))
7886
    {
7887
      group_commit_trigger_lock_wait++;
7888
      break;
7889
    }
7890 7891
    head= group_commit_queue;
    for (e= head; e && e != last_head; e= e->next)
7892
    {
7893
      ++count;
7894
      if (unlikely(e->thd->has_waiter))
7895
      {
7896
        group_commit_trigger_lock_wait++;
7897
        goto after_loop;
7898
      }
7899
    }
7900
    if (count >= opt_binlog_commit_wait_count)
7901
    {
7902
      group_commit_trigger_count++;
7903
      break;
7904
    }
7905 7906
    last_head= head;
  }
7907
after_loop:
7908

unknown's avatar
unknown committed
7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927
  /*
    We must not wait for LOCK_log while holding LOCK_prepare_ordered.
    LOCK_log can be held for long periods (eg. we do I/O under it), while
    LOCK_prepare_ordered must only be held for short periods.

    In addition, waiting for LOCK_log while holding LOCK_prepare_ordered would
    violate locking order of LOCK_log-before-LOCK_prepare_ordered. This could
    cause SAFEMUTEX warnings (even if it cannot actually deadlock with current
    code, as there can be at most one group commit leader thread at a time).

    So release and re-acquire LOCK_prepare_ordered if we need to wait for the
    LOCK_log.
  */
  if (mysql_mutex_trylock(&LOCK_log))
  {
    mysql_mutex_unlock(&LOCK_prepare_ordered);
    mysql_mutex_lock(&LOCK_log);
    mysql_mutex_lock(&LOCK_prepare_ordered);
  }
7928 7929 7930
}


7931 7932 7933 7934
void
MYSQL_BIN_LOG::binlog_trigger_immediate_group_commit()
{
  group_commit_entry *head;
7935
  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959
  head= group_commit_queue;
  if (head)
  {
    head->thd->has_waiter= true;
    mysql_cond_signal(&COND_prepare_ordered);
  }
}


/*
  This function is called when a transaction T1 goes to wait for another
  transaction T2. It is used to cut short any binlog group commit delay from
  --binlog-commit-wait-count in the case where another transaction is stalled
  on the wait due to conflicting row locks.

  If T2 is already ready to group commit, any waiting group commit will be
  signalled to proceed immediately. Otherwise, a flag will be set in T2, and
  when T2 later becomes ready, immediate group commit will be triggered.

  @param thd1  THD of the waiting transaction (T1); not used by this function.
  @param thd2  THD of the transaction being waited for (T2).

  Does nothing when opt_binlog_commit_wait_count is 0.
*/
void
binlog_report_wait_for(THD *thd1, THD *thd2)
{
  if (opt_binlog_commit_wait_count == 0)
    return;
  mysql_mutex_lock(&LOCK_prepare_ordered);
  thd2->has_waiter= true;
  if (thd2->waiting_on_group_commit)
    mysql_bin_log.binlog_trigger_immediate_group_commit();
  mysql_mutex_unlock(&LOCK_prepare_ordered);
}


unknown's avatar
unknown committed
7968
/**
Andrei Elkin's avatar
Andrei Elkin committed
7969
  Wait until we get a signal that the relay log has been updated.
7970

unknown's avatar
unknown committed
7971
  @param thd		Thread variable
7972

unknown's avatar
unknown committed
7973
  @note
7974
    One must have a lock on LOCK_log before calling this function.
7975
    This lock will be released before return! That's required by
unknown's avatar
unknown committed
7976
    THD::enter_cond() (see NOTES in sql_class.h).
7977 7978
*/

Andrei Elkin's avatar
Andrei Elkin committed
7979
void MYSQL_BIN_LOG::wait_for_update_relay_log(THD* thd)
7980
{
Sergei Golubchik's avatar
Sergei Golubchik committed
7981
  PSI_stage_info old_stage;
Andrei Elkin's avatar
Andrei Elkin committed
7982
  DBUG_ENTER("wait_for_update_relay_log");
7983

7984
  mysql_mutex_assert_owner(&LOCK_log);
Sergei Golubchik's avatar
Sergei Golubchik committed
7985 7986 7987
  thd->ENTER_COND(&update_cond, &LOCK_log,
                  &stage_slave_has_read_all_relay_log,
                  &old_stage);
Marc Alff's avatar
Marc Alff committed
7988
  mysql_cond_wait(&update_cond, &LOCK_log);
Sergei Golubchik's avatar
Sergei Golubchik committed
7989
  thd->EXIT_COND(&old_stage);
unknown's avatar
unknown committed
7990
  DBUG_VOID_RETURN;
7991
}
unknown's avatar
unknown committed
7992

Andrei Elkin's avatar
Andrei Elkin committed
7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013
/**
  Wait until we get a signal that the binary log has been updated.
  Applies to master only.

  @param[in] thd        a THD struct (not used by this function)
  @param[in] timeout    a pointer to a timespec;
                        NULL means to wait w/o timeout.
  @retval    0          if got signalled on update
  @retval    non-0      if wait timeout elapsed
  @note
    LOCK_log must be taken before calling this function.
    LOCK_log is being released while the thread is waiting.
    This function does not unlock LOCK_log itself; releasing it is left to
    the caller.
*/

int MYSQL_BIN_LOG::wait_for_update_bin_log(THD* thd,
                                           const struct timespec *timeout)
{
  int ret= 0;
  DBUG_ENTER("wait_for_update_bin_log");

  mysql_mutex_assert_owner(&LOCK_log);
  if (!timeout)
    mysql_cond_wait(&update_cond, &LOCK_log);
  else
    ret= mysql_cond_timedwait(&update_cond, &LOCK_log,
                              const_cast<struct timespec *>(timeout));
  DBUG_RETURN(ret);
}

8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038
/**
  Wait on update_cond using the binlog end position lock.

  @param[in] thd        a THD struct (not used by this function)
  @param[in] timeout    a pointer to a timespec;
                        NULL means to wait w/o timeout.
  @return    0 when waiting without a timeout; otherwise the return value
             of mysql_cond_timedwait().
  @note
    The mutex returned by get_binlog_end_pos_lock() must be held by the
    caller; it is the mutex passed to the condition wait.
*/
int MYSQL_BIN_LOG::wait_for_update_binlog_end_pos(THD* thd,
                                                  struct timespec *timeout)
{
  int ret= 0;
  DBUG_ENTER("wait_for_update_binlog_end_pos");

  mysql_mutex_assert_owner(get_binlog_end_pos_lock());
  if (!timeout)
    mysql_cond_wait(&update_cond, get_binlog_end_pos_lock());
  else
    ret= mysql_cond_timedwait(&update_cond, get_binlog_end_pos_lock(),
                              timeout);
  DBUG_RETURN(ret);
}

unknown's avatar
unknown committed
8039

unknown's avatar
unknown committed
8040 8041
/**
  Close the log file.

  @param exiting     Bitmask for one or more of the following bits:
          - LOG_CLOSE_INDEX : if we should close the index file
          - LOG_CLOSE_TO_BE_OPENED : if we intend to call open
                                     at once after close.
          - LOG_CLOSE_STOP_EVENT : write a 'stop' event to the log
          - LOG_CLOSE_DELAYED_CLOSE : do not yet close the file and clear the
                                      LOG_EVENT_BINLOG_IN_USE_F flag

  @note
    One can do an open on the object at once after doing a close.
    The internal structures are not freed until cleanup() is called
*/

8056
void MYSQL_BIN_LOG::close(uint exiting)
unknown's avatar
unknown committed
8057
{					// One can't set log_type here!
8058 8059
  bool failed_to_save_state= false;

8060
  DBUG_ENTER("MYSQL_BIN_LOG::close");
8061
  DBUG_PRINT("enter",("exiting: %d", (int) exiting));
8062
  if (log_state == LOG_OPENED)
unknown's avatar
unknown committed
8063
  {
unknown's avatar
SCRUM  
unknown committed
8064
#ifdef HAVE_REPLICATION
8065
    if (log_type == LOG_BIN &&
8066
	(exiting & LOG_CLOSE_STOP_EVENT))
unknown's avatar
unknown committed
8067 8068
    {
      Stop_log_event s;
8069 8070
      // the checksumming rule for relay-log case is similar to Rotate
        s.checksum_alg= is_relay_log ?
Michael Widenius's avatar
Michael Widenius committed
8071
          (uint8) relay_log_checksum_alg : (uint8) binlog_checksum_options;
8072 8073
      DBUG_ASSERT(!is_relay_log ||
                  relay_log_checksum_alg != BINLOG_CHECKSUM_ALG_UNDEF);
8074
      s.write(&log_file);
8075
      bytes_written+= s.data_written;
8076
      signal_update();
8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097

      /*
        When we shut down server, write out the binlog state to a separate
        file so we do not have to scan an entire binlog file to recover it
        at next server start.

        Note that this must be written and synced to disk before marking the
        last binlog file as "not crashed".
      */
      if (!is_relay_log && write_state_to_file())
      {
        sql_print_error("Failed to save binlog GTID state during shutdown. "
                        "Binlog will be marked as crashed, so that crash "
                        "recovery can recover the state at next server "
                        "startup.");
        /*
          Leave binlog file marked as crashed, so we can recover state by
          scanning it now that we failed to write out the state properly.
        */
        failed_to_save_state= true;
      }
unknown's avatar
unknown committed
8098
    }
unknown's avatar
SCRUM  
unknown committed
8099
#endif /* HAVE_REPLICATION */
8100 8101

    /* don't pwrite in a file opened with O_APPEND - it doesn't work */
8102 8103
    if (log_file.type == WRITE_CACHE && log_type == LOG_BIN
        && !(exiting & LOG_CLOSE_DELAYED_CLOSE))
8104
    {
Marc Alff's avatar
Marc Alff committed
8105
      my_off_t org_position= mysql_file_tell(log_file.file, MYF(0));
8106 8107
      if (!failed_to_save_state)
        clear_inuse_flag_when_closing(log_file.file);
8108 8109 8110
      /*
        Restore position so that anything we have in the IO_cache is written
        to the correct position.
Marc Alff's avatar
Marc Alff committed
8111
        We need the seek here, as mysql_file_pwrite() is not guaranteed to keep the
8112 8113
        original position on system that doesn't support pwrite().
      */
Marc Alff's avatar
Marc Alff committed
8114
      mysql_file_seek(log_file.file, org_position, MY_SEEK_SET, MYF(0));
8115 8116
    }

8117 8118
    /* this will cleanup IO_CACHE, sync and close the file */
    MYSQL_LOG::close(exiting);
unknown's avatar
unknown committed
8119
  }
8120 8121 8122 8123 8124 8125

  /*
    The following test is needed even if is_open() is not set, as we may have
    called a not complete close earlier and the index file is still open.
  */

8126
  if ((exiting & LOG_CLOSE_INDEX) && my_b_inited(&index_file))
8127
  {
8128
    end_io_cache(&index_file);
Marc Alff's avatar
Marc Alff committed
8129
    if (mysql_file_close(index_file.file, MYF(0)) < 0 && ! write_error)
unknown's avatar
unknown committed
8130
    {
8131
      write_error= 1;
8132 8133
      sql_print_error(ER_THD_OR_DEFAULT(current_thd, ER_ERROR_ON_WRITE),
                      index_file_name, errno);
unknown's avatar
unknown committed
8134
    }
8135
  }
8136
  log_state= (exiting & LOG_CLOSE_TO_BE_OPENED) ? LOG_TO_BE_OPENED : LOG_CLOSED;
8137 8138
  my_free(name);
  name= NULL;
8139
  DBUG_VOID_RETURN;
unknown's avatar
unknown committed
8140 8141 8142
}


8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154
/*
  Overwrite the one-byte flags field at BIN_LOG_HEADER_SIZE + FLAGS_OFFSET
  with zero. This clears LOG_EVENT_BINLOG_IN_USE_F, which marks the binlog
  file as cleanly closed and not needing crash recovery.

  The result of the write is not checked.
*/
void MYSQL_BIN_LOG::clear_inuse_flag_when_closing(File file)
{
  uchar cleared_flags= 0;    // clearing LOG_EVENT_BINLOG_IN_USE_F
  const my_off_t flags_pos= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET;

  mysql_file_pwrite(file, &cleared_flags, 1, flags_pos, MYF(0));
}


8155
/**
  Set the maximum log size; the value is only stored while the log is open.

  @param max_size_arg  New value for max_size
*/
void MYSQL_BIN_LOG::set_max_size(ulong max_size_arg)
{
  DBUG_ENTER("MYSQL_BIN_LOG::set_max_size");

  /*
    LOCK_log is required here. Without it the following race is possible:
    new_file() calls open(old_max_size); before open() starts, this function
    stores max_size_arg; open() then runs with the old_max_size argument and
    overwrites max_size, as if the SET command had never been executed.
  */
  mysql_mutex_lock(&LOCK_log);
  if (is_open())
  {
    max_size= max_size_arg;
  }
  mysql_mutex_unlock(&LOCK_log);

  DBUG_VOID_RETURN;
}
unknown's avatar
unknown committed
8171

unknown's avatar
unknown committed
8172

unknown's avatar
unknown committed
8173 8174
/**
  Check if a string is a valid number.
8175

unknown's avatar
unknown committed
8176 8177 8178
  @param str			String to test
  @param res			Store value here
  @param allow_wildcards	Set to 1 if we should ignore '%' and '_'
8179

unknown's avatar
unknown committed
8180
  @note
8181 8182 8183
    For the moment the allow_wildcards argument is not used
    Should be move to some other file.

unknown's avatar
unknown committed
8184
  @retval
8185
    1	String is a number
unknown's avatar
unknown committed
8186
  @retval
8187
    0	String is not a number
8188
*/
unknown's avatar
unknown committed
8189 8190

static bool test_if_number(register const char *str,
8191
			   ulong *res, bool allow_wildcards)
unknown's avatar
unknown committed
8192 8193 8194 8195 8196 8197 8198 8199 8200
{
  reg2 int flag;
  const char *start;
  DBUG_ENTER("test_if_number");

  flag=0; start=str;
  while (*str++ == ' ') ;
  if (*--str == '-' || *str == '+')
    str++;
8201 8202
  while (my_isdigit(files_charset_info,*str) ||
	 (allow_wildcards && (*str == wild_many || *str == wild_one)))
unknown's avatar
unknown committed
8203 8204 8205 8206 8207 8208 8209
  {
    flag=1;
    str++;
  }
  if (*str == '.')
  {
    for (str++ ;
8210
	 my_isdigit(files_charset_info,*str) ||
unknown's avatar
unknown committed
8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223
	   (allow_wildcards && (*str == wild_many || *str == wild_one)) ;
	 str++, flag=1) ;
  }
  if (*str != 0 || flag == 0)
    DBUG_RETURN(0);
  if (res)
    *res=atol(start);
  DBUG_RETURN(1);			/* Number ok */
} /* test_if_number */


/**
  Report a message together with the system's text for the last error.

  On Windows the text comes from FormatMessage() for GetLastError(); if no
  text can be obtained, only the message is logged. Elsewhere strerror(errno)
  is used when available, and perror() otherwise.

  @param message  Text that prefixes the system error description
*/
void sql_perror(const char *message)
{
#if defined(_WIN32)
  char* sys_text;
  DWORD last_error= GetLastError();
  DWORD text_len= FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
                                FORMAT_MESSAGE_FROM_SYSTEM |
                                FORMAT_MESSAGE_IGNORE_INSERTS,
                                NULL, last_error,
                                MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
                                (LPSTR)&sys_text, 0, NULL);
  if (text_len == 0)
  {
    sql_print_error("%s", message);
    return;
  }
  sql_print_error("%s: %s",message, sys_text);
  /* FORMAT_MESSAGE_ALLOCATE_BUFFER was requested, so release the buffer. */
  LocalFree((HLOCAL)sys_text);
#elif defined(HAVE_STRERROR)
  sql_print_error("%s: %s",message, strerror(errno));
#else
  perror(message);
#endif
}
8244

unknown's avatar
unknown committed
8245

8246 8247 8248 8249 8250
/*
  Change the file associated with two output streams. Used to
  redirect stdout and stderr to a file. The streams are reopened
  only for appending (writing at end of file).
*/
8251 8252 8253
extern "C" my_bool reopen_fstreams(const char *filename,
                                   FILE *outstream, FILE *errstream)
{
8254
  if (outstream && !my_freopen(filename, "a", outstream))
8255
    return TRUE;
8256

8257
  if (errstream && !my_freopen(filename, "a", errstream))
8258 8259
    return TRUE;

8260 8261 8262
  /* The error stream must be unbuffered. */
  if (errstream)
    setbuf(errstream, NULL);
8263 8264 8265 8266 8267

  return FALSE;
}


8268 8269 8270 8271 8272 8273
/*
  Unfortunately, there seems to be no good way
  to restore the original streams upon failure.
*/
static bool redirect_std_streams(const char *file)
{
8274 8275
  if (reopen_fstreams(file, stdout, stderr))
    return TRUE;
8276

8277 8278
  setbuf(stderr, NULL);
  return FALSE;
8279 8280 8281
}


unknown's avatar
unknown committed
8282 8283
bool flush_error_log()
{
8284
  bool result= 0;
8285
  if (opt_error_log)
unknown's avatar
unknown committed
8286
  {
Marc Alff's avatar
Marc Alff committed
8287
    mysql_mutex_lock(&LOCK_error_log);
8288 8289
    if (redirect_std_streams(log_error_file))
      result= 1;
Marc Alff's avatar
Marc Alff committed
8290
    mysql_mutex_unlock(&LOCK_error_log);
unknown's avatar
unknown committed
8291
  }
8292
  return result;
unknown's avatar
unknown committed
8293
}
unknown's avatar
unknown committed
8294

8295
void MYSQL_BIN_LOG::signal_update()
unknown's avatar
unknown committed
8296
{
8297
  DBUG_ENTER("MYSQL_BIN_LOG::signal_update");
Andrei Elkin's avatar
Andrei Elkin committed
8298
  signal_cnt++;
Marc Alff's avatar
Marc Alff committed
8299
  mysql_cond_broadcast(&update_cond);
unknown's avatar
unknown committed
8300 8301 8302
  DBUG_VOID_RETURN;
}

Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
8303
#ifdef _WIN32
8304
static void print_buffer_to_nt_eventlog(enum loglevel level, char *buff,
unknown's avatar
unknown committed
8305
                                        size_t length, size_t buffLen)
8306 8307
{
  HANDLE event;
8308
  char   *buffptr= buff;
unknown's avatar
unknown committed
8309
  DBUG_ENTER("print_buffer_to_nt_eventlog");
8310

8311
  /* Add ending CR/LF's to string, overwrite last chars if necessary */
8312
  strmov(buffptr+MY_MIN(length, buffLen-5), "\r\n\r\n");
8313

unknown's avatar
unknown committed
8314 8315
  setup_windows_event_source();
  if ((event= RegisterEventSource(NULL,"MySQL")))
8316
  {
unknown's avatar
unknown committed
8317
    switch (level) {
8318
      case ERROR_LEVEL:
unknown's avatar
unknown committed
8319
        ReportEvent(event, EVENTLOG_ERROR_TYPE, 0, MSG_DEFAULT, NULL, 1, 0,
8320
                    (LPCSTR*)&buffptr, NULL);
8321 8322
        break;
      case WARNING_LEVEL:
unknown's avatar
unknown committed
8323
        ReportEvent(event, EVENTLOG_WARNING_TYPE, 0, MSG_DEFAULT, NULL, 1, 0,
8324
                    (LPCSTR*) &buffptr, NULL);
8325 8326
        break;
      case INFORMATION_LEVEL:
unknown's avatar
unknown committed
8327
        ReportEvent(event, EVENTLOG_INFORMATION_TYPE, 0, MSG_DEFAULT, NULL, 1,
8328
                    0, (LPCSTR*) &buffptr, NULL);
8329 8330 8331 8332 8333 8334 8335
        break;
    }
    DeregisterEventSource(event);
  }

  DBUG_VOID_RETURN;
}
Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
8336
#endif /* _WIN32 */
unknown's avatar
unknown committed
8337

8338

8339
#ifndef EMBEDDED_LIBRARY
8340 8341
static void print_buffer_to_file(enum loglevel level, const char *buffer,
                                 size_t length)
8342 8343 8344 8345
{
  time_t skr;
  struct tm tm_tmp;
  struct tm *start;
8346 8347 8348
  THD *thd;
  int tag_length= 0;
  char tag[NAME_LEN];
8349 8350 8351
  DBUG_ENTER("print_buffer_to_file");
  DBUG_PRINT("enter",("buffer: %s", buffer));

8352 8353 8354 8355 8356 8357 8358 8359
  if (mysqld_server_initialized && (thd= current_thd))
  {
    if (thd->connection_name.length)
    {
      /*
        Add tag for slaves so that the user can see from which connection
        the error originates.
      */
8360 8361
      tag_length= my_snprintf(tag, sizeof(tag),
                              ER_THD(thd, ER_MASTER_LOG_PREFIX),
8362 8363 8364 8365 8366
                              (int) thd->connection_name.length,
                              thd->connection_name.str);
    }
  }

Marc Alff's avatar
Marc Alff committed
8367
  mysql_mutex_lock(&LOCK_error_log);
8368

8369
  skr= my_time(0);
8370 8371
  localtime_r(&skr, &tm_tmp);
  start=&tm_tmp;
8372

8373 8374
  fprintf(stderr, "%d-%02d-%02d %2d:%02d:%02d %lu [%s] %.*s%.*s\n",
          start->tm_year + 1900,
8375 8376 8377 8378 8379
          start->tm_mon+1,
          start->tm_mday,
          start->tm_hour,
          start->tm_min,
          start->tm_sec,
8380
          (unsigned long) pthread_self(),
8381 8382
          (level == ERROR_LEVEL ? "ERROR" : level == WARNING_LEVEL ?
           "Warning" : "Note"),
8383
          tag_length, tag,
8384
          (int) length, buffer);
8385 8386 8387

  fflush(stderr);

Marc Alff's avatar
Marc Alff committed
8388
  mysql_mutex_unlock(&LOCK_error_log);
8389 8390 8391
  DBUG_VOID_RETURN;
}

8392 8393 8394
/**
  Prints a printf style message to the error log and, under NT, to the
  Windows event log.
8395

8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407
  This function prints the message into a buffer and then sends that buffer
  to other functions to write that message to other logging sources.

  @param level          The level of the msg significance
  @param format         Printf style format of message
  @param args           va_list list of arguments for the message

  @returns
    The function always returns 0. The return value is present in the
    signature to be compatible with other logging routines, which could
    return an error (e.g. logging to the log tables)
*/
unknown's avatar
unknown committed
8408
int vprint_msg_to_log(enum loglevel level, const char *format, va_list args)
unknown's avatar
unknown committed
8409 8410
{
  char   buff[1024];
unknown's avatar
unknown committed
8411
  size_t length;
8412
  DBUG_ENTER("vprint_msg_to_log");
unknown's avatar
unknown committed
8413

8414
  length= my_vsnprintf(buff, sizeof(buff), format, args);
8415
  print_buffer_to_file(level, buff, length);
unknown's avatar
unknown committed
8416

Vladislav Vaintroub's avatar
Vladislav Vaintroub committed
8417
#ifdef _WIN32
unknown's avatar
unknown committed
8418
  print_buffer_to_nt_eventlog(level, buff, length, sizeof(buff));
unknown's avatar
unknown committed
8419
#endif
8420

8421
  DBUG_RETURN(0);
unknown's avatar
unknown committed
8422
}
8423
#endif /* EMBEDDED_LIBRARY */
unknown's avatar
unknown committed
8424

8425

unknown's avatar
unknown committed
8426
void sql_print_error(const char *format, ...) 
unknown's avatar
unknown committed
8427 8428
{
  va_list args;
unknown's avatar
unknown committed
8429
  DBUG_ENTER("sql_print_error");
unknown's avatar
unknown committed
8430

unknown's avatar
unknown committed
8431
  va_start(args, format);
8432
  error_log_print(ERROR_LEVEL, format, args);
unknown's avatar
unknown committed
8433
  va_end(args);
unknown's avatar
unknown committed
8434 8435 8436 8437 8438

  DBUG_VOID_RETURN;
}


unknown's avatar
unknown committed
8439
void sql_print_warning(const char *format, ...) 
unknown's avatar
unknown committed
8440 8441
{
  va_list args;
unknown's avatar
unknown committed
8442
  DBUG_ENTER("sql_print_warning");
unknown's avatar
unknown committed
8443

unknown's avatar
unknown committed
8444
  va_start(args, format);
8445
  error_log_print(WARNING_LEVEL, format, args);
unknown's avatar
unknown committed
8446
  va_end(args);
unknown's avatar
unknown committed
8447 8448 8449 8450 8451

  DBUG_VOID_RETURN;
}


unknown's avatar
unknown committed
8452
void sql_print_information(const char *format, ...) 
unknown's avatar
unknown committed
8453 8454
{
  va_list args;
unknown's avatar
unknown committed
8455
  DBUG_ENTER("sql_print_information");
unknown's avatar
unknown committed
8456

unknown's avatar
unknown committed
8457
  va_start(args, format);
8458
  error_log_print(INFORMATION_LEVEL, format, args);
unknown's avatar
unknown committed
8459
  va_end(args);
unknown's avatar
unknown committed
8460

unknown's avatar
unknown committed
8461 8462
  DBUG_VOID_RETURN;
}
8463

8464

8465 8466 8467 8468 8469 8470
/**
  Call the prepare_ordered() hook of every engine taking part in the
  transaction. Engines that do not provide the hook are skipped.

  The caller must hold LOCK_prepare_ordered.

  @param thd  Thread whose transaction is processed
  @param all  true to walk the whole-transaction engine list, false to walk
              the statement engine list
*/
void
TC_LOG::run_prepare_ordered(THD *thd, bool all)
{
  Ha_trx_info *trx_info;

  if (all)
    trx_info= thd->transaction.all.ha_list;
  else
    trx_info= thd->transaction.stmt.ha_list;

  mysql_mutex_assert_owner(&LOCK_prepare_ordered);
  while (trx_info)
  {
    handlerton *engine= trx_info->ht();
    if (engine->prepare_ordered)
      engine->prepare_ordered(engine, thd, all);
    trx_info= trx_info->next();
  }
}

unknown's avatar
unknown committed
8481

8482 8483 8484 8485 8486 8487
/**
  Call the commit_ordered() hook of every engine taking part in the
  transaction. Engines that do not provide the hook are skipped.

  The caller must hold LOCK_commit_ordered.

  @param thd  Thread whose transaction is processed
  @param all  true to walk the whole-transaction engine list, false to walk
              the statement engine list
*/
void
TC_LOG::run_commit_ordered(THD *thd, bool all)
{
  Ha_trx_info *trx_info;

  if (all)
    trx_info= thd->transaction.all.ha_list;
  else
    trx_info= thd->transaction.stmt.ha_list;

  mysql_mutex_assert_owner(&LOCK_commit_ordered);
  while (trx_info)
  {
    handlerton *engine= trx_info->ht();
    if (engine->commit_ordered)
    {
      engine->commit_ordered(engine, thd, all);
      /* The sync point is hit once per engine that ran the hook. */
      DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
    }
    trx_info= trx_info->next();
  }
}

unknown's avatar
unknown committed
8499

8500 8501 8502
/**
  Log a transaction's xid and run the engines' ordered hooks around it.

  run_prepare_ordered() is executed serialised under LOCK_prepare_ordered,
  the xid is then logged with log_one_transaction(), and finally
  run_commit_ordered() is executed. When both hooks are requested, threads
  queue up during the prepare step so that the commit step runs in the same
  sequence.

  @param thd                   Thread committing the transaction
  @param xid                   Transaction id to log; nothing is logged
                               when it is 0
  @param all                   Passed through to run_prepare_ordered() and
                               run_commit_ordered()
  @param need_prepare_ordered  true if run_prepare_ordered() must be called
  @param need_commit_ordered   true if run_commit_ordered() must be called

  @return the cookie from log_one_transaction(), or 0 when xid is 0 or when
          thd->wait_for_prior_commit() returns non-zero
*/
int TC_LOG_MMAP::log_and_order(THD *thd, my_xid xid, bool all,
                               bool need_prepare_ordered,
                               bool need_commit_ordered)
8503 8504
{
  int cookie;
8505
  /* Queue node for this thread; it lives on this function's stack. */
  struct commit_entry entry;
8506
  /* Only assigned, and only read, when both hooks are requested. */
  bool UNINIT_VAR(is_group_commit_leader);
8507 8508 8509

  if (need_prepare_ordered)
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
8510
    mysql_mutex_lock(&LOCK_prepare_ordered);
8511 8512 8513 8514 8515 8516 8517
    run_prepare_ordered(thd, all);
    if (need_commit_ordered)
    {
      /*
        Must put us in queue so we can run_commit_ordered() in same sequence
        as we did run_prepare_ordered().
      */
8518
      thd->clear_wakeup_ready();
8519
      entry.thd= thd;
8520
      /*
        Push this entry at the head of the list, so the list is in reverse
        arrival order. The thread that found the list empty is the leader.
      */
      commit_entry *previous_queue= commit_ordered_queue;
8521
      entry.next= previous_queue;
8522
      commit_ordered_queue= &entry;
8523 8524
      is_group_commit_leader= (previous_queue == NULL);
    }
Sergei Golubchik's avatar
Sergei Golubchik committed
8525
    mysql_mutex_unlock(&LOCK_prepare_ordered);
8526 8527
  }

8528 8529
  /*
    NOTE(review): when both hooks were requested, 'entry' is already linked
    into commit_ordered_queue at this point. Returning here looks like it
    leaves a pointer to this stack frame in the queue and skips the wakeup
    hand-off below - confirm whether wait_for_prior_commit() can fail on
    this path.
  */
  if (thd->wait_for_prior_commit())
    return 0;
8530

unknown's avatar
unknown committed
8531
  cookie= 0;
8532
  if (xid)
8533
    cookie= log_one_transaction(xid);
8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549

  if (need_commit_ordered)
  {
    if (need_prepare_ordered)
    {
      /*
        We did the run_prepare_ordered() serialised, then ran the log_xid() in
        parallel. Now we have to do run_commit_ordered() serialised in the
        same sequence as run_prepare_ordered().

        We do this starting from the head of the queue, each thread doing
        run_commit_ordered() and signalling the next in queue.
      */
      if (is_group_commit_leader)
      {
        /* The first in queue starts the ball rolling. */
Sergei Golubchik's avatar
Sergei Golubchik committed
8550
        mysql_mutex_lock(&LOCK_prepare_ordered);
8551
        /* Wait until the previously detached queue has been fully handled. */
        while (commit_ordered_queue_busy)
Sergei Golubchik's avatar
Sergei Golubchik committed
8552
          mysql_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
8553 8554
        /* Detach the whole list; later arrivals start a new queue. */
        commit_entry *queue= commit_ordered_queue;
        commit_ordered_queue= NULL;
8555 8556 8557 8558
        /*
          Mark the queue busy while we bounce it from one thread to the
          next.
        */
8559
        commit_ordered_queue_busy= true;
Sergei Golubchik's avatar
Sergei Golubchik committed
8560
        mysql_mutex_unlock(&LOCK_prepare_ordered);
8561

8562 8563 8564 8565 8566 8567 8568 8569 8570 8571
        /* Reverse the queue list so we get correct order. */
        commit_entry *prev= NULL;
        while (queue)
        {
          commit_entry *next= queue->next;
          queue->next= prev;
          prev= queue;
          queue= next;
        }
        /* After the reversal the leader's own entry is the list head. */
        DBUG_ASSERT(prev == &entry && prev->thd == thd);
8572 8573 8574 8575
      }
      else
      {
        /* Not first in queue; just wait until previous thread wakes us up. */
8576
        thd->wait_for_wakeup_ready();
8577 8578 8579 8580 8581 8582
      }
    }

    /* Only run commit_ordered() if log_xid was successful. */
    if (cookie)
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
8583
      mysql_mutex_lock(&LOCK_commit_ordered);
8584
      run_commit_ordered(thd, all);
Sergei Golubchik's avatar
Sergei Golubchik committed
8585
      mysql_mutex_unlock(&LOCK_commit_ordered);
8586 8587 8588 8589
    }

    if (need_prepare_ordered)
    {
8590
      /*
        Hand over to the next thread in the (reversed) list; the last thread
        releases the queue and wakes a leader waiting on COND_queue_busy.
      */
      commit_entry *next= entry.next;
8591 8592
      if (next)
      {
8593
        next->thd->signal_wakeup_ready();
8594 8595 8596
      }
      else
      {
Sergei Golubchik's avatar
Sergei Golubchik committed
8597
        mysql_mutex_lock(&LOCK_prepare_ordered);
8598
        commit_ordered_queue_busy= false;
Sergei Golubchik's avatar
Sergei Golubchik committed
8599 8600
        mysql_cond_signal(&COND_queue_busy);
        mysql_mutex_unlock(&LOCK_prepare_ordered);
8601 8602 8603 8604 8605 8606 8607 8608
      }
    }
  }

  return cookie;
}


8609 8610 8611
/********* transaction coordinator log for 2pc - mmap() based solution *******/

/*
8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624
  the log consists of a file, mapped to memory.
  file is divided into pages of tc_log_page_size size.
  (usable size of the first page is smaller because of the log header)
  there is a PAGE control structure for each page
  each page (or rather its PAGE control structure) can be in one of
  the three states - active, syncing, pool.
  there could be only one page in the active or syncing state,
  but many in pool - pool is a fifo queue.
  the usual lifecycle of a page is pool->active->syncing->pool.
  the "active" page is a page where new xid's are logged.
  the page stays active as long as the syncing slot is taken.
  the "syncing" page is being synced to disk. no new xid can be added to it.
  when the syncing is done the page is moved to a pool and an active page
unknown's avatar
unknown committed
8625 8626 8627 8628
  becomes "syncing".

  the result of such an architecture is a natural "commit grouping" -
  If commits are coming faster than the system can sync, they do not
8629 8630
  stall. Instead, all commits that came since the last sync are
  logged to the same "active" page, and they all are synced with the next -
unknown's avatar
unknown committed
8631 8632 8633
  one - sync. Thus, though individual commits are delayed, throughput
  is not decreasing.

8634
  when an xid is added to an active page, the thread of this xid waits
unknown's avatar
unknown committed
8635 8636 8637
  for a page's condition until the page is synced. when syncing slot
  becomes vacant one of these waiters is awakened to take care of syncing.
  it syncs the page and signals all waiters that the page is synced.
8638 8639
  PAGE::waiters is used to count these waiters, and a page may never
  become active again until waiters==0 (that is all waiters from the
8640
  previous sync have noticed that the sync was completed)
8641

unknown's avatar
unknown committed
8642 8643
  note, that the page becomes "dirty" and has to be synced only when a
  new xid is added into it. Removing a xid from a page does not make it
8644
  dirty - we don't sync xid removals to disk.
8645
*/
8646 8647 8648 8649 8650

/* Incremented each time TC_LOG_MMAP::overflow() has to wait for a page. */
ulong tc_log_page_waits= 0;

#ifdef HAVE_MMAP

8651 8652
/*
  Size of the header at the start of the log: the magic bytes plus one byte,
  which TC_LOG_MMAP::open() fills with total_ha_2pc.
*/
#define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1)

Michael Widenius's avatar
Michael Widenius committed
8653
/* Magic bytes that TC_LOG_MMAP::open() copies to the start of the log. */
static const uchar tc_log_magic[]={(uchar) 254, 0x23, 0x05, 0x74};
8654

8655
/* Length given to a newly created log file in TC_LOG_MMAP::open(). */
ulong opt_tc_log_size;
8656
/*
  tc_log_page_size is set from my_getpagesize() in TC_LOG_MMAP::open().
  The two *_pages_used counters are updated in get_active_from_pool() when
  an empty page is taken into use.
*/
ulong tc_log_max_pages_used=0, tc_log_page_size=0, tc_log_cur_pages_used=0;
8657 8658 8659 8660 8661 8662 8663

int TC_LOG_MMAP::open(const char *opt_name)
{
  uint i;
  bool crashed=FALSE;
  PAGE *pg;

8664
  DBUG_ASSERT(total_ha_2pc > 1);
8665 8666
  DBUG_ASSERT(opt_name && opt_name[0]);

unknown's avatar
unknown committed
8667
  tc_log_page_size= my_getpagesize();
8668 8669

  fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME);
Marc Alff's avatar
Marc Alff committed
8670
  if ((fd= mysql_file_open(key_file_tclog, logname, O_RDWR, MYF(0))) < 0)
8671
  {
unknown's avatar
unknown committed
8672 8673
    if (my_errno != ENOENT)
      goto err;
8674 8675
    if (using_heuristic_recover())
      return 1;
Marc Alff's avatar
Marc Alff committed
8676 8677
    if ((fd= mysql_file_create(key_file_tclog, logname, CREATE_MODE,
                               O_RDWR, MYF(MY_WME))) < 0)
8678 8679
      goto err;
    inited=1;
unknown's avatar
unknown committed
8680
    file_length= opt_tc_log_size;
Marc Alff's avatar
Marc Alff committed
8681
    if (mysql_file_chsize(fd, file_length, 0, MYF(MY_WME)))
8682 8683 8684 8685
      goto err;
  }
  else
  {
unknown's avatar
unknown committed
8686 8687
    inited= 1;
    crashed= TRUE;
unknown's avatar
unknown committed
8688
    sql_print_information("Recovering after a crash using %s", opt_name);
8689 8690 8691 8692 8693 8694
    if (tc_heuristic_recover)
    {
      sql_print_error("Cannot perform automatic crash recovery when "
                      "--tc-heuristic-recover is used");
      goto err;
    }
Marc Alff's avatar
Marc Alff committed
8695
    file_length= mysql_file_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE));
8696 8697 8698 8699
    if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size)
      goto err;
  }

8700
  data= (uchar *)my_mmap(0, (size_t)file_length, PROT_READ|PROT_WRITE,
8701 8702 8703 8704 8705 8706 8707 8708
                        MAP_NOSYNC|MAP_SHARED, fd, 0);
  if (data == MAP_FAILED)
  {
    my_errno=errno;
    goto err;
  }
  inited=2;

8709
  npages=(uint)file_length/tc_log_page_size;
8710 8711
  if (npages < 3)             // to guarantee non-empty pool
    goto err;
8712 8713 8714 8715 8716 8717 8718
  if (!(pages=(PAGE *)my_malloc(npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL))))
    goto err;
  inited=3;
  for (pg=pages, i=0; i < npages; i++, pg++)
  {
    pg->next=pg+1;
    pg->waiters=0;
8719
    pg->state=PS_POOL;
Marc Alff's avatar
Marc Alff committed
8720 8721
    mysql_mutex_init(key_PAGE_lock, &pg->lock, MY_MUTEX_INIT_FAST);
    mysql_cond_init(key_PAGE_cond, &pg->cond, 0);
8722
    pg->ptr= pg->start=(my_xid *)(data + i*tc_log_page_size);
8723
    pg->size=pg->free=tc_log_page_size/sizeof(my_xid);
8724
    pg->end=pg->start + pg->size;
8725 8726 8727
  }
  pages[0].size=pages[0].free=
                (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid);
unknown's avatar
unknown committed
8728
  pages[0].start=pages[0].end-pages[0].size;
8729 8730 8731
  pages[npages-1].next=0;
  inited=4;

unknown's avatar
unknown committed
8732
  if (crashed && recover())
8733
      goto err;
unknown's avatar
unknown committed
8734

8735
  memcpy(data, tc_log_magic, sizeof(tc_log_magic));
8736
  data[sizeof(tc_log_magic)]= (uchar)total_ha_2pc;
8737 8738 8739
  my_msync(fd, data, tc_log_page_size, MS_SYNC);
  inited=5;

Marc Alff's avatar
Marc Alff committed
8740 8741 8742
  mysql_mutex_init(key_LOCK_sync, &LOCK_sync, MY_MUTEX_INIT_FAST);
  mysql_mutex_init(key_LOCK_active, &LOCK_active, MY_MUTEX_INIT_FAST);
  mysql_mutex_init(key_LOCK_pool, &LOCK_pool, MY_MUTEX_INIT_FAST);
8743 8744
  mysql_mutex_init(key_LOCK_pending_checkpoint, &LOCK_pending_checkpoint,
                   MY_MUTEX_INIT_FAST);
Marc Alff's avatar
Marc Alff committed
8745 8746
  mysql_cond_init(key_COND_active, &COND_active, 0);
  mysql_cond_init(key_COND_pool, &COND_pool, 0);
8747
  mysql_cond_init(key_TC_LOG_MMAP_COND_queue_busy, &COND_queue_busy, 0);
8748 8749 8750

  inited=6;

unknown's avatar
unknown committed
8751
  syncing= 0;
8752
  active=pages;
8753
  DBUG_ASSERT(npages >= 2);
8754
  pool=pages+1;
8755
  pool_last_ptr= &((pages+npages-1)->next);
8756 8757
  commit_ordered_queue= NULL;
  commit_ordered_queue_busy= false;
8758 8759 8760 8761 8762 8763 8764 8765

  return 0;

err:
  close();
  return 1;
}

unknown's avatar
unknown committed
8766 8767
/**
  There is no active page, so let's get one from the pool.
unknown's avatar
unknown committed
8768

unknown's avatar
unknown committed
8769 8770 8771
  Two strategies here:
    -# take the first from the pool
    -# if there're waiters - take the one with the most free space.
8772

unknown's avatar
unknown committed
8773
  @todo
8774
    page merging. try to allocate adjacent page first,
unknown's avatar
unknown committed
8775
    so that they can be flushed both in one sync
8776
*/
unknown's avatar
unknown committed
8777

8778 8779 8780 8781 8782
/*
  Pick a page from the pool, unlink it and make it the active page.

  The caller is expected to hold LOCK_active (asserted below). LOCK_pool is
  taken and released inside. On return active->lock is held: it is locked at
  the end of this function and not released here.
*/
void TC_LOG_MMAP::get_active_from_pool()
{
  PAGE **p, **best_p=0;
  int best_free;

Sergei Golubchik's avatar
Sergei Golubchik committed
8783
  mysql_mutex_lock(&LOCK_pool);
unknown's avatar
unknown committed
8784

8785 8786 8787
  do
  {
    best_p= p= &pool;
8788 8789
    /*
      'break' leaves the do-while without evaluating its condition, so
      best_free is never read on this path.
    */
    if ((*p)->waiters == 0 && (*p)->free > 0) // can the first page be used ?
      break;                                  // yes - take it.
8790

unknown's avatar
unknown committed
8791
    best_free=0;            // no - trying second strategy
8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802
    /* Among pages without waiters, remember the one with most free slots. */
    for (p=&(*p)->next; *p; p=&(*p)->next)
    {
      if ((*p)->waiters == 0 && (*p)->free > best_free)
      {
        best_free=(*p)->free;
        best_p=p;
      }
    }
  }
  /* Nothing usable: overflow() waits on COND_pool, then the scan repeats. */
  while ((*best_p == 0 || best_free == 0) && overflow());

8803
  mysql_mutex_assert_owner(&LOCK_active);
8804 8805
  active=*best_p;

8806 8807 8808 8809
  /* Unlink the page from the pool. */
  /* If the last page is taken, the tail pointer moves back to this link. */
  if (!(*best_p)->next)
    pool_last_ptr= best_p;
  *best_p=(*best_p)->next;
Sergei Golubchik's avatar
Sergei Golubchik committed
8810
  mysql_mutex_unlock(&LOCK_pool);
unknown's avatar
unknown committed
8811

Sergei Golubchik's avatar
Sergei Golubchik committed
8812
  mysql_mutex_lock(&active->lock);
8813 8814 8815 8816 8817
  if (active->free == active->size) // we've chosen an empty page
  {
    /* A previously unused page comes into use: update the usage counters. */
    tc_log_cur_pages_used++;
    set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used);
  }
8818 8819
}

unknown's avatar
unknown committed
8820 8821 8822 8823
/**
  @todo
  perhaps, increase log size ?
*/
8824 8825 8826 8827 8828 8829 8830 8831
int TC_LOG_MMAP::overflow()
{
  /*
    simple overflow handling - just wait
    TODO perhaps, increase log size ?
    let's check the behaviour of tc_log_page_waits first
  */
  tc_log_page_waits++;
Marc Alff's avatar
Marc Alff committed
8832
  mysql_cond_wait(&COND_pool, &LOCK_pool);
unknown's avatar
unknown committed
8833
  return 1; // always return 1
8834 8835
}

unknown's avatar
unknown committed
8836 8837
/**
  Record that transaction XID is committed on the persistent storage.
8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852

    This function is called in the middle of two-phase commit:
    First all resources prepare the transaction, then tc_log->log() is called,
    then all resources commit the transaction, then tc_log->unlog() is called.

    All access to active page is serialized but it's not a problem, as
    we're assuming that fsync() will be a main bottleneck.
    That is, parallelizing writes to log pages we'll decrease number of
    threads waiting for a page, but then all these threads will be waiting
    for a fsync() anyway

   If tc_log == MYSQL_LOG then tc_log writes transaction to binlog and
   records XID in a special Xid_log_event.
   If tc_log = TC_LOG_MMAP then xid is written in a special memory-mapped
   log.
unknown's avatar
unknown committed
8853

unknown's avatar
unknown committed
8854 8855 8856 8857 8858 8859 8860
  @retval
    0  - error
  @retval
    \# - otherwise, "cookie", a number that will be passed as an argument
    to unlog() call. tc_log can define it any way it wants,
    and use for whatever purposes. TC_LOG_MMAP sets it
    to the position in memory where xid was logged to.
8861 8862
*/

8863
int TC_LOG_MMAP::log_one_transaction(my_xid xid)
8864 8865 8866 8867 8868
{
  int err;
  PAGE *p;
  ulong cookie;

Marc Alff's avatar
Marc Alff committed
8869
  mysql_mutex_lock(&LOCK_active);
8870

unknown's avatar
unknown committed
8871
  /*
8872
    if the active page is full - just wait...
unknown's avatar
unknown committed
8873 8874 8875 8876 8877
    frankly speaking, active->free here accessed outside of mutex
    protection, but it's safe, because it only means we may miss an
    unlog() for the active page, and we're not waiting for it here -
    unlog() does not signal COND_active.
  */
8878
  while (unlikely(active && active->free == 0))
Marc Alff's avatar
Marc Alff committed
8879
    mysql_cond_wait(&COND_active, &LOCK_active);
8880

unknown's avatar
unknown committed
8881
  /* no active page ? take one from the pool */
8882 8883
  if (active == 0)
    get_active_from_pool();
8884
  else
Sergei Golubchik's avatar
Sergei Golubchik committed
8885
    mysql_mutex_lock(&active->lock);
8886 8887

  p=active;
8888 8889 8890 8891 8892 8893 8894

  /*
    p->free is always > 0 here because to decrease it one needs
    to take p->lock and before it one needs to take LOCK_active.
    But checked that active->free > 0 under LOCK_active and
    haven't release it ever since
  */
8895

unknown's avatar
unknown committed
8896
  /* searching for an empty slot */
8897 8898 8899 8900 8901 8902
  while (*p->ptr)
  {
    p->ptr++;
    DBUG_ASSERT(p->ptr < p->end);               // because p->free > 0
  }

unknown's avatar
unknown committed
8903 8904
  /* found! store xid there and mark the page dirty */
  cookie= (ulong)((uchar *)p->ptr - data);      // can never be zero
8905 8906
  *p->ptr++= xid;
  p->free--;
8907
  p->state= PS_DIRTY;
Marc Alff's avatar
Marc Alff committed
8908
  mysql_mutex_unlock(&p->lock);
8909

Sergei Golubchik's avatar
Sergei Golubchik committed
8910
  mysql_mutex_lock(&LOCK_sync);
unknown's avatar
unknown committed
8911
  if (syncing)
8912
  {                                          // somebody's syncing. let's wait
Sergei Golubchik's avatar
Sergei Golubchik committed
8913 8914
    mysql_mutex_unlock(&LOCK_active);
    mysql_mutex_lock(&p->lock);
8915
    p->waiters++;
Sergei Golubchik's avatar
Sergei Golubchik committed
8916
    while (p->state == PS_DIRTY && syncing)
8917
    {
Sergei Golubchik's avatar
Sergei Golubchik committed
8918
      mysql_mutex_unlock(&p->lock);
Marc Alff's avatar
Marc Alff committed
8919
      mysql_cond_wait(&p->cond, &LOCK_sync);
Sergei Golubchik's avatar
Sergei Golubchik committed
8920
      mysql_mutex_lock(&p->lock);
8921
    }
8922
    p->waiters--;
8923 8924
    err= p->state == PS_ERROR;
    if (p->state != PS_DIRTY)                   // page was synced
8925
    {
Marc Alff's avatar
Marc Alff committed
8926
      mysql_mutex_unlock(&LOCK_sync);
8927
      if (p->waiters == 0)
Sergei Golubchik's avatar
Sergei Golubchik committed
8928 8929
        mysql_cond_signal(&COND_pool);     // in case somebody's waiting
      mysql_mutex_unlock(&p->lock);
8930 8931
      goto done;                             // we're done
    }
8932
    DBUG_ASSERT(!syncing);
Sergei Golubchik's avatar
Sergei Golubchik committed
8933
    mysql_mutex_unlock(&p->lock);
8934
    syncing = p;
Sergei Golubchik's avatar
Sergei Golubchik committed
8935
    mysql_mutex_unlock(&LOCK_sync);
8936

Sergei Golubchik's avatar
Sergei Golubchik committed
8937
    mysql_mutex_lock(&LOCK_active);
8938
    active=0;                                  // page is not active anymore
Sergei Golubchik's avatar
Sergei Golubchik committed
8939 8940
    mysql_cond_broadcast(&COND_active);
    mysql_mutex_unlock(&LOCK_active);
8941 8942 8943 8944
  }
  else
  {
    syncing = p;                               // place is vacant - take it
Sergei Golubchik's avatar
Sergei Golubchik committed
8945
    mysql_mutex_unlock(&LOCK_sync);
8946
    active = 0;                                // page is not active anymore
Sergei Golubchik's avatar
Sergei Golubchik committed
8947 8948
    mysql_cond_broadcast(&COND_active);
    mysql_mutex_unlock(&LOCK_active);
8949
  }
8950 8951 8952 8953 8954 8955 8956 8957 8958 8959
  err= sync();

done:
  return err ? 0 : cookie;
}

int TC_LOG_MMAP::sync()
{
  int err;

unknown's avatar
unknown committed
8960
  DBUG_ASSERT(syncing != active);
8961 8962 8963 8964 8965

  /*
    sit down and relax - this can take a while...
    note - no locks are held at this point
  */
8966
  err= my_msync(fd, syncing->start, syncing->size * sizeof(my_xid), MS_SYNC);
8967

unknown's avatar
unknown committed
8968
  /* page is synced. let's move it to the pool */
Marc Alff's avatar
Marc Alff committed
8969
  mysql_mutex_lock(&LOCK_pool);
8970 8971
  (*pool_last_ptr)=syncing;
  pool_last_ptr=&(syncing->next);
unknown's avatar
unknown committed
8972
  syncing->next=0;
8973
  syncing->state= err ? PS_ERROR : PS_POOL;
Sergei Golubchik's avatar
Sergei Golubchik committed
8974
  mysql_cond_signal(&COND_pool);           // in case somebody's waiting
Marc Alff's avatar
Marc Alff committed
8975
  mysql_mutex_unlock(&LOCK_pool);
8976

unknown's avatar
unknown committed
8977
  /* marking 'syncing' slot free */
Marc Alff's avatar
Marc Alff committed
8978
  mysql_mutex_lock(&LOCK_sync);
Sergei Golubchik's avatar
Sergei Golubchik committed
8979
  mysql_cond_broadcast(&syncing->cond);    // signal "sync done"
unknown's avatar
unknown committed
8980
  syncing=0;
8981 8982 8983 8984 8985
  /*
    we check the "active" pointer without LOCK_active. Still, it's safe -
    "active" can change from NULL to not NULL any time, but it
    will take LOCK_sync before waiting on active->cond. That is, it can never
    miss a signal.
8986 8987
    And "active" can change to NULL only by the syncing thread
    (the thread that will send a signal below)
8988 8989
  */
  if (active)
Sergei Golubchik's avatar
Sergei Golubchik committed
8990
    mysql_cond_signal(&active->cond);      // wake up a new syncer
Marc Alff's avatar
Marc Alff committed
8991
  mysql_mutex_unlock(&LOCK_sync);
8992 8993 8994
  return err;
}

8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005
static void
mmap_do_checkpoint_callback(void *data)
{
  TC_LOG_MMAP::pending_cookies *pending=
    static_cast<TC_LOG_MMAP::pending_cookies *>(data);
  ++pending->pending_count;
}

/**
  Queue a cookie for deletion once the engines have checkpointed.

  @param cookie  value previously returned when the xid was logged
  @param xid     the xid stored at that cookie (checked in debug builds)

  @retval 0  cookie queued (and a full batch possibly handed off)
  @retval 1  out of memory while allocating the batch buffer
*/
int TC_LOG_MMAP::unlog(ulong cookie, my_xid xid)
{
  const uint32 batch_capacity= tc_log_page_size / sizeof(my_xid);
  pending_cookies *ready_batch= NULL;

  DBUG_ASSERT(*(my_xid *)(data+cookie) == xid);

  /*
    The entry is not removed right away: some participating storage engines
    implement commit_checkpoint_request() and may not have made the commit
    durable yet.

    Cookies are therefore collected in a batch; when the batch is full a
    checkpoint is requested from all engines and the whole batch is deleted
    in one go.
  */
  mysql_mutex_lock(&LOCK_pending_checkpoint);
  if (!pending_checkpoint)
  {
    uint32 alloc_bytes= sizeof(*pending_checkpoint) +
                        sizeof(ulong) * (batch_capacity - 1);
    pending_checkpoint=
      (pending_cookies *)my_malloc(alloc_bytes, MYF(MY_ZEROFILL));
    if (!pending_checkpoint)
    {
      my_error(ER_OUTOFMEMORY, MYF(0), alloc_bytes);
      mysql_mutex_unlock(&LOCK_pending_checkpoint);
      return 1;
    }
  }

  pending_checkpoint->cookies[pending_checkpoint->count]= cookie;
  pending_checkpoint->count++;
  if (pending_checkpoint->count == batch_capacity)
  {
    /* Batch is full: detach it so a fresh one is started next time. */
    ready_batch= pending_checkpoint;
    pending_checkpoint= NULL;
  }
  mysql_mutex_unlock(&LOCK_pending_checkpoint);

  if (!ready_batch)
    return 0;

  /*
    One extra increment paired with one explicit notify - this makes
    things work even when no engine at all supports
    commit_checkpoint_request.
  */
  ++ready_batch->pending_count;
  ha_commit_checkpoint_request(ready_batch, mmap_do_checkpoint_callback);
  commit_checkpoint_notify(ready_batch);
  return 0;
}


void
TC_LOG_MMAP::commit_checkpoint_notify(void *cookie)
{
  uint count;
  pending_cookies *pending= static_cast<pending_cookies *>(cookie);
  mysql_mutex_lock(&LOCK_pending_checkpoint);
  DBUG_ASSERT(pending->pending_count > 0);
  count= --pending->pending_count;
  mysql_mutex_unlock(&LOCK_pending_checkpoint);
  if (count == 0)
  {
    uint i;
9065
    for (i= 0; i < tc_log_page_size / sizeof(my_xid); ++i)
9066 9067 9068 9069 9070 9071
      delete_entry(pending->cookies[i]);
    my_free(pending);
  }
}


unknown's avatar
unknown committed
9072
/**
unknown's avatar
unknown committed
9073
  erase xid from the page, update page free space counters/pointers.
unknown's avatar
unknown committed
9074
  cookie points directly to the memory where xid was logged.
unknown's avatar
unknown committed
9075
*/
9076

9077
int TC_LOG_MMAP::delete_entry(ulong cookie)
9078 9079 9080 9081 9082 9083
{
  PAGE *p=pages+(cookie/tc_log_page_size);
  my_xid *x=(my_xid *)(data+cookie);

  DBUG_ASSERT(x >= p->start && x < p->end);

Marc Alff's avatar
Marc Alff committed
9084
  mysql_mutex_lock(&p->lock);
9085
  *x=0;
9086 9087 9088
  p->free++;
  DBUG_ASSERT(p->free <= p->size);
  set_if_smaller(p->ptr, x);
9089
  if (p->free == p->size)              // the page is completely empty
9090 9091
    statistic_decrement(tc_log_cur_pages_used, &LOCK_status);
  if (p->waiters == 0)                 // the page is in pool and ready to rock
Marc Alff's avatar
Marc Alff committed
9092 9093
    mysql_cond_signal(&COND_pool);     // ping ... for overflow()
  mysql_mutex_unlock(&p->lock);
9094
  return 0;
9095 9096 9097 9098
}

void TC_LOG_MMAP::close()
{
unknown's avatar
unknown committed
9099
  uint i;
9100 9101
  switch (inited) {
  case 6:
Marc Alff's avatar
Marc Alff committed
9102 9103 9104
    mysql_mutex_destroy(&LOCK_sync);
    mysql_mutex_destroy(&LOCK_active);
    mysql_mutex_destroy(&LOCK_pool);
9105
    mysql_mutex_destroy(&LOCK_pending_checkpoint);
Marc Alff's avatar
Marc Alff committed
9106
    mysql_cond_destroy(&COND_pool);
Sergei Golubchik's avatar
Sergei Golubchik committed
9107 9108
    mysql_cond_destroy(&COND_active);
    mysql_cond_destroy(&COND_queue_busy);
9109
  case 5:
Marc Alff's avatar
Marc Alff committed
9110
    data[0]='A'; // garble the first (signature) byte, in case mysql_file_delete fails
9111
  case 4:
unknown's avatar
unknown committed
9112
    for (i=0; i < npages; i++)
9113 9114 9115
    {
      if (pages[i].ptr == 0)
        break;
Marc Alff's avatar
Marc Alff committed
9116 9117
      mysql_mutex_destroy(&pages[i].lock);
      mysql_cond_destroy(&pages[i].cond);
9118 9119
    }
  case 3:
9120
    my_free(pages);
9121
  case 2:
9122
    my_munmap((char*)data, (size_t)file_length);
9123
  case 1:
Marc Alff's avatar
Marc Alff committed
9124
    mysql_file_close(fd, MYF(0));
9125 9126
  }
  if (inited>=5) // cannot do in the switch because of Windows
Marc Alff's avatar
Marc Alff committed
9127
    mysql_file_delete(key_file_tclog, logname, MYF(MY_WME));
9128 9129
  if (pending_checkpoint)
    my_free(pending_checkpoint);
9130 9131 9132
  inited=0;
}

9133

9134 9135 9136 9137 9138
int TC_LOG_MMAP::recover()
{
  HASH xids;
  PAGE *p=pages, *end_p=pages+npages;

Michael Widenius's avatar
Michael Widenius committed
9139
  if (bcmp(data, tc_log_magic, sizeof(tc_log_magic)))
9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150
  {
    sql_print_error("Bad magic header in tc log");
    goto err1;
  }

  /*
    the first byte after magic signature is set to current
    number of storage engines on startup
  */
  if (data[sizeof(tc_log_magic)] != total_ha_2pc)
  {
9151
    sql_print_error("Recovery failed! You must enable "
unknown's avatar
unknown committed
9152 9153 9154
                    "exactly %d storage engines that support "
                    "two-phase commit protocol",
                    data[sizeof(tc_log_magic)]);
9155 9156 9157
    goto err1;
  }

Konstantin Osipov's avatar
Konstantin Osipov committed
9158 9159
  if (my_hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0,
                   sizeof(my_xid), 0, 0, MYF(0)))
9160 9161 9162 9163 9164
    goto err1;

  for ( ; p < end_p ; p++)
  {
    for (my_xid *x=p->start; x < p->end; x++)
9165
      if (*x && my_hash_insert(&xids, (uchar *)x))
9166 9167 9168 9169 9170 9171
        goto err2; // OOM
  }

  if (ha_recover(&xids))
    goto err2;

Konstantin Osipov's avatar
Konstantin Osipov committed
9172
  my_hash_free(&xids);
9173
  bzero(data, (size_t)file_length);
9174 9175 9176
  return 0;

err2:
Konstantin Osipov's avatar
Konstantin Osipov committed
9177
  my_hash_free(&xids);
9178 9179 9180 9181 9182 9183 9184
err1:
  sql_print_error("Crash recovery failed. Either correct the problem "
                  "(if it's, for example, out of memory error) and restart, "
                  "or delete tc log and start mysqld with "
                  "--tc-heuristic-recover={commit|rollback}");
  return 1;
}
unknown's avatar
unknown committed
9185 9186 9187 9188 9189
#endif

/*
  Pointer to a transaction coordinator log.
  NOTE(review): presumably set at server startup to whichever coordinator
  implementation is in use - confirm against the server init code.
*/
TC_LOG *tc_log;
/* Global instances of the dummy and the memory-mapped coordinator logs. */
TC_LOG_DUMMY tc_log_dummy;
TC_LOG_MMAP  tc_log_mmap;
9190

unknown's avatar
unknown committed
9191 9192
/**
  Perform heuristic recovery, if --tc-heuristic-recover was used.
9193

unknown's avatar
unknown committed
9194
  @note
9195 9196
    no matter whether heuristic recovery was successful or not
    mysqld must exit. So, return value is the same in both cases.
unknown's avatar
unknown committed
9197 9198 9199 9200 9201

  @retval
    0	no heuristic recovery was requested
  @retval
    1   heuristic recovery was performed
9202 9203
*/

9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216
int TC_LOG::using_heuristic_recover()
{
  if (!tc_heuristic_recover)
    return 0;

  sql_print_information("Heuristic crash recovery mode");
  if (ha_recover(0))
    sql_print_error("Heuristic crash recovery failed");
  sql_print_information("Please restart mysqld without --tc-heuristic-recover");
  return 1;
}

/****** transaction coordinator log for 2pc - binlog() based solution ******/
9217
#define TC_LOG_BINLOG MYSQL_BIN_LOG
9218

unknown's avatar
unknown committed
9219
int TC_LOG_BINLOG::open(const char *opt_name)
9220
{
9221
  int      error= 1;
9222 9223

  DBUG_ASSERT(total_ha_2pc > 1);
9224
  DBUG_ASSERT(opt_name && opt_name[0]);
9225

9226 9227 9228 9229 9230 9231 9232
  if (!my_b_inited(&index_file))
  {
    /* There was a failure to open the index file, can't open the binlog */
    cleanup();
    return 1;
  }

9233
  if (using_heuristic_recover())
9234 9235
  {
    /* generate a new binlog to mask a corrupted one */
9236
    open(opt_name, LOG_BIN, 0, 0, WRITE_CACHE, max_binlog_size, 0, TRUE);
9237
    cleanup();
9238
    return 1;
9239
  }
9240

9241 9242
  error= do_binlog_recovery(opt_name, true);
  binlog_state_recover_done= true;
9243
  return error;
9244 9245
}

unknown's avatar
unknown committed
9246
/** This is called on shutdown, after ha_panic. */
unknown's avatar
unknown committed
9247
void TC_LOG_BINLOG::close()
{
  /* Intentionally empty: this implementation does nothing. */
}

9251 9252 9253
/*
  Do a binlog log_xid() for a group of transactions, linked through
  thd->next_commit_ordered.
unknown's avatar
unknown committed
9254
*/
9255
int
9256 9257 9258
TC_LOG_BINLOG::log_and_order(THD *thd, my_xid xid, bool all,
                             bool need_prepare_ordered __attribute__((unused)),
                             bool need_commit_ordered __attribute__((unused)))
9259
{
9260 9261 9262
  int err;
  DBUG_ENTER("TC_LOG_BINLOG::log_and_order");

9263 9264
  binlog_cache_mngr *cache_mngr= thd->binlog_setup_trx_data();
  if (!cache_mngr)
9265 9266
  {
    WSREP_DEBUG("Skipping empty log_xid: %s", thd->query());
9267
    DBUG_RETURN(0);
9268
  }
9269

Sergei Golubchik's avatar
Sergei Golubchik committed
9270 9271 9272
  cache_mngr->using_xa= TRUE;
  cache_mngr->xa_xid= xid;
  err= binlog_commit_flush_xid_caches(thd, cache_mngr, all, xid);
9273

9274 9275
  DEBUG_SYNC(thd, "binlog_after_log_and_order");

9276 9277 9278 9279 9280 9281
  if (err)
    DBUG_RETURN(0);
  /*
    If using explicit user XA, we will not have XID. We must still return a
    non-zero cookie (as zero cookie signals error).
  */
9282 9283 9284 9285 9286
  if (!xid || !cache_mngr->need_unlog)
    DBUG_RETURN(BINLOG_COOKIE_DUMMY(cache_mngr->delayed_error));
  else
    DBUG_RETURN(BINLOG_COOKIE_MAKE(cache_mngr->binlog_id,
                                   cache_mngr->delayed_error));
9287 9288
}

9289 9290 9291 9292 9293 9294 9295 9296 9297 9298
/*
  After an XID is logged, we need to hold on to the current binlog file until
  it is fully committed in the storage engine. The reason is that crash
  recovery only looks at the latest binlog, so we must make sure there are no
  outstanding prepared (but not committed) transactions before rotating the
  binlog.

  To handle this, we keep a count of outstanding XIDs. This function is used
  to increase this count when committing one or more transactions to the
  binary log.
unknown's avatar
unknown committed
9299
*/
9300
void
9301
TC_LOG_BINLOG::mark_xids_active(ulong binlog_id, uint xid_count)
9302
{
9303 9304
  xid_count_per_binlog *b;

9305
  DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
9306
  DBUG_PRINT("info", ("binlog_id=%lu xid_count=%u", binlog_id, xid_count));
9307 9308 9309 9310 9311

  mysql_mutex_lock(&LOCK_xid_list);
  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
  while ((b= it++))
  {
9312
    if (b->binlog_id == binlog_id)
9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323
    {
      b->xid_count += xid_count;
      break;
    }
  }
  /*
    As we do not delete elements until count reach zero, elements should always
    be found.
  */
  DBUG_ASSERT(b);
  mysql_mutex_unlock(&LOCK_xid_list);
9324
  DBUG_VOID_RETURN;
9325 9326
}

9327
/*
9328 9329
  Once an XID is committed, it can no longer be needed during crash recovery,
  as it has been durably recorded on disk as "committed".
9330 9331

  This function is called to mark an XID this way. It needs to decrease the
9332 9333 9334 9335
  count of pending XIDs in the corresponding binlog. When the count reaches
  zero (for an "old" binlog that is not the active one), that binlog file no
  longer need to be scanned during crash recovery, so we can log a new binlog
  checkpoint.
9336 9337
*/
void
9338
TC_LOG_BINLOG::mark_xid_done(ulong binlog_id, bool write_checkpoint)
9339
{
9340 9341 9342
  xid_count_per_binlog *b;
  bool first;
  ulong current;
unknown's avatar
unknown committed
9343

9344
  DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
9345 9346 9347 9348 9349 9350 9351

  mysql_mutex_lock(&LOCK_xid_list);
  current= current_binlog_id;
  I_List_iterator<xid_count_per_binlog> it(binlog_xid_count_list);
  first= true;
  while ((b= it++))
  {
9352
    if (b->binlog_id == binlog_id)
9353 9354 9355 9356 9357 9358 9359 9360
    {
      --b->xid_count;
      break;
    }
    first= false;
  }
  /* Binlog is always found, as we do not remove until count reaches 0 */
  DBUG_ASSERT(b);
9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376
  /*
    If a RESET MASTER is pending, we are about to remove all log files, and
    the RESET MASTER thread is waiting for all pending unlog() calls to
    complete while holding LOCK_log. In this case we should not log a binlog
    checkpoint event (it would be deleted immediately anyway and we would
    deadlock on LOCK_log) but just signal the thread.
  */
  if (unlikely(reset_master_pending))
  {
    mysql_cond_signal(&COND_xid_list);
    mysql_mutex_unlock(&LOCK_xid_list);
    DBUG_VOID_RETURN;
  }

  if (likely(binlog_id == current) || b->xid_count != 0 || !first ||
      !write_checkpoint)
9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401
  {
    /* No new binlog checkpoint reached yet. */
    mysql_mutex_unlock(&LOCK_xid_list);
    DBUG_VOID_RETURN;
  }

  /*
    Now log a binlog checkpoint for the first binlog file with a non-zero count.

    Note that it is possible (though perhaps unlikely) that when count of
    binlog (N-2) drops to zero, binlog (N-1) is already at zero. So we may
    need to skip several entries before we find the one to log in the binlog
    checkpoint event.

    We chain the locking of LOCK_xid_list and LOCK_log, so that we ensure that
    Binlog_checkpoint_events are logged in order. This simplifies recovery a
    bit, as it can just take the last binlog checkpoint in the log, rather
    than compare all found against each other to find the one pointing to the
    most recent binlog.

    Note also that we need to first release LOCK_xid_list, then aquire
    LOCK_log, then re-aquire LOCK_xid_list. If we were to take LOCK_log while
    holding LOCK_xid_list, we might deadlock with other threads that take the
    locks in the opposite order.
  */
9402

9403
  ++mark_xid_done_waiting;
9404 9405 9406
  mysql_mutex_unlock(&LOCK_xid_list);
  mysql_mutex_lock(&LOCK_log);
  mysql_mutex_lock(&LOCK_xid_list);
9407 9408 9409
  --mark_xid_done_waiting;
  if (unlikely(reset_master_pending))
    mysql_cond_signal(&COND_xid_list);
9410 9411 9412
  /* We need to reload current_binlog_id due to release/re-take of lock. */
  current= current_binlog_id;

9413 9414 9415 9416 9417
  for (;;)
  {
    /* Remove initial element(s) with zero count. */
    b= binlog_xid_count_list.head();
    /*
9418 9419
      We must not remove all elements in the list - the entry for the current
      binlog must be present always.
9420
    */
9421 9422
    DBUG_ASSERT(b);
    if (b->binlog_id == current || b->xid_count > 0)
9423 9424
      break;
    my_free(binlog_xid_count_list.get());
9425
  }
9426 9427 9428 9429 9430

  mysql_mutex_unlock(&LOCK_xid_list);
  write_binlog_checkpoint_event_already_locked(b->binlog_name,
                                               b->binlog_name_len);
  mysql_mutex_unlock(&LOCK_log);
9431 9432 9433
  DBUG_VOID_RETURN;
}

9434
/*
  Mark the XID behind 'cookie' as done.

  Returns 0 when xid is zero; otherwise returns the error flag carried in
  the cookie, after calling mark_xid_done() for non-dummy cookies.
*/
int TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
{
  DBUG_ENTER("TC_LOG_BINLOG::unlog");
  if (xid)
  {
    if (!BINLOG_COOKIE_IS_DUMMY(cookie))
      mark_xid_done(BINLOG_COOKIE_GET_ID(cookie), true);
    /*
      See comment in trx_group_commit_leader() - a failure from rotate()
      has its error code delayed until this point.
    */
    DBUG_RETURN(BINLOG_COOKIE_GET_ERROR_FLAG(cookie));
  }
  DBUG_RETURN(0);
}

void
TC_LOG_BINLOG::commit_checkpoint_notify(void *cookie)
{
9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478
  xid_count_per_binlog *entry= static_cast<xid_count_per_binlog *>(cookie);
  mysql_mutex_lock(&LOCK_binlog_background_thread);
  entry->next_in_queue= binlog_background_thread_queue;
  binlog_background_thread_queue= entry;
  mysql_cond_signal(&COND_binlog_background_thread);
  mysql_mutex_unlock(&LOCK_binlog_background_thread);
}

/*
  Binlog background thread.

  This thread is used to log binlog checkpoints in the background, rather than
  in the context of random storage engine threads that happen to call
  commit_checkpoint_notify_ha() and may not like the delays while syncing
  binlog to disk or may not be setup with all my_thread_init() and other
  necessary stuff.

  In the future, this thread could also be used to do log rotation in the
  background, which could eliminate all stalls around binlog rotations.

  The thread repeatedly grabs the whole queue filled by
  commit_checkpoint_notify() and calls mark_xid_done() for each element,
  until it is asked to stop and the xid list is idle.
*/
pthread_handler_t
binlog_background_thread(void *arg __attribute__((unused)))
{
  bool stop;
  MYSQL_BIN_LOG::xid_count_per_binlog *queue, *next;
  THD *thd;
  my_thread_init();
  DBUG_ENTER("binlog_background_thread");

  thd= new THD;
  thd->system_thread= SYSTEM_THREAD_BINLOG_BACKGROUND;
  thd->thread_stack= (char*) &thd;           /* Set approximate stack start */
  mysql_mutex_lock(&LOCK_thread_count);
  thd->thread_id= thread_id++;
  mysql_mutex_unlock(&LOCK_thread_count);
  thd->store_globals();
  thd->security_ctx->skip_grants();
  thd->set_command(COM_DAEMON);

  /*
    Load the slave replication GTID state from the mysql.gtid_slave_pos
    table.

    This is mostly so that we can start our seq_no counter from the highest
    seq_no seen by a slave. This way, we have a way to tell if a transaction
    logged by ourselves as master is newer or older than a replicated
    transaction.
  */
#ifdef HAVE_REPLICATION
  if (rpl_load_gtid_slave_state(thd))
    sql_print_warning("Failed to load slave replication state from table "
                      "%s.%s: %u: %s", "mysql",
                      rpl_gtid_slave_state_table_name.str,
                      thd->get_stmt_da()->sql_errno(),
                      thd->get_stmt_da()->message());
#endif

  /* Tell start_binlog_background_thread() that startup is complete. */
  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
  binlog_background_thread_started= true;
  mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

  for (;;)
  {
    /*
      Wait until there is something in the queue to process, or we are asked
      to shut down.
    */
    THD_STAGE_INFO(thd, stage_binlog_waiting_background_tasks);
    mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
    for (;;)
    {
      stop= binlog_background_thread_stop;
      queue= binlog_background_thread_queue;
      if (stop && !mysql_bin_log.is_xidlist_idle())
      {
        /*
          Delay stop until all pending binlog checkpoints have been processed.
        */
        stop= false;
      }
      if (stop || queue)
        break;
      mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread,
                      &mysql_bin_log.LOCK_binlog_background_thread);
    }
    /* Grab the queue, if any. */
    binlog_background_thread_queue= NULL;
    mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

    /* Process any incoming commit_checkpoint_notify() calls. */
    DBUG_EXECUTE_IF("inject_binlog_background_thread_before_mark_xid_done",
      DBUG_ASSERT(!debug_sync_set_action(
        thd,
        STRING_WITH_LEN("binlog_background_thread_before_mark_xid_done "
                        "SIGNAL injected_binlog_background_thread "
                        "WAIT_FOR something_that_will_never_happen "
                        "TIMEOUT 2")));
      );
    while (queue)
    {
      THD_STAGE_INFO(thd, stage_binlog_processing_checkpoint_notify);
      DEBUG_SYNC(thd, "binlog_background_thread_before_mark_xid_done");
      /* Grab next pointer first, as mark_xid_done() may free the element. */
      next= queue->next_in_queue;
      mysql_bin_log.mark_xid_done(queue->binlog_id, true);
      queue= next;

      DBUG_EXECUTE_IF("binlog_background_checkpoint_processed",
        DBUG_ASSERT(!debug_sync_set_action(
          thd,
          STRING_WITH_LEN("now SIGNAL binlog_background_checkpoint_processed")));
        );
    }

    if (stop)
      break;
  }

  THD_STAGE_INFO(thd, stage_binlog_stopping_background_thread);

  mysql_mutex_lock(&LOCK_thread_count);
  delete thd;
  mysql_mutex_unlock(&LOCK_thread_count);

  my_thread_end();

  /* Signal that we are (almost) stopped. */
  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
  binlog_background_thread_stop= false;
  mysql_cond_signal(&mysql_bin_log.COND_binlog_background_thread_end);
  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

  DBUG_RETURN(0);
}

#ifdef HAVE_PSI_INTERFACE
/*
  Instrumentation key for the binlog background thread; it is passed to
  mysql_thread_create() in start_binlog_background_thread().
*/
static PSI_thread_key key_thread_binlog;

/*
  Thread instrumentation table registered with PSI_server->register_thread()
  under the "sql" category.
*/
static PSI_thread_info all_binlog_threads[]=
{
  { &key_thread_binlog, "binlog_background", PSI_FLAG_GLOBAL},
};
#endif /* HAVE_PSI_INTERFACE */

static bool
start_binlog_background_thread()
{
  pthread_t th;

#ifdef HAVE_PSI_INTERFACE
  if (PSI_server)
    PSI_server->register_thread("sql", all_binlog_threads,
                                array_elements(all_binlog_threads));
#endif

unknown's avatar
unknown committed
9608
  if (mysql_thread_create(key_thread_binlog, &th, &connection_attrib,
9609 9610 9611
                          binlog_background_thread, NULL))
    return 1;

unknown's avatar
unknown committed
9612 9613 9614 9615 9616 9617 9618 9619 9620 9621
  /*
    Wait for the thread to have started (so we know that the slave replication
    state is loaded and we have correct global_gtid_counter).
  */
  mysql_mutex_lock(&mysql_bin_log.LOCK_binlog_background_thread);
  while (!binlog_background_thread_started)
    mysql_cond_wait(&mysql_bin_log.COND_binlog_background_thread_end,
                    &mysql_bin_log.LOCK_binlog_background_thread);
  mysql_mutex_unlock(&mysql_bin_log.LOCK_binlog_background_thread);

9622
  return 0;
9623 9624
}

9625

9626 9627
int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
                           IO_CACHE *first_log,
9628
                           Format_description_log_event *fdle, bool do_xa)
9629
{
9630
  Log_event *ev= NULL;
9631 9632
  HASH xids;
  MEM_ROOT mem_root;
9633 9634 9635 9636 9637 9638
  char binlog_checkpoint_name[FN_REFLEN];
  bool binlog_checkpoint_found;
  bool first_round;
  IO_CACHE log;
  File file= -1;
  const char *errmsg;
9639 9640 9641 9642 9643
#ifdef HAVE_REPLICATION
  rpl_gtid last_gtid;
  bool last_gtid_standalone= false;
  bool last_gtid_valid= false;
#endif
9644 9645

  if (! fdle->is_valid() ||
9646 9647
      (do_xa && my_hash_init(&xids, &my_charset_bin, TC_LOG_PAGE_SIZE/3, 0,
                             sizeof(my_xid), 0, 0, MYF(0))))
9648 9649
    goto err1;

9650 9651
  if (do_xa)
    init_alloc_root(&mem_root, TC_LOG_PAGE_SIZE, TC_LOG_PAGE_SIZE, MYF(0));
9652

9653 9654
  fdle->flags&= ~LOG_EVENT_BINLOG_IN_USE_F; // abort on the first error

9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665
  /*
    Scan the binlog for XIDs that need to be committed if still in the
    prepared stage.

    Start with the latest binlog file, then continue with any other binlog
    files if the last found binlog checkpoint indicates it is needed.
  */

  binlog_checkpoint_found= false;
  first_round= true;
  for (;;)
9666
  {
9667 9668 9669 9670
    while ((ev= Log_event::read_log_event(first_round ? first_log : &log,
                                          0, fdle, opt_master_verify_checksum))
           && ev->is_valid())
    {
9671 9672
      enum Log_event_type typ= ev->get_type_code();
      switch (typ)
9673 9674 9675
      {
      case XID_EVENT:
      {
9676
        if (do_xa)
9677
        {
9678 9679 9680 9681 9682 9683
          Xid_log_event *xev=(Xid_log_event *)ev;
          uchar *x= (uchar *) memdup_root(&mem_root, (uchar*) &xev->xid,
                                          sizeof(xev->xid));
          if (!x || my_hash_insert(&xids, x))
            goto err2;
          break;
9684 9685 9686
        }
      }
      case BINLOG_CHECKPOINT_EVENT:
9687
        if (first_round && do_xa)
9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705
        {
          uint dir_len;
          Binlog_checkpoint_log_event *cev= (Binlog_checkpoint_log_event *)ev;
          if (cev->binlog_file_len >= FN_REFLEN)
            sql_print_warning("Incorrect binlog checkpoint event with too "
                              "long file name found.");
          else
          {
            /*
              Note that we cannot use make_log_name() here, as we have not yet
              initialised MYSQL_BIN_LOG::log_file_name.
            */
            dir_len= dirname_length(last_log_name);
            strmake(strnmov(binlog_checkpoint_name, last_log_name, dir_len),
                    cev->binlog_file_name, FN_REFLEN - 1 - dir_len);
            binlog_checkpoint_found= true;
          }
        }
9706
        break;
9707 9708 9709 9710 9711 9712
      case GTID_LIST_EVENT:
        if (first_round)
        {
          Gtid_list_log_event *glev= (Gtid_list_log_event *)ev;

          /* Initialise the binlog state from the Gtid_list event. */
unknown's avatar
unknown committed
9713 9714
          if (rpl_global_gtid_binlog_state.load(glev->list, glev->count))
            goto err2;
9715 9716 9717
        }
        break;

9718
#ifdef HAVE_REPLICATION
9719 9720 9721 9722 9723 9724
      case GTID_EVENT:
        if (first_round)
        {
          Gtid_log_event *gev= (Gtid_log_event *)ev;

          /* Update the binlog state with any GTID logged after Gtid_list. */
9725 9726 9727 9728 9729 9730
          last_gtid.domain_id= gev->domain_id;
          last_gtid.server_id= gev->server_id;
          last_gtid.seq_no= gev->seq_no;
          last_gtid_standalone=
            ((gev->flags2 & Gtid_log_event::FL_STANDALONE) ? true : false);
          last_gtid_valid= true;
9731 9732
        }
        break;
9733
#endif
9734

9735 9736 9737 9738
      default:
        /* Nothing. */
        break;
      }
9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754

#ifdef HAVE_REPLICATION
      if (last_gtid_valid &&
          ((last_gtid_standalone && !ev->is_part_of_group(typ)) ||
           (!last_gtid_standalone &&
            (typ == XID_EVENT ||
             (typ == QUERY_EVENT &&
              (((Query_log_event *)ev)->is_commit() ||
               ((Query_log_event *)ev)->is_rollback()))))))
      {
        if (rpl_global_gtid_binlog_state.update_nolock(&last_gtid, false))
          goto err2;
        last_gtid_valid= false;
      }
#endif

9755
      delete ev;
9756
      ev= NULL;
9757 9758
    }

9759 9760
    if (!do_xa)
      break;
9761 9762 9763 9764 9765 9766 9767 9768 9769
    /*
      If the last binlog checkpoint event points to an older log, we have to
      scan all logs from there also, to get all possible XIDs to recover.

      If there was no binlog checkpoint event at all, this means the log was
      written by an older version of MariaDB (or MySQL) - these always have an
      (implicit) binlog checkpoint event at the start of the last binlog file.
    */
    if (first_round)
9770
    {
9771 9772 9773
      if (!binlog_checkpoint_found)
        break;
      first_round= false;
9774 9775 9776 9777 9778
      DBUG_EXECUTE_IF("xa_recover_expect_master_bin_000004",
          if (0 != strcmp("./master-bin.000004", binlog_checkpoint_name) &&
              0 != strcmp(".\\master-bin.000004", binlog_checkpoint_name))
            DBUG_SUICIDE();
        );
9779 9780 9781 9782
      if (find_log_pos(linfo, binlog_checkpoint_name, 1))
      {
        sql_print_error("Binlog file '%s' not found in binlog index, needed "
                        "for recovery. Aborting.", binlog_checkpoint_name);
9783
        goto err2;
9784 9785 9786 9787 9788 9789 9790 9791 9792
      }
    }
    else
    {
      end_io_cache(&log);
      mysql_file_close(file, MYF(MY_WME));
      file= -1;
    }

9793
    if (!strcmp(linfo->log_file_name, last_log_name))
9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809
      break;                                    // No more files to do
    if ((file= open_binlog(&log, linfo->log_file_name, &errmsg)) < 0)
    {
      sql_print_error("%s", errmsg);
      goto err2;
    }
    /*
      We do not need to read the Format_description_log_event of other binlog
      files. It is not possible for a binlog checkpoint to span multiple
      binlog files written by different versions of the server. So we can use
      the first one read for reading from all binlog files.
    */
    if (find_next_log(linfo, 1))
    {
      sql_print_error("Error reading binlog files during recovery. Aborting.");
      goto err2;
9810 9811 9812
    }
  }

9813 9814 9815 9816
  if (do_xa)
  {
    if (ha_recover(&xids))
      goto err2;
9817

9818 9819 9820
    free_root(&mem_root, MYF(0));
    my_hash_free(&xids);
  }
9821 9822 9823
  return 0;

err2:
9824
  delete ev;
9825 9826 9827 9828 9829
  if (file >= 0)
  {
    end_io_cache(&log);
    mysql_file_close(file, MYF(MY_WME));
  }
9830 9831 9832 9833 9834
  if (do_xa)
  {
    free_root(&mem_root, MYF(0));
    my_hash_free(&xids);
  }
9835 9836 9837 9838 9839 9840 9841 9842
err1:
  sql_print_error("Crash recovery failed. Either correct the problem "
                  "(if it's, for example, out of memory error) and restart, "
                  "or delete (or rename) binary log and start mysqld with "
                  "--tc-heuristic-recover={commit|rollback}");
  return 1;
}

9843

9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857
int
MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
{
  LOG_INFO log_info;
  const char *errmsg;
  IO_CACHE    log;
  File        file;
  Log_event  *ev= 0;
  Format_description_log_event fdle(BINLOG_VERSION);
  char        log_name[FN_REFLEN];
  int error;

  if ((error= find_log_pos(&log_info, NullS, 1)))
  {
9858 9859 9860 9861 9862 9863
    /*
      If there are no binlog files (LOG_INFO_EOF), then we still try to read
      the .state file to restore the binlog state. This allows to copy a server
      to provision a new one without copying the binlog files (except the
      master-bin.state file) and still preserve the correct binlog state.
    */
9864 9865 9866
    if (error != LOG_INFO_EOF)
      sql_print_error("find_log_pos() failed (error: %d)", error);
    else
9867
    {
9868
      error= read_state_from_file();
9869 9870 9871 9872 9873 9874 9875 9876 9877
      if (error == 2)
      {
        /*
          No binlog files and no binlog state is not an error (eg. just initial
          server start after fresh installation).
        */
        error= 0;
      }
    }
9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902
    return error;
  }

  if (! fdle.is_valid())
    return 1;

  do
  {
    strmake_buf(log_name, log_info.log_file_name);
  } while (!(error= find_next_log(&log_info, 1)));

  if (error !=  LOG_INFO_EOF)
  {
    sql_print_error("find_log_pos() failed (error: %d)", error);
    return error;
  }

  if ((file= open_binlog(&log, log_name, &errmsg)) < 0)
  {
    sql_print_error("%s", errmsg);
    return 1;
  }

  if ((ev= Log_event::read_log_event(&log, 0, &fdle,
                                     opt_master_verify_checksum)) &&
9903
      ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
9904
  {
9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937
    if (ev->flags & LOG_EVENT_BINLOG_IN_USE_F)
    {
      sql_print_information("Recovering after a crash using %s", opt_name);
      error= recover(&log_info, log_name, &log,
                     (Format_description_log_event *)ev, do_xa_recovery);
    }
    else
    {
      error= read_state_from_file();
      if (error == 2)
      {
        /*
          The binlog exists, but the .state file is missing. This is normal if
          this is the first master start after a major upgrade to 10.0 (with
          GTID support).

          However, it could also be that the .state file was lost somehow, and
          in this case it could be a serious issue, as we would set the wrong
          binlog state in the next binlog file to be created, and GTID
          processing would be corrupted. A common way would be copying files
          from an old server to a new one and forgetting the .state file.

          So in this case, we want to try to recover the binlog state by
          scanning the last binlog file (but we do not need any XA recovery).

          ToDo: We could avoid one scan at first start after major upgrade, by
          detecting that there is no GTID_LIST event at the start of the
          binlog file, and stopping the scan in that case.
        */
        error= recover(&log_info, log_name, &log,
                       (Format_description_log_event *)ev, false);
      }
    }
9938 9939 9940 9941 9942 9943 9944 9945 9946 9947
  }

  delete ev;
  end_io_cache(&log);
  mysql_file_close(file, MYF(MY_WME));

  return error;
}


9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966
#ifdef INNODB_COMPATIBILITY_HOOKS
/**
  Report the file name of the MySQL binlog.

  @return the name returned by mysql_bin_log.get_log_fname()
*/
extern "C"
const char* mysql_bin_log_file_name(void)
{
  const char *current_name= mysql_bin_log.get_log_fname();
  return current_name;
}
/**
  Report the current position of the MySQL binlog.

  @return byte offset from the beginning of the binlog, taken from the
          pos_in_file field of the binlog's log file cache
*/
extern "C"
ulonglong mysql_bin_log_file_pos(void)
{
  IO_CACHE *binlog_cache= mysql_bin_log.get_log_file();
  return (ulonglong) binlog_cache->pos_in_file;
}
9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979
/*
  Return the binlog position recorded for the transaction that is currently
  being committed.

  Only valid when called from within a storage engine's commit_ordered() or
  commit() method.

  The position is kept inside the THD, so no locking is needed to read it.

  When binary logging is disabled, or the THD has no binlog cache manager
  attached, *out_file is set to NULL and *out_pos to 0.
*/
void
mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file)
{
  binlog_cache_mngr *mngr= NULL;

  if (opt_bin_log)
    mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (!mngr)
  {
    *out_file= NULL;
    *out_pos= 0;
    return;
  }

  *out_file= mngr->last_commit_pos_file;
  *out_pos= (ulonglong)(mngr->last_commit_pos_offset);
}
9993 9994 9995
#endif /* INNODB_COMPATIBILITY_HOOKS */


9996 9997 9998 9999
/*
  Update callback for the binlog `checksum' system variable.

  `save' points to the new value as an ulong. All work on the binlog object
  is done while holding the lock returned by mysql_bin_log.get_log_lock().

  If the binlog is closed, the new value is stored directly. If it is open,
  the new algorithm is staged in checksum_alg_reset (only when it differs from
  the current setting) and the log is rotated. When rotate() asks for a purge
  check, checkpoint_and_purge() is run after the lock has been released.
*/
static void
binlog_checksum_update(MYSQL_THD thd, struct st_mysql_sys_var *var,
                       void *var_ptr, const void *save)
{
  const ulong new_alg= *((const ulong *) save);
  bool need_purge= false;
  ulong UNINIT_VAR(old_binlog_id);

  mysql_mutex_lock(mysql_bin_log.get_log_lock());
  if (!mysql_bin_log.is_open())
  {
    binlog_checksum_options= new_alg;
  }
  else
  {
    old_binlog_id= mysql_bin_log.current_binlog_id;
    if (binlog_checksum_options != new_alg)
      mysql_bin_log.checksum_alg_reset= (uint8) new_alg;
    /* A failed rotation must not be followed by a purge. */
    if (mysql_bin_log.rotate(true, &need_purge))
      need_purge= false;
  }
  /*
    NOTE(review): presumably rotate() applies the staged algorithm to
    binlog_checksum_options; the assert relies on that - confirm.
  */
  DBUG_ASSERT(binlog_checksum_options == new_alg);
  mysql_bin_log.checksum_alg_reset= BINLOG_CHECKSUM_ALG_UNDEF;
  mysql_mutex_unlock(mysql_bin_log.get_log_lock());

  if (need_purge)
    mysql_bin_log.checkpoint_and_purge(old_binlog_id);
}


10025 10026
/*
  SHOW_FUNC callback behind the "Binlog" status entry: refresh the binlog
  status values for this THD, then point `var' at the detail array.

  Always returns 0. `buff' is not used.
*/
static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
{
  mysql_bin_log.set_status_variables(thd);

  var->value= (char *) &binlog_status_vars_detail;
  var->type= SHOW_ARRAY;

  return 0;
}

/*
  Status variables exported by the binlog plugin: a single "Binlog" entry
  resolved through show_binlog_vars(), followed by the end-of-list marker.
*/
static SHOW_VAR binlog_status_vars_top[]=
{
  { "Binlog", (char *) &show_binlog_vars, SHOW_FUNC },
  { NullS, NullS, SHOW_LONG }                   /* end of list */
};

unknown's avatar
unknown committed
10038 10039 10040 10041
/* Read-only boolean system variable `optimize_thread_scheduling'. */
static MYSQL_SYSVAR_BOOL(
  optimize_thread_scheduling,           /* variable name */
  opt_optimize_thread_scheduling,       /* backing variable */
  PLUGIN_VAR_READONLY,                  /* option flags */
  "Run fast part of group commit in a single thread, to optimize kernel "
  "thread scheduling. On by default. Disable to run each transaction in group "
  "commit in its own thread, which can be slower at very high concurrency. "
  "This option is mostly for testing one algorithm versus the other, and it "
  "should not normally be necessary to change it.",
  NULL,                                 /* no check callback */
  NULL,                                 /* no update callback */
  1);                                   /* default: on */

10051 10052 10053 10054 10055
static MYSQL_SYSVAR_ENUM(
  checksum,
  binlog_checksum_options,
  PLUGIN_VAR_RQCMDARG,
  "Type of BINLOG_CHECKSUM_ALG. Include checksum for "
10056
  "log events in the binary log",
10057 10058 10059 10060 10061
  NULL,
  binlog_checksum_update,
  BINLOG_CHECKSUM_ALG_OFF,
  &binlog_checksum_typelib);

10062 10063
static struct st_mysql_sys_var *binlog_sys_vars[]=
{
unknown's avatar
unknown committed
10064
  MYSQL_SYSVAR(optimize_thread_scheduling),
10065
  MYSQL_SYSVAR(checksum),
10066 10067 10068 10069
  NULL
};


10070 10071 10072 10073 10074 10075 10076 10077 10078
/*
  Copy out the non-directory part of binlog position filename for the
  `binlog_snapshot_file' status variable, same way as it is done for
  SHOW MASTER STATUS.
*/
static void
set_binlog_snapshot_file(const char *src)
{
  int dir_len = dirname_length(src);
10079
  strmake_buf(binlog_snapshot_file, src + dir_len);
10080 10081
}

10082 10083 10084 10085
/*
  Copy out current values of status variables, for SHOW STATUS or
  information_schema.global_status.

10086
  This is called only under LOCK_show_status, so we can fill in a static array.
10087 10088
*/
void
10089
TC_LOG_BINLOG::set_status_variables(THD *thd)
10090
{
Sergei Golubchik's avatar
Sergei Golubchik committed
10091
  binlog_cache_mngr *cache_mngr;
10092

Sergei Golubchik's avatar
Sergei Golubchik committed
10093 10094
  if (thd && opt_bin_log)
    cache_mngr= (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);
10095
  else
Sergei Golubchik's avatar
Sergei Golubchik committed
10096
    cache_mngr= 0;
10097

Sergei Golubchik's avatar
Sergei Golubchik committed
10098 10099
  bool have_snapshot= (cache_mngr && cache_mngr->last_commit_pos_file[0] != 0);
  mysql_mutex_lock(&LOCK_commit_ordered);
10100 10101
  binlog_status_var_num_commits= this->num_commits;
  binlog_status_var_num_group_commits= this->num_group_commits;
10102
  if (!have_snapshot)
10103
  {
10104
    set_binlog_snapshot_file(last_commit_pos_file);
10105
    binlog_snapshot_position= last_commit_pos_offset;
10106
  }
Sergei Golubchik's avatar
Sergei Golubchik committed
10107
  mysql_mutex_unlock(&LOCK_commit_ordered);
10108
  mysql_mutex_lock(&LOCK_prepare_ordered);
10109 10110 10111
  binlog_status_group_commit_trigger_count= this->group_commit_trigger_count;
  binlog_status_group_commit_trigger_timeout= this->group_commit_trigger_timeout;
  binlog_status_group_commit_trigger_lock_wait= this->group_commit_trigger_lock_wait;
10112
  mysql_mutex_unlock(&LOCK_prepare_ordered);
10113

10114
  if (have_snapshot)
10115
  {
Sergei Golubchik's avatar
Sergei Golubchik committed
10116 10117
    set_binlog_snapshot_file(cache_mngr->last_commit_pos_file);
    binlog_snapshot_position= cache_mngr->last_commit_pos_offset;
10118
  }
10119 10120
}

unknown's avatar
unknown committed
10121
struct st_mysql_storage_engine binlog_storage_engine=
10122
{ MYSQL_HANDLERTON_INTERFACE_VERSION };
unknown's avatar
unknown committed
10123

unknown's avatar
unknown committed
10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134
/* Plugin declaration for the binlog pseudo storage engine. */
maria_declare_plugin(binlog)
{
  MYSQL_STORAGE_ENGINE_PLUGIN,          /* plugin type */
  &binlog_storage_engine,               /* storage engine descriptor */
  "binlog",                             /* plugin name */
  "MySQL AB",                           /* author */
  "This is a pseudo storage engine to represent the binlog in a transaction",
  PLUGIN_LICENSE_GPL,                   /* license */
  binlog_init,                          /* plugin init */
  NULL,                                 /* plugin deinit */
  0x0100,                               /* version 1.0 */
  binlog_status_vars_top,               /* status variables */
  binlog_sys_vars,                      /* system variables */
  "1.0",                                /* string version */
  MariaDB_PLUGIN_MATURITY_STABLE        /* maturity */
}
maria_declare_plugin_end;
10141 10142 10143 10144

#ifdef WITH_WSREP
/*
  Return the cache log obtained from the THD's binlog cache manager via
  get_binlog_cache_log(true), or NULL (after a debug message) when the THD
  has no binlog cache manager attached.
*/
IO_CACHE * get_trans_log(THD * thd)
{
  DBUG_ASSERT(binlog_hton->slot != HA_SLOT_UNDEF);

  binlog_cache_mngr *mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (!mngr)
  {
    WSREP_DEBUG("binlog cache not initialized, conn :%ld", thd->thread_id);
    return NULL;
  }

  return mngr->get_binlog_cache_log(true);
}


/*
  Tell whether the THD's transaction cache is empty. A THD without a binlog
  cache manager counts as empty.
*/
bool wsrep_trans_cache_is_empty(THD *thd)
{
  binlog_cache_mngr *const mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (!mngr)
    return true;

  return mngr->trx_cache.empty();
}


/*
  Reset the binlog caches attached to the THD, if any, and clear the THD's
  binlog table maps.

  When a binlog cache manager is attached, reset(false, true) is called on
  it; if the statement cache is still non-empty after that, it is reset as
  well (with a debug message). The table maps are cleared in every case.
*/
void thd_binlog_trx_reset(THD * thd)
{
  /*
    todo: fix autocommit select to not call the caller
  */
  /* One lookup is enough: the slot cannot change between two reads here. */
  binlog_cache_mngr *const cache_mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (cache_mngr)
  {
    cache_mngr->reset(false, true);
    if (!cache_mngr->stmt_cache.empty())
    {
      WSREP_DEBUG("pending events in stmt cache, sql: %s", thd->query());
      cache_mngr->stmt_cache.reset();
    }
  }
  thd->clear_binlog_table_maps();
}


/*
  Set the previous position of the THD's transaction cache to
  MY_OFF_T_UNDEF. Does nothing beyond the debug message when the THD has no
  binlog cache manager.
*/
void thd_binlog_rollback_stmt(THD * thd)
{
  WSREP_DEBUG("thd_binlog_rollback_stmt :%ld", thd->thread_id);

  binlog_cache_mngr *const mngr=
    (binlog_cache_mngr*) thd_get_ha_data(thd, binlog_hton);

  if (!mngr)
    return;

  mngr->trx_cache.set_prev_position(MY_OFF_T_UNDEF);
}
#endif /* WITH_WSREP */