MWL#116: Efficient group commit for binary log

Preliminary commit for testing

MWL#116: Efficient group commit for binary log
Preliminary commit for testing
0394cf20 · unknown · e432151e · 0394cf20 · 0394cf20 · 0394cf20
Commit 0394cf20 authored Sep 30, 2010 by unknown
16 changed files
--- a/mysql-test/r/group_commit.result
+++ b/mysql-test/r/group_commit.result
+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
+SELECT variable_value INTO @commits FROM information_schema.global_status
+WHERE variable_name = 'binlog_commits';
+SELECT variable_value INTO @group_commits FROM information_schema.global_status
+WHERE variable_name = 'binlog_group_commits';
+SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
+INSERT INTO t1 VALUES ("con1");
+set DEBUG_SYNC= "now WAIT_FOR group1_running";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
+SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
+INSERT INTO t1 VALUES ("con2");
+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
+INSERT INTO t1 VALUES ("con3");
+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
+INSERT INTO t1 VALUES ("con4");
+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+SELECT * FROM t1 ORDER BY a;
+a
+SET DEBUG_SYNC= "now SIGNAL group2_queued";
+SELECT * FROM t1 ORDER BY a;
+a
+con1
+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
+SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
+INSERT INTO t1 VALUES ("con5");
+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
+INSERT INTO t1 VALUES ("con6");
+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
+SELECT * FROM t1 ORDER BY a;
+a
+con1
+SET DEBUG_SYNC= "now SIGNAL group3_committed";
+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
+SELECT * FROM t1 ORDER BY a;
+a
+con1
+con2
+con3
+con4
+SET DEBUG_SYNC= "now SIGNAL group2_checked";
+SELECT * FROM t1 ORDER BY a;
+a
+con1
+con2
+con3
+con4
+con5
+con6
+SELECT variable_value - @commits FROM information_schema.global_status
+WHERE variable_name = 'binlog_commits';
+variable_value - @commits
+6
+SELECT variable_value - @group_commits FROM information_schema.global_status
+WHERE variable_name = 'binlog_group_commits';
+variable_value - @group_commits
+3
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
--- a/mysql-test/suite/binlog/r/binlog_ioerr.result
+++ b/mysql-test/suite/binlog/r/binlog_ioerr.result
+CALL mtr.add_suppression("Error writing file 'master-bin'");
+RESET MASTER;
+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
+INSERT INTO t1 VALUES(0);
+SET SESSION debug='+d,fail_binlog_write_1';
+INSERT INTO t1 VALUES(1);
+ERROR HY000: Error writing file 'master-bin' (errno: 22)
+INSERT INTO t1 VALUES(2);
+ERROR HY000: Error writing file 'master-bin' (errno: 22)
+SET SESSION debug='';
+INSERT INTO t1 VALUES(3);
+SELECT * FROM t1;
+a
+0
+3
+SHOW BINLOG EVENTS;
+Log_name	Pos	Event_type	Server_id	End_log_pos	Info
+BINLOG	POS	Format_desc	1	ENDPOS	Server ver: #, Binlog ver: #
+BINLOG	POS	Query	1	ENDPOS	use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
+BINLOG	POS	Query	1	ENDPOS	BEGIN
+BINLOG	POS	Query	1	ENDPOS	use `test`; INSERT INTO t1 VALUES(0)
+BINLOG	POS	Xid	1	ENDPOS	COMMIT /* XID */
+BINLOG	POS	Query	1	ENDPOS	BEGIN
+BINLOG	POS	Query	1	ENDPOS	BEGIN
+BINLOG	POS	Query	1	ENDPOS	BEGIN
+BINLOG	POS	Query	1	ENDPOS	use `test`; INSERT INTO t1 VALUES(3)
+BINLOG	POS	Xid	1	ENDPOS	COMMIT /* XID */
+DROP TABLE t1;
--- a/mysql-test/suite/binlog/t/binlog_ioerr.test
+++ b/mysql-test/suite/binlog/t/binlog_ioerr.test
+source include/have_debug.inc;
+source include/have_innodb.inc;
+source include/have_log_bin.inc;
+source include/have_binlog_format_mixed_or_statement.inc;
+
+CALL mtr.add_suppression("Error writing file 'master-bin'");
+
+RESET MASTER;
+
+CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
+INSERT INTO t1 VALUES(0);
+SET SESSION debug='+d,fail_binlog_write_1';
+--error ER_ERROR_ON_WRITE
+INSERT INTO t1 VALUES(1);
+--error ER_ERROR_ON_WRITE
+INSERT INTO t1 VALUES(2);
+SET SESSION debug='';
+INSERT INTO t1 VALUES(3);
+SELECT * FROM t1;
+
+# Actually the output from this currently shows a bug.
+# The injected IO error leaves partially written transactions in the binlog in
+# the form of stray "BEGIN" events.
+# These should disappear from the output if binlog error handling is improved.
+--replace_regex /\/\* xid=.* \*\//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
+--replace_column 1 BINLOG 2 POS 5 ENDPOS
+SHOW BINLOG EVENTS;
+
+DROP TABLE t1;
--- a/mysql-test/t/group_commit.test
+++ b/mysql-test/t/group_commit.test
+--source include/have_debug_sync.inc
+--source include/have_innodb.inc
+--source include/have_log_bin.inc
+
+# Test some group commit code paths by using debug_sync to do controlled
+# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
+# group.
+#
+# Group 3 is allowed to race as far as possible ahead before group 2 finishes
+# to check some edge case for concurrency control.
+
+CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
+
+SELECT variable_value INTO @commits FROM information_schema.global_status
+ WHERE variable_name = 'binlog_commits';
+SELECT variable_value INTO @group_commits FROM information_schema.global_status
+ WHERE variable_name = 'binlog_group_commits';
+
+connect(con1,localhost,root,,);
+connect(con2,localhost,root,,);
+connect(con3,localhost,root,,);
+connect(con4,localhost,root,,);
+connect(con5,localhost,root,,);
+connect(con6,localhost,root,,);
+
+# Start group1 (with one thread) doing commit, waiting for
+# group2 to queue up before finishing.
+
+connection con1;
+SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
+send INSERT INTO t1 VALUES ("con1");
+
+# Make group2 (with three threads) queue up.
+# Make sure con2 is the group commit leader for group2.
+# Make group2 wait with running commit_ordered() until group3 has committed.
+
+connection con2;
+set DEBUG_SYNC= "now WAIT_FOR group1_running";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
+SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
+SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
+send INSERT INTO t1 VALUES ("con2");
+connection con3;
+SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
+send INSERT INTO t1 VALUES ("con3");
+connection con4;
+SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
+send INSERT INTO t1 VALUES ("con4");
+
+# When group2 is queued, let group1 continue and queue group3.
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
+
+# At this point, trasaction 1 is still not visible as commit_ordered() has not
+# been called yet.
+SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
+SELECT * FROM t1 ORDER BY a;
+
+SET DEBUG_SYNC= "now SIGNAL group2_queued";
+connection con1;
+reap;
+
+# Now transaction 1 is visible.
+connection default;
+SELECT * FROM t1 ORDER BY a;
+
+connection con5;
+SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
+SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
+send INSERT INTO t1 VALUES ("con5");
+
+connection con6;
+SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
+SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
+send INSERT INTO t1 VALUES ("con6");
+
+connection default;
+SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
+# Still only transaction 1 visible, as group2 have not yet run commit_ordered().
+SELECT * FROM t1 ORDER BY a;
+SET DEBUG_SYNC= "now SIGNAL group3_committed";
+SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
+# Now transactions 1-4 visible.
+SELECT * FROM t1 ORDER BY a;
+SET DEBUG_SYNC= "now SIGNAL group2_checked";
+
+connection con2;
+reap;
+
+connection con3;
+reap;
+
+connection con4;
+reap;
+
+connection con5;
+reap;
+
+connection con6;
+reap;
+
+connection default;
+# Check all transactions finally visible.
+SELECT * FROM t1 ORDER BY a;
+
+SELECT variable_value - @commits FROM information_schema.global_status
+ WHERE variable_name = 'binlog_commits';
+SELECT variable_value - @group_commits FROM information_schema.global_status
+ WHERE variable_name = 'binlog_group_commits';
+
+SET DEBUG_SYNC= 'RESET';
+DROP TABLE t1;
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
 uint known_extensions_id= 0;

+static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
+                              bool is_real_trans);


 static plugin_ref ha_default_plugin(THD *thd)
@@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
 */
 int ha_commit_trans(THD *thd, bool all)
 {
-  int error= 0, cookie= 0;
+  int error= 0, cookie;
  /*
    'all' means that this is either an explicit commit issued by
    user, or an implicit commit issued by a DDL.
@@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all)
  */
  bool is_real_trans= all || thd->transaction.all.ha_list == 0;
  Ha_trx_info *ha_info= trans->ha_list;
-  my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
+  bool need_prepare_ordered, need_commit_ordered;
+  my_xid xid;
  DBUG_ENTER("ha_commit_trans");

  /*
@@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all)
    DBUG_RETURN(2);
  }
 #ifdef USING_TRANSACTIONS
-  if (ha_info)
+  if (!ha_info)
  {
-    uint rw_ha_count;
-    bool rw_trans;
+    /* Free resources and perform other cleanup even for 'empty' transactions. */
+    if (is_real_trans)
+      thd->transaction.cleanup();
+    DBUG_RETURN(0);
+  }

-    DBUG_EXECUTE_IF("crash_commit_before", abort(););
+  DBUG_EXECUTE_IF("crash_commit_before", abort(););

-    /* Close all cursors that can not survive COMMIT */
-    if (is_real_trans)                          /* not a statement commit */
-      thd->stmt_map.close_transient_cursors();
+  /* Close all cursors that can not survive COMMIT */
+  if (is_real_trans)                          /* not a statement commit */
+    thd->stmt_map.close_transient_cursors();

-    rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
-    /* rw_trans is TRUE when we in a transaction changing data */
-    rw_trans= is_real_trans && (rw_ha_count > 0);
+  uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
+  /* rw_trans is TRUE when we in a transaction changing data */
+  bool rw_trans= is_real_trans && (rw_ha_count > 0);

-    if (rw_trans &&
-        wait_if_global_read_lock(thd, 0, 0))
-    {
-      ha_rollback_trans(thd, all);
-      DBUG_RETURN(1);
-    }
+  if (rw_trans &&
+      wait_if_global_read_lock(thd, 0, 0))
+  {
+    ha_rollback_trans(thd, all);
+    DBUG_RETURN(1);
+  }

-    if (rw_trans &&
-        opt_readonly &&
-        !(thd->security_ctx->master_access & SUPER_ACL) &&
-        !thd->slave_thread)
-    {
-      my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
-      ha_rollback_trans(thd, all);
-      error= 1;
-      goto end;
-    }
+  if (rw_trans &&
+      opt_readonly &&
+      !(thd->security_ctx->master_access & SUPER_ACL) &&
+      !thd->slave_thread)
+  {
+    my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
+    goto err;
+  }

-    if (!trans->no_2pc && (rw_ha_count > 1))
-    {
-      for (; ha_info && !error; ha_info= ha_info->next())
-      {
-        int err;
-        handlerton *ht= ha_info->ht();
-        /*
-          Do not call two-phase commit if this particular
-          transaction is read-only. This allows for simpler
-          implementation in engines that are always read-only.
-        */
-        if (! ha_info->is_trx_read_write())
-          continue;
-        /*
-          Sic: we know that prepare() is not NULL since otherwise
-          trans->no_2pc would have been set.
-        */
-        if ((err= ht->prepare(ht, thd, all)))
-        {
-          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
-          error= 1;
-        }
-        status_var_increment(thd->status_var.ha_prepare_count);
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
-      if (error || (is_real_trans && xid &&
-                    (error= !(cookie= tc_log->log_xid(thd, xid)))))
-      {
-        ha_rollback_trans(thd, all);
-        error= 1;
-        goto end;
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
-    }
-    error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
-    DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
-    if (cookie)
-      tc_log->unlog(cookie, xid);
+  if (trans->no_2pc || (rw_ha_count <= 1))
+  {
+    error= ha_commit_one_phase(thd, all);
    DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
-end:
-    if (rw_trans)
-      start_waiting_global_read_lock(thd);
+    goto end;
  }
-  /* Free resources and perform other cleanup even for 'empty' transactions. */
-  else if (is_real_trans)
-    thd->transaction.cleanup();
+
+  need_prepare_ordered= FALSE;
+  need_commit_ordered= FALSE;
+  xid= thd->transaction.xid_state.xid.get_my_xid();
+
+  for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
+  {
+    int err;
+    handlerton *ht= hi->ht();
+    /*
+      Do not call two-phase commit if this particular
+      transaction is read-only. This allows for simpler
+      implementation in engines that are always read-only.
+    */
+    if (! hi->is_trx_read_write())
+      continue;
+    /*
+      Sic: we know that prepare() is not NULL since otherwise
+      trans->no_2pc would have been set.
+    */
+    if ((err= ht->prepare(ht, thd, all)))
+      my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
+    status_var_increment(thd->status_var.ha_prepare_count);
+
+    if (err)
+      goto err;
+
+    if (ht->prepare_ordered)
+      need_prepare_ordered= TRUE;
+    if (ht->commit_ordered)
+      need_commit_ordered= TRUE;
+  }
+  DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
+
+  if (!is_real_trans)
+  {
+    error= commit_one_phase_2(thd, all, trans, is_real_trans);
+    DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+    goto end;
+  }
+
+  cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
+                                need_commit_ordered);
+  if (!cookie)
+    goto err;
+
+  DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
+
+  error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
+  DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+
+  DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
+  tc_log->unlog(cookie, xid);
+
+  DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+  goto end;
+
+  /* Come here if error and we need to rollback. */
+err:
+  if (!error)
+    error= 1;
+  ha_rollback_trans(thd, all);
+
+end:
+  if (rw_trans)
+    start_waiting_global_read_lock(thd);
 #endif /* USING_TRANSACTIONS */
  DBUG_RETURN(error);
 }
@@ -1207,7 +1237,6 @@ end:
 */
 int ha_commit_one_phase(THD *thd, bool all)
 {
-  int error=0;
  THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
  /*
    "real" is a nick name for a transaction for which a commit will
@@ -1217,8 +1246,41 @@ int ha_commit_one_phase(THD *thd, bool all)
    enclosing 'all' transaction is rolled back.
  */
  bool is_real_trans=all || thd->transaction.all.ha_list == 0;
-  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  Ha_trx_info *ha_info= trans->ha_list;
  DBUG_ENTER("ha_commit_one_phase");
+#ifdef USING_TRANSACTIONS
+  if (ha_info)
+  {
+    if (is_real_trans)
+    {
+      bool locked= false;
+      for (; ha_info; ha_info= ha_info->next())
+      {
+        handlerton *ht= ha_info->ht();
+        if (ht->commit_ordered)
+        {
+          if (ha_info->is_trx_read_write() && !locked)
+          {
+            pthread_mutex_lock(&LOCK_commit_ordered);
+            locked= 1;
+          }
+          ht->commit_ordered(ht, thd, all);
+        }
+      }
+      if (locked)
+        pthread_mutex_unlock(&LOCK_commit_ordered);
+    }
+  }
+#endif /* USING_TRANSACTIONS */
+  DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans));
+}
+
+static int
+commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
+{
+  int error= 0;
+  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  DBUG_ENTER("commit_one_phase_2");
 #ifdef USING_TRANSACTIONS
  if (ha_info)
  {

--- a/sql/handler.h
+++ b/sql/handler.h
@@ -656,9 +656,96 @@ struct handlerton
     NOTE 'all' is also false in auto-commit mode where 'end of statement'
     and 'real commit' mean the same event.
   */
-   int  (*commit)(handlerton *hton, THD *thd, bool all);
+   int (*commit)(handlerton *hton, THD *thd, bool all);
+   /*
+     The commit_ordered() method is called prior to the commit() method, after
+     the transaction manager has decided to commit (not rollback) the
+     transaction. Unlike commit(), commit_ordered() is called only when the
+     full transaction is committed, not for each commit of statement
+     transaction in a multi-statement transaction.
+
+     The calls to commit_ordered() in multiple parallel transactions is
+     guaranteed to happen in the same order in every participating
+     handler. This can be used to ensure the same commit order among multiple
+     handlers (eg. in table handler and binlog). So if transaction T1 calls
+     into commit_ordered() of handler A before T2, then T1 will also call
+     commit_ordered() of handler B before T2.
+
+     Engines that implement this method should during this call make the
+     transaction visible to other transactions, thereby making the order of
+     transaction commits be defined by the order of commit_ordered() calls.
+
+     The intension is that commit_ordered() should do the minimal amount of
+     work that needs to happen in consistent commit order among handlers. To
+     preserve ordering, calls need to be serialised on a global mutex, so
+     doing any time-consuming or blocking operations in commit_ordered() will
+     limit scalability.
+
+     Handlers can rely on commit_ordered() calls for transactions that updated
+     data to be serialised (no two calls can run in parallel, so no extra
+     locking on the handler part is required to ensure this). However, calls
+     for SELECT-only transactions are not serialised, so can occur in parallel
+     with each other and with at most one write-transaction.
+
+     Note that commit_ordered() can be called from a different thread than the
+     one handling the transaction! So it can not do anything that depends on
+     thread local storage, in particular it can not call my_error() and
+     friends (instead it can store the error code and delay the call of
+     my_error() to the commit() method).
+
+     Similarly, since commit_ordered() returns void, any return error code
+     must be saved and returned from the commit() method instead.
+
+     The commit_ordered method is optional, and can be left unset if not
+     needed in a particular handler.
+   */
+   void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
   int  (*rollback)(handlerton *hton, THD *thd, bool all);
   int  (*prepare)(handlerton *hton, THD *thd, bool all);
+   /*
+     The prepare_ordered method is optional. If set, it will be called after
+     successful prepare() in all handlers participating in 2-phase
+     commit. Like commit_ordered(), it is called only when the full
+     transaction is committed, not for each commit of statement transaction.
+
+     The calls to prepare_ordered() among multiple parallel transactions are
+     ordered consistently with calls to commit_ordered(). This means that
+     calls to prepare_ordered() effectively define the commit order, and that
+     each handler will see the same sequence of transactions calling into
+     prepare_ordered() and commit_ordered().
+
+     Thus, prepare_ordered() can be used to define commit order for handlers
+     that need to do this in the prepare step (like binlog). It can also be
+     used to release transaction's locks early in an order consistent with the
+     order transactions will be eventually committed.
+
+     Like commit_ordered(), prepare_ordered() calls are serialised to maintain
+     ordering, so the intension is that they should execute fast, with only
+     the minimal amount of work needed to define commit order. Handlers can
+     rely on this serialisation, and do not need to do any extra locking to
+     avoid two prepare_ordered() calls running in parallel.
+
+     Like commit_ordered(), prepare_ordered() is not guaranteed to be called
+     in the context of the thread handling the rest of the transaction. So it
+     cannot invoke code that relies on thread local storage, in particular it
+     cannot call my_error().
+
+     When prepare_ordered() is called, the transaction coordinator has already
+     decided to commit (not rollback) the transaction. So prepare_ordered()
+     cannot cause a rollback by returning an error, all possible errors must
+     be handled in prepare() (the prepare_ordered() method returns void). In
+     case of some fatal error, a record of the error must be made internally
+     by the engine and returned from commit() later.
+
+     Note that for user-level XA SQL commands, no consistent ordering among
+     prepare_ordered() and commit_ordered() is guaranteed (as that would
+     require blocking all other commits for an indefinite time).
+
+     When 2-phase commit is not used (eg. only one engine (and no binlog) in
+     transaction), prepare() is not called and in such cases prepare_ordered()
+     also is not called.
+   */
+   void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
   int  (*recover)(handlerton *hton, XID *xid_list, uint len);
   int  (*commit_by_xid)(handlerton *hton, XID *xid);
   int  (*rollback_by_xid)(handlerton *hton, XID *xid);

--- a/sql/log.cc
+++ b/sql/log.cc
--- a/sql/log.h
+++ b/sql/log.h
@@ -33,11 +33,173 @@ class TC_LOG

  virtual int open(const char *opt_name)=0;
  virtual void close()=0;
-  virtual int log_xid(THD *thd, my_xid xid)=0;
+  virtual int log_and_order(THD *thd, my_xid xid, bool all,
+                            bool need_prepare_ordered,
+                            bool need_commit_ordered) = 0;
  virtual void unlog(ulong cookie, my_xid xid)=0;
+
+protected:
+  /*
+    These methods are meant to be invoked from log_and_order() implementations
+    to run any prepare_ordered() respectively commit_ordered() methods in
+    participating handlers.
+
+    They must be called using suitable thread syncronisation to ensure that
+    they are each called in the correct commit order among all
+    transactions. However, it is only necessary to call them if the
+    corresponding flag passed to log_and_order is set (it is safe, but not
+    required, to call them when the flag is false).
+
+    The caller must be holding LOCK_prepare_ordered respectively
+    LOCK_commit_ordered when calling these methods.
+  */
+  void run_prepare_ordered(THD *thd, bool all);
+  void run_commit_ordered(THD *thd, bool all);
+};
+
+/*
+  Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
+  and TC_LOG::run_commit_ordered(), or any other code that calls handler
+  prepare_ordered() or commit_ordered() methods.
+*/
+extern pthread_mutex_t LOCK_prepare_ordered;
+extern pthread_mutex_t LOCK_commit_ordered;
+
+extern void TC_init();
+extern void TC_destroy();
+
+/*
+  Base class for two TC implementations TC_LOG_unordered and
+  TC_LOG_group_commit that both use a queue of threads waiting for group
+  commit.
+*/
+class TC_LOG_queued: public TC_LOG
+{
+protected:
+  TC_LOG_queued();
+  ~TC_LOG_queued();
+
+  /* Structure used to link list of THDs waiting for group commit. */
+  struct TC_group_commit_entry
+  {
+    struct TC_group_commit_entry *next;
+    THD *thd;
+    /* This is the `all' parameter for ha_commit_trans() etc. */
+    bool all;
+    /*
+      Flag set true when it is time for this thread to wake up after group
+      commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered.
+    */
+    bool group_commit_ready;
+    /*
+      Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and
+      cookie.
+    */
+    int xid_error;
+  };
+
+  TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue);
+
+  void group_commit_wait_for_wakeup(TC_group_commit_entry *entry);
+  void group_commit_wakeup_other(TC_group_commit_entry *other);
+
+  /*
+    This is a queue of threads waiting for being allowed to commit.
+    Access to the queue must be protected by LOCK_prepare_ordered.
+  */
+  TC_group_commit_entry *group_commit_queue;
+};
+
+class TC_LOG_unordered: public TC_LOG_queued
+{
+public:
+  TC_LOG_unordered();
+  ~TC_LOG_unordered();
+
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
+
+protected:
+  virtual int log_xid(THD *thd, my_xid xid)=0;
+
+private:
+  /*
+    This flag and condition is used to reserve the queue while threads in it
+    each run the commit_ordered() methods one after the other. Only once the
+    last commit_ordered() in the queue is done can we start on a new queue
+    run.
+
+    Since we start this process in the first thread in the queue and finish in
+    the last (and possibly different) thread, we need a condition variable for
+    this (we cannot unlock a mutex in a different thread than the one who
+    locked it).
+
+    The condition is used together with the LOCK_prepare_ordered mutex.
+  */
+  my_bool group_commit_queue_busy;
+  pthread_cond_t COND_queue_busy;
+};
+
+class TC_LOG_group_commit: public TC_LOG_queued
+{
+public:
+  TC_LOG_group_commit();
+  ~TC_LOG_group_commit();
+
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
+
+protected:
+  /* Total number of committed transactions. */
+  ulonglong num_commits;
+  /* Number of group commits done. */
+  ulonglong num_group_commits;
+
+  /*
+    When using this class, this method is used instead of log_xid() to do
+    logging of a group of transactions all at once.
+
+    The transactions will be linked through THD::next_commit_ordered.
+
+    Additionally, when this method is used instead of log_xid(), the order in
+    which handler->prepare_ordered() and handler->commit_ordered() are called
+    is guaranteed to be the same as the order of calls and THD list elements
+    for group_log_xid().
+
+    This can be used to efficiently implement group commit that at the same
+    time preserves the order of commits among handlers and TC (eg. to get same
+    commit order in InnoDB and binary log).
+
+    For TCs that do not need this, it can be preferable to use plain log_xid()
+    with class TC_LOG_unordered instead, as it allows threads to run log_xid()
+    in parallel with each other. In contrast, group_log_xid() runs under a
+    global mutex, so it is guaranteed that only once call into it will be
+    active at once.
+
+    Since this call handles multiple threads/THDs at once, my_error() (and
+    other code that relies on thread local storage) cannot be used in this
+    method. Instead, the implementation must record any error and report it as
+    the return value from xid_log_after(), which will be invoked individually
+    for each thread.
+
+    In the success case, this method must set thd->xid_cookie for each thread
+    to the cookie that is normally returned from log_xid() (which must be
+    non-zero in the non-error case).
+  */
+  virtual void group_log_xid(TC_group_commit_entry *first) = 0;
+  /*
+    Called for each transaction (in corrent thread context) after
+    group_log_xid() has finished, but with no guarantee on ordering among
+    threads.
+    Can be used to do error reporting etc. */
+  virtual int xid_log_after(TC_group_commit_entry *entry) = 0;
+
+private:
+  /* Mutex used to serialise calls to group_log_xid(). */
+  pthread_mutex_t LOCK_group_commit;
 };

-class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
+class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging
 {
 public:
  TC_LOG_DUMMY() {}
@@ -48,7 +210,7 @@ public:
 };

 #ifdef HAVE_MMAP
-class TC_LOG_MMAP: public TC_LOG
+class TC_LOG_MMAP: public TC_LOG_unordered
 {
  public:                // only to keep Sun Forte on sol9x86 happy
  typedef enum {
@@ -227,12 +389,19 @@ private:
  time_t last_time;
 };

-class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
+class binlog_trx_data;
+class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG
 {
 private:
  /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
  pthread_mutex_t LOCK_index;
  pthread_mutex_t LOCK_prep_xids;
+  /*
+    Mutex to protect the queue of transactions waiting to participate in group
+    commit. (Only used on platforms without native atomic operations).
+  */
+  pthread_mutex_t LOCK_queue;
+
  pthread_cond_t  COND_prep_xids;
  pthread_cond_t update_cond;
  ulonglong bytes_written;
@@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
    In 5.0 it's 0 for relay logs too!
  */
  bool no_auto_events;
-
-  ulonglong m_table_map_version;
+  /* Queue of transactions queued up to participate in group commit. */
+  binlog_trx_data *group_commit_queue;

  int write_to_file(IO_CACHE *cache);
  /*
@@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
  */
  void new_file_without_locking();
  void new_file_impl(bool need_lock);
+  int write_transaction(binlog_trx_data *trx_data);
+  bool write_transaction_to_binlog_events(binlog_trx_data *trx_data);
+  void trx_group_commit_participant(binlog_trx_data *trx_data);
+  void trx_group_commit_leader(TC_group_commit_entry *first);
+  binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data);
+  binlog_trx_data *atomic_grab_trx_queue();
+  void mark_xid_done();
+  void mark_xids_active(uint xid_count);

 public:
  MYSQL_LOG::generate_name;
@@ -310,18 +487,11 @@ public:

  int open(const char *opt_name);
  void close();
-  int log_xid(THD *thd, my_xid xid);
+  void group_log_xid(TC_group_commit_entry *first);
+  int xid_log_after(TC_group_commit_entry *entry);
  void unlog(ulong cookie, my_xid xid);
  int recover(IO_CACHE *log, Format_description_log_event *fdle);
 #if !defined(MYSQL_CLIENT)
-  bool is_table_mapped(TABLE *table) const
-  {
-    return table->s->table_map_version == table_map_version();
-  }
-
-  ulonglong table_map_version() const { return m_table_map_version; }
-  void update_table_map_version() { ++m_table_map_version; }
-
  int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event);
  int remove_pending_rows_event(THD *thd);

@@ -362,10 +532,12 @@ public:
  void new_file();

  bool write(Log_event* event_info); // binary log write
-  bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
-  bool write_incident(THD *thd, bool lock);
+  bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
+                                   Log_event *end_ev);
+  bool trx_group_commit_finish(binlog_trx_data *trx_data);
+  bool write_incident(THD *thd);

-  int  write_cache(IO_CACHE *cache, bool lock_log, bool flush_and_sync);
+  int  write_cache(IO_CACHE *cache);
  void set_write_error(THD *thd);
  bool check_write_error(THD *thd);

@@ -420,6 +592,7 @@ public:
  inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
  inline IO_CACHE *get_index_file() { return &index_file;}
  inline uint32 get_open_count() { return open_count; }
+  void set_status_variables();
 };

 class Log_event_handler

--- a/sql/log_event.h
+++ b/sql/log_event.h
@@ -463,10 +463,9 @@ struct sql_ex_info
 #define LOG_EVENT_SUPPRESS_USE_F    0x8

 /*
-  The table map version internal to the log should be increased after
-  the event has been written to the binary log.
+  This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused.
 */
-#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10
+#define LOG_EVENT_UNUSED1_F 0x10

 /**
   @def LOG_EVENT_ARTIFICIAL_F

--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -1333,6 +1333,7 @@ void clean_up(bool print_message)
  ha_end();
  if (tc_log)
    tc_log->close();
+  TC_destroy();
  xid_cache_free();
  wt_end();
  delete_elements(&key_caches, (void (*)(const char*, uchar*)) free_key_cache);
@@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name);
  if (!errmesg[0][0])
    unireg_abort(1);

+  TC_init();
+
  /* We have to initialize the storage engines before CSV logging */
  if (ha_init())
  {

--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -673,6 +673,8 @@ THD::THD()
  active_vio = 0;
 #endif
  pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST);
+  pthread_cond_init(&COND_commit_ordered, 0);

  /* Variables with default values */
  proc_info="login";
@@ -999,6 +1001,8 @@ THD::~THD()
  free_root(&transaction.mem_root,MYF(0));
 #endif
  mysys_var=0;					// Safety (shouldn't be needed)
+  pthread_cond_destroy(&COND_commit_ordered);
+  pthread_mutex_destroy(&LOCK_commit_ordered);
  pthread_mutex_destroy(&LOCK_thd_data);
 #ifndef DBUG_OFF
  dbug_sentry= THD_SENTRY_GONE;
@@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end)
    if (stmt_end)
    {
      pending->set_flags(Rows_log_event::STMT_END_F);
-      pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
      binlog_table_maps= 0;
    }

@@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
    {
      Query_log_event qinfo(this, query_arg, query_len, is_trans, suppress_use,
                            errcode);
-      qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
      /*
        Binlog table maps will be irrelevant after a Query_log_event
        (they are just removed on the slave side) so after the query

--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1438,6 +1438,10 @@ public:
  /* container for handler's private per-connection data */
  Ha_data ha_data[MAX_HA];

+  /* Mutex and condition for waking up threads after group commit. */
+  pthread_mutex_t LOCK_commit_ordered;
+  pthread_cond_t COND_commit_ordered;
+
 #ifndef MYSQL_CLIENT
  int binlog_setup_trx_data();


--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
 	  else
 	  {
 	    Delete_file_log_event d(thd, db, transactional_table);
-            d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
 	    (void) mysql_bin_log.write(&d);
 	  }
 	}
@@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
      (duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE :
      (ignore ? LOAD_DUP_IGNORE : LOAD_DUP_ERROR),
      transactional_table, FALSE, errcode);
-  e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
  return mysql_bin_log.write(&e);
 }


--- a/sql/table.cc
+++ b/sql/table.cc
@@ -296,13 +296,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,

    share->version=       refresh_version;

-    /*
-      This constant is used to mark that no table map version has been
-      assigned.  No arithmetic is done on the value: it will be
-      overwritten with a value taken from MYSQL_BIN_LOG.
-    */
-    share->table_map_version= ~(ulonglong)0;
-
    /*
      Since alloc_table_share() can be called without any locking (for
      example, ha_create_table... functions), we do not assign a table
@@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
  share->frm_version= 		 FRM_VER_TRUE_VARCHAR;

  /*
-    Temporary tables are not replicated, but we set up these fields
+    Temporary tables are not replicated, but we set up this fields
    anyway to be able to catch errors.
   */
-  share->table_map_version= ~(ulonglong)0;
  share->cached_row_logging_check= -1;

  /*

--- a/sql/table.h
+++ b/sql/table.h
@@ -433,7 +433,6 @@ typedef struct st_table_share
  bool waiting_on_cond;                 /* Protection against free */
  bool deleting;                        /* going to delete this table */
  ulong table_map_id;                   /* for row-based replication */
-  ulonglong table_map_version;

  /*
    Cache for row-based replication table share checks that does not

--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc