Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
M
MariaDB
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
nexedi
MariaDB
Commits
706d198f
Commit
706d198f
authored
Sep 30, 2010
by
unknown
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
MWL#116: Efficient group commit for binary log
Preliminary commit for testing
parent
72b347bc
Changes
16
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
1806 additions
and
452 deletions
+1806
-452
mysql-test/r/group_commit.result
mysql-test/r/group_commit.result
+63
-0
mysql-test/suite/binlog/r/binlog_ioerr.result
mysql-test/suite/binlog/r/binlog_ioerr.result
+28
-0
mysql-test/suite/binlog/t/binlog_ioerr.test
mysql-test/suite/binlog/t/binlog_ioerr.test
+29
-0
mysql-test/t/group_commit.test
mysql-test/t/group_commit.test
+115
-0
sql/handler.cc
sql/handler.cc
+136
-74
sql/handler.h
sql/handler.h
+88
-1
sql/log.cc
sql/log.cc
+1040
-236
sql/log.h
sql/log.h
+191
-18
sql/log_event.h
sql/log_event.h
+2
-3
sql/mysqld.cc
sql/mysqld.cc
+3
-0
sql/sql_class.cc
sql/sql_class.cc
+4
-2
sql/sql_class.h
sql/sql_class.h
+4
-0
sql/sql_load.cc
sql/sql_load.cc
+0
-2
sql/table.cc
sql/table.cc
+1
-9
sql/table.h
sql/table.h
+0
-1
storage/xtradb/handler/ha_innodb.cc
storage/xtradb/handler/ha_innodb.cc
+102
-106
No files found.
mysql-test/r/group_commit.result
0 → 100644
View file @
706d198f
CREATE TABLE t1 (a VARCHAR(10) PRIMARY KEY) ENGINE=innodb;
SELECT variable_value INTO @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
SELECT variable_value INTO @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
SET DEBUG_SYNC= "commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued";
INSERT INTO t1 VALUES ("con1");
set DEBUG_SYNC= "now WAIT_FOR group1_running";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con2";
SET DEBUG_SYNC= "commit_after_release_LOCK_group_commit WAIT_FOR group3_committed";
SET DEBUG_SYNC= "commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked";
INSERT INTO t1 VALUES ("con2");
SET DEBUG_SYNC= "now WAIT_FOR group2_con2";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con3";
INSERT INTO t1 VALUES ("con3");
SET DEBUG_SYNC= "now WAIT_FOR group2_con3";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL group2_con4";
INSERT INTO t1 VALUES ("con4");
SET DEBUG_SYNC= "now WAIT_FOR group2_con4";
SET SESSION TRANSACTION ISOLATION LEVEL READ COMMITTED;
SELECT * FROM t1 ORDER BY a;
a
SET DEBUG_SYNC= "now SIGNAL group2_queued";
SELECT * FROM t1 ORDER BY a;
a
con1
SET DEBUG_SYNC= "commit_before_get_LOCK_commit_ordered SIGNAL group3_con5";
SET DEBUG_SYNC= "commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued";
INSERT INTO t1 VALUES ("con5");
SET DEBUG_SYNC= "now WAIT_FOR con5_leader";
SET DEBUG_SYNC= "commit_after_prepare_ordered SIGNAL con6_queued";
INSERT INTO t1 VALUES ("con6");
SET DEBUG_SYNC= "now WAIT_FOR group3_con5";
SELECT * FROM t1 ORDER BY a;
a
con1
SET DEBUG_SYNC= "now SIGNAL group3_committed";
SET DEBUG_SYNC= "now WAIT_FOR group2_visible";
SELECT * FROM t1 ORDER BY a;
a
con1
con2
con3
con4
SET DEBUG_SYNC= "now SIGNAL group2_checked";
SELECT * FROM t1 ORDER BY a;
a
con1
con2
con3
con4
con5
con6
SELECT variable_value - @commits FROM information_schema.global_status
WHERE variable_name = 'binlog_commits';
variable_value - @commits
6
SELECT variable_value - @group_commits FROM information_schema.global_status
WHERE variable_name = 'binlog_group_commits';
variable_value - @group_commits
3
SET DEBUG_SYNC= 'RESET';
DROP TABLE t1;
mysql-test/suite/binlog/r/binlog_ioerr.result
0 → 100644
View file @
706d198f
CALL mtr.add_suppression("Error writing file 'master-bin'");
RESET MASTER;
CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb;
INSERT INTO t1 VALUES(0);
SET SESSION debug='+d,fail_binlog_write_1';
INSERT INTO t1 VALUES(1);
ERROR HY000: Error writing file 'master-bin' (errno: 22)
INSERT INTO t1 VALUES(2);
ERROR HY000: Error writing file 'master-bin' (errno: 22)
SET SESSION debug='';
INSERT INTO t1 VALUES(3);
SELECT * FROM t1;
a
0
3
SHOW BINLOG EVENTS;
Log_name Pos Event_type Server_id End_log_pos Info
BINLOG POS Format_desc 1 ENDPOS Server ver: #, Binlog ver: #
BINLOG POS Query 1 ENDPOS use `test`; CREATE TABLE t1 (a INT PRIMARY KEY) ENGINE=innodb
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(0)
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS BEGIN
BINLOG POS Query 1 ENDPOS use `test`; INSERT INTO t1 VALUES(3)
BINLOG POS Xid 1 ENDPOS COMMIT /* XID */
DROP TABLE t1;
mysql-test/suite/binlog/t/binlog_ioerr.test
0 → 100644
View file @
706d198f
source
include
/
have_debug
.
inc
;
source
include
/
have_innodb
.
inc
;
source
include
/
have_log_bin
.
inc
;
source
include
/
have_binlog_format_mixed_or_statement
.
inc
;
CALL
mtr
.
add_suppression
(
"Error writing file 'master-bin'"
);
RESET
MASTER
;
CREATE
TABLE
t1
(
a
INT
PRIMARY
KEY
)
ENGINE
=
innodb
;
INSERT
INTO
t1
VALUES
(
0
);
SET
SESSION
debug
=
'+d,fail_binlog_write_1'
;
--
error
ER_ERROR_ON_WRITE
INSERT
INTO
t1
VALUES
(
1
);
--
error
ER_ERROR_ON_WRITE
INSERT
INTO
t1
VALUES
(
2
);
SET
SESSION
debug
=
''
;
INSERT
INTO
t1
VALUES
(
3
);
SELECT
*
FROM
t1
;
# Actually the output from this currently shows a bug.
# The injected IO error leaves partially written transactions in the binlog in
# the form of stray "BEGIN" events.
# These should disappear from the output if binlog error handling is improved.
--
replace_regex
/
\
/
\
*
xid
=.*
\
*
\
//\/* XID *\// /Server ver: .*, Binlog ver: .*/Server ver: #, Binlog ver: #/ /table_id: [0-9]+/table_id: #/
--
replace_column
1
BINLOG
2
POS
5
ENDPOS
SHOW
BINLOG
EVENTS
;
DROP
TABLE
t1
;
mysql-test/t/group_commit.test
0 → 100644
View file @
706d198f
--
source
include
/
have_debug_sync
.
inc
--
source
include
/
have_innodb
.
inc
--
source
include
/
have_log_bin
.
inc
# Test some group commit code paths by using debug_sync to do controlled
# commits of 6 transactions: first 1 alone, then 3 as a group, then 2 as a
# group.
#
# Group 3 is allowed to race as far as possible ahead before group 2 finishes
# to check some edge case for concurrency control.
CREATE
TABLE
t1
(
a
VARCHAR
(
10
)
PRIMARY
KEY
)
ENGINE
=
innodb
;
SELECT
variable_value
INTO
@
commits
FROM
information_schema
.
global_status
WHERE
variable_name
=
'binlog_commits'
;
SELECT
variable_value
INTO
@
group_commits
FROM
information_schema
.
global_status
WHERE
variable_name
=
'binlog_group_commits'
;
connect
(
con1
,
localhost
,
root
,,);
connect
(
con2
,
localhost
,
root
,,);
connect
(
con3
,
localhost
,
root
,,);
connect
(
con4
,
localhost
,
root
,,);
connect
(
con5
,
localhost
,
root
,,);
connect
(
con6
,
localhost
,
root
,,);
# Start group1 (with one thread) doing commit, waiting for
# group2 to queue up before finishing.
connection
con1
;
SET
DEBUG_SYNC
=
"commit_after_group_log_xid SIGNAL group1_running WAIT_FOR group2_queued"
;
send
INSERT
INTO
t1
VALUES
(
"con1"
);
# Make group2 (with three threads) queue up.
# Make sure con2 is the group commit leader for group2.
# Make group2 wait with running commit_ordered() until group3 has committed.
connection
con2
;
set
DEBUG_SYNC
=
"now WAIT_FOR group1_running"
;
SET
DEBUG_SYNC
=
"commit_after_prepare_ordered SIGNAL group2_con2"
;
SET
DEBUG_SYNC
=
"commit_after_release_LOCK_group_commit WAIT_FOR group3_committed"
;
SET
DEBUG_SYNC
=
"commit_after_group_run_commit_ordered SIGNAL group2_visible WAIT_FOR group2_checked"
;
send
INSERT
INTO
t1
VALUES
(
"con2"
);
connection
con3
;
SET
DEBUG_SYNC
=
"now WAIT_FOR group2_con2"
;
SET
DEBUG_SYNC
=
"commit_after_prepare_ordered SIGNAL group2_con3"
;
send
INSERT
INTO
t1
VALUES
(
"con3"
);
connection
con4
;
SET
DEBUG_SYNC
=
"now WAIT_FOR group2_con3"
;
SET
DEBUG_SYNC
=
"commit_after_prepare_ordered SIGNAL group2_con4"
;
send
INSERT
INTO
t1
VALUES
(
"con4"
);
# When group2 is queued, let group1 continue and queue group3.
connection
default
;
SET
DEBUG_SYNC
=
"now WAIT_FOR group2_con4"
;
# At this point, trasaction 1 is still not visible as commit_ordered() has not
# been called yet.
SET
SESSION
TRANSACTION
ISOLATION
LEVEL
READ
COMMITTED
;
SELECT
*
FROM
t1
ORDER
BY
a
;
SET
DEBUG_SYNC
=
"now SIGNAL group2_queued"
;
connection
con1
;
reap
;
# Now transaction 1 is visible.
connection
default
;
SELECT
*
FROM
t1
ORDER
BY
a
;
connection
con5
;
SET
DEBUG_SYNC
=
"commit_before_get_LOCK_commit_ordered SIGNAL group3_con5"
;
SET
DEBUG_SYNC
=
"commit_after_get_LOCK_group_commit SIGNAL con5_leader WAIT_FOR con6_queued"
;
send
INSERT
INTO
t1
VALUES
(
"con5"
);
connection
con6
;
SET
DEBUG_SYNC
=
"now WAIT_FOR con5_leader"
;
SET
DEBUG_SYNC
=
"commit_after_prepare_ordered SIGNAL con6_queued"
;
send
INSERT
INTO
t1
VALUES
(
"con6"
);
connection
default
;
SET
DEBUG_SYNC
=
"now WAIT_FOR group3_con5"
;
# Still only transaction 1 visible, as group2 have not yet run commit_ordered().
SELECT
*
FROM
t1
ORDER
BY
a
;
SET
DEBUG_SYNC
=
"now SIGNAL group3_committed"
;
SET
DEBUG_SYNC
=
"now WAIT_FOR group2_visible"
;
# Now transactions 1-4 visible.
SELECT
*
FROM
t1
ORDER
BY
a
;
SET
DEBUG_SYNC
=
"now SIGNAL group2_checked"
;
connection
con2
;
reap
;
connection
con3
;
reap
;
connection
con4
;
reap
;
connection
con5
;
reap
;
connection
con6
;
reap
;
connection
default
;
# Check all transactions finally visible.
SELECT
*
FROM
t1
ORDER
BY
a
;
SELECT
variable_value
-
@
commits
FROM
information_schema
.
global_status
WHERE
variable_name
=
'binlog_commits'
;
SELECT
variable_value
-
@
group_commits
FROM
information_schema
.
global_status
WHERE
variable_name
=
'binlog_group_commits'
;
SET
DEBUG_SYNC
=
'RESET'
;
DROP
TABLE
t1
;
sql/handler.cc
View file @
706d198f
...
...
@@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
static
TYPELIB
known_extensions
=
{
0
,
"known_exts"
,
NULL
,
NULL
};
uint
known_extensions_id
=
0
;
static
int
commit_one_phase_2
(
THD
*
thd
,
bool
all
,
THD_TRANS
*
trans
,
bool
is_real_trans
);
static
plugin_ref
ha_default_plugin
(
THD
*
thd
)
...
...
@@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
*/
int
ha_commit_trans
(
THD
*
thd
,
bool
all
)
{
int
error
=
0
,
cookie
=
0
;
int
error
=
0
,
cookie
;
/*
'all' means that this is either an explicit commit issued by
user, or an implicit commit issued by a DDL.
...
...
@@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all)
*/
bool
is_real_trans
=
all
||
thd
->
transaction
.
all
.
ha_list
==
0
;
Ha_trx_info
*
ha_info
=
trans
->
ha_list
;
my_xid
xid
=
thd
->
transaction
.
xid_state
.
xid
.
get_my_xid
();
bool
need_prepare_ordered
,
need_commit_ordered
;
my_xid
xid
;
DBUG_ENTER
(
"ha_commit_trans"
);
/*
...
...
@@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all)
DBUG_RETURN
(
2
);
}
#ifdef USING_TRANSACTIONS
if
(
ha_info
)
if
(
!
ha_info
)
{
uint
rw_ha_count
;
bool
rw_trans
;
/* Free resources and perform other cleanup even for 'empty' transactions. */
if
(
is_real_trans
)
thd
->
transaction
.
cleanup
();
DBUG_RETURN
(
0
);
}
DBUG_EXECUTE_IF
(
"crash_commit_before"
,
abort
(););
DBUG_EXECUTE_IF
(
"crash_commit_before"
,
abort
(););
/* Close all cursors that can not survive COMMIT */
if
(
is_real_trans
)
/* not a statement commit */
thd
->
stmt_map
.
close_transient_cursors
();
/* Close all cursors that can not survive COMMIT */
if
(
is_real_trans
)
/* not a statement commit */
thd
->
stmt_map
.
close_transient_cursors
();
rw_ha_count
=
ha_check_and_coalesce_trx_read_only
(
thd
,
ha_info
,
all
);
/* rw_trans is TRUE when we in a transaction changing data */
rw_trans
=
is_real_trans
&&
(
rw_ha_count
>
0
);
uint
rw_ha_count
=
ha_check_and_coalesce_trx_read_only
(
thd
,
ha_info
,
all
);
/* rw_trans is TRUE when we in a transaction changing data */
bool
rw_trans
=
is_real_trans
&&
(
rw_ha_count
>
0
);
if
(
rw_trans
&&
wait_if_global_read_lock
(
thd
,
0
,
0
))
{
ha_rollback_trans
(
thd
,
all
);
DBUG_RETURN
(
1
);
}
if
(
rw_trans
&&
wait_if_global_read_lock
(
thd
,
0
,
0
))
{
ha_rollback_trans
(
thd
,
all
);
DBUG_RETURN
(
1
);
}
if
(
rw_trans
&&
opt_readonly
&&
!
(
thd
->
security_ctx
->
master_access
&
SUPER_ACL
)
&&
!
thd
->
slave_thread
)
{
my_error
(
ER_OPTION_PREVENTS_STATEMENT
,
MYF
(
0
),
"--read-only"
);
ha_rollback_trans
(
thd
,
all
);
error
=
1
;
goto
end
;
}
if
(
rw_trans
&&
opt_readonly
&&
!
(
thd
->
security_ctx
->
master_access
&
SUPER_ACL
)
&&
!
thd
->
slave_thread
)
{
my_error
(
ER_OPTION_PREVENTS_STATEMENT
,
MYF
(
0
),
"--read-only"
);
goto
err
;
}
if
(
!
trans
->
no_2pc
&&
(
rw_ha_count
>
1
))
{
for
(;
ha_info
&&
!
error
;
ha_info
=
ha_info
->
next
())
{
int
err
;
handlerton
*
ht
=
ha_info
->
ht
();
/*
Do not call two-phase commit if this particular
transaction is read-only. This allows for simpler
implementation in engines that are always read-only.
*/
if
(
!
ha_info
->
is_trx_read_write
())
continue
;
/*
Sic: we know that prepare() is not NULL since otherwise
trans->no_2pc would have been set.
*/
if
((
err
=
ht
->
prepare
(
ht
,
thd
,
all
)))
{
my_error
(
ER_ERROR_DURING_COMMIT
,
MYF
(
0
),
err
);
error
=
1
;
}
status_var_increment
(
thd
->
status_var
.
ha_prepare_count
);
}
DBUG_EXECUTE_IF
(
"crash_commit_after_prepare"
,
DBUG_ABORT
(););
if
(
error
||
(
is_real_trans
&&
xid
&&
(
error
=
!
(
cookie
=
tc_log
->
log_xid
(
thd
,
xid
)))))
{
ha_rollback_trans
(
thd
,
all
);
error
=
1
;
goto
end
;
}
DBUG_EXECUTE_IF
(
"crash_commit_after_log"
,
DBUG_ABORT
(););
}
error
=
ha_commit_one_phase
(
thd
,
all
)
?
(
cookie
?
2
:
1
)
:
0
;
DBUG_EXECUTE_IF
(
"crash_commit_before_unlog"
,
DBUG_ABORT
(););
if
(
cookie
)
tc_log
->
unlog
(
cookie
,
xid
);
if
(
trans
->
no_2pc
||
(
rw_ha_count
<=
1
))
{
error
=
ha_commit_one_phase
(
thd
,
all
);
DBUG_EXECUTE_IF
(
"crash_commit_after"
,
DBUG_ABORT
(););
end:
if
(
rw_trans
)
start_waiting_global_read_lock
(
thd
);
goto
end
;
}
/* Free resources and perform other cleanup even for 'empty' transactions. */
else
if
(
is_real_trans
)
thd
->
transaction
.
cleanup
();
need_prepare_ordered
=
FALSE
;
need_commit_ordered
=
FALSE
;
xid
=
thd
->
transaction
.
xid_state
.
xid
.
get_my_xid
();
for
(
Ha_trx_info
*
hi
=
ha_info
;
hi
;
hi
=
hi
->
next
())
{
int
err
;
handlerton
*
ht
=
hi
->
ht
();
/*
Do not call two-phase commit if this particular
transaction is read-only. This allows for simpler
implementation in engines that are always read-only.
*/
if
(
!
hi
->
is_trx_read_write
())
continue
;
/*
Sic: we know that prepare() is not NULL since otherwise
trans->no_2pc would have been set.
*/
if
((
err
=
ht
->
prepare
(
ht
,
thd
,
all
)))
my_error
(
ER_ERROR_DURING_COMMIT
,
MYF
(
0
),
err
);
status_var_increment
(
thd
->
status_var
.
ha_prepare_count
);
if
(
err
)
goto
err
;
if
(
ht
->
prepare_ordered
)
need_prepare_ordered
=
TRUE
;
if
(
ht
->
commit_ordered
)
need_commit_ordered
=
TRUE
;
}
DBUG_EXECUTE_IF
(
"crash_commit_after_prepare"
,
DBUG_ABORT
(););
if
(
!
is_real_trans
)
{
error
=
commit_one_phase_2
(
thd
,
all
,
trans
,
is_real_trans
);
DBUG_EXECUTE_IF
(
"crash_commit_after"
,
DBUG_ABORT
(););
goto
end
;
}
cookie
=
tc_log
->
log_and_order
(
thd
,
xid
,
all
,
need_prepare_ordered
,
need_commit_ordered
);
if
(
!
cookie
)
goto
err
;
DBUG_EXECUTE_IF
(
"crash_commit_after_log"
,
DBUG_ABORT
(););
error
=
commit_one_phase_2
(
thd
,
all
,
trans
,
is_real_trans
)
?
2
:
0
;
DBUG_EXECUTE_IF
(
"crash_commit_after"
,
DBUG_ABORT
(););
DBUG_EXECUTE_IF
(
"crash_commit_before_unlog"
,
DBUG_ABORT
(););
tc_log
->
unlog
(
cookie
,
xid
);
DBUG_EXECUTE_IF
(
"crash_commit_after"
,
DBUG_ABORT
(););
goto
end
;
/* Come here if error and we need to rollback. */
err:
if
(
!
error
)
error
=
1
;
ha_rollback_trans
(
thd
,
all
);
end:
if
(
rw_trans
)
start_waiting_global_read_lock
(
thd
);
#endif
/* USING_TRANSACTIONS */
DBUG_RETURN
(
error
);
}
...
...
@@ -1207,7 +1237,6 @@ int ha_commit_trans(THD *thd, bool all)
*/
int
ha_commit_one_phase
(
THD
*
thd
,
bool
all
)
{
int
error
=
0
;
THD_TRANS
*
trans
=
all
?
&
thd
->
transaction
.
all
:
&
thd
->
transaction
.
stmt
;
/*
"real" is a nick name for a transaction for which a commit will
...
...
@@ -1217,8 +1246,41 @@ int ha_commit_one_phase(THD *thd, bool all)
enclosing 'all' transaction is rolled back.
*/
bool
is_real_trans
=
all
||
thd
->
transaction
.
all
.
ha_list
==
0
;
Ha_trx_info
*
ha_info
=
trans
->
ha_list
,
*
ha_info_next
;
Ha_trx_info
*
ha_info
=
trans
->
ha_list
;
DBUG_ENTER
(
"ha_commit_one_phase"
);
#ifdef USING_TRANSACTIONS
if
(
ha_info
)
{
if
(
is_real_trans
)
{
bool
locked
=
false
;
for
(;
ha_info
;
ha_info
=
ha_info
->
next
())
{
handlerton
*
ht
=
ha_info
->
ht
();
if
(
ht
->
commit_ordered
)
{
if
(
ha_info
->
is_trx_read_write
()
&&
!
locked
)
{
pthread_mutex_lock
(
&
LOCK_commit_ordered
);
locked
=
1
;
}
ht
->
commit_ordered
(
ht
,
thd
,
all
);
}
}
if
(
locked
)
pthread_mutex_unlock
(
&
LOCK_commit_ordered
);
}
}
#endif
/* USING_TRANSACTIONS */
DBUG_RETURN
(
commit_one_phase_2
(
thd
,
all
,
trans
,
is_real_trans
));
}
static
int
commit_one_phase_2
(
THD
*
thd
,
bool
all
,
THD_TRANS
*
trans
,
bool
is_real_trans
)
{
int
error
=
0
;
Ha_trx_info
*
ha_info
=
trans
->
ha_list
,
*
ha_info_next
;
DBUG_ENTER
(
"commit_one_phase_2"
);
#ifdef USING_TRANSACTIONS
if
(
ha_info
)
{
...
...
sql/handler.h
View file @
706d198f
...
...
@@ -656,9 +656,96 @@ struct handlerton
NOTE 'all' is also false in auto-commit mode where 'end of statement'
and 'real commit' mean the same event.
*/
int
(
*
commit
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
int
(
*
commit
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
/*
The commit_ordered() method is called prior to the commit() method, after
the transaction manager has decided to commit (not rollback) the
transaction. Unlike commit(), commit_ordered() is called only when the
full transaction is committed, not for each commit of statement
transaction in a multi-statement transaction.
The calls to commit_ordered() in multiple parallel transactions is
guaranteed to happen in the same order in every participating
handler. This can be used to ensure the same commit order among multiple
handlers (eg. in table handler and binlog). So if transaction T1 calls
into commit_ordered() of handler A before T2, then T1 will also call
commit_ordered() of handler B before T2.
Engines that implement this method should during this call make the
transaction visible to other transactions, thereby making the order of
transaction commits be defined by the order of commit_ordered() calls.
The intension is that commit_ordered() should do the minimal amount of
work that needs to happen in consistent commit order among handlers. To
preserve ordering, calls need to be serialised on a global mutex, so
doing any time-consuming or blocking operations in commit_ordered() will
limit scalability.
Handlers can rely on commit_ordered() calls for transactions that updated
data to be serialised (no two calls can run in parallel, so no extra
locking on the handler part is required to ensure this). However, calls
for SELECT-only transactions are not serialised, so can occur in parallel
with each other and with at most one write-transaction.
Note that commit_ordered() can be called from a different thread than the
one handling the transaction! So it can not do anything that depends on
thread local storage, in particular it can not call my_error() and
friends (instead it can store the error code and delay the call of
my_error() to the commit() method).
Similarly, since commit_ordered() returns void, any return error code
must be saved and returned from the commit() method instead.
The commit_ordered method is optional, and can be left unset if not
needed in a particular handler.
*/
void
(
*
commit_ordered
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
int
(
*
rollback
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
int
(
*
prepare
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
/*
The prepare_ordered method is optional. If set, it will be called after
successful prepare() in all handlers participating in 2-phase
commit. Like commit_ordered(), it is called only when the full
transaction is committed, not for each commit of statement transaction.
The calls to prepare_ordered() among multiple parallel transactions are
ordered consistently with calls to commit_ordered(). This means that
calls to prepare_ordered() effectively define the commit order, and that
each handler will see the same sequence of transactions calling into
prepare_ordered() and commit_ordered().
Thus, prepare_ordered() can be used to define commit order for handlers
that need to do this in the prepare step (like binlog). It can also be
used to release transaction's locks early in an order consistent with the
order transactions will be eventually committed.
Like commit_ordered(), prepare_ordered() calls are serialised to maintain
ordering, so the intension is that they should execute fast, with only
the minimal amount of work needed to define commit order. Handlers can
rely on this serialisation, and do not need to do any extra locking to
avoid two prepare_ordered() calls running in parallel.
Like commit_ordered(), prepare_ordered() is not guaranteed to be called
in the context of the thread handling the rest of the transaction. So it
cannot invoke code that relies on thread local storage, in particular it
cannot call my_error().
When prepare_ordered() is called, the transaction coordinator has already
decided to commit (not rollback) the transaction. So prepare_ordered()
cannot cause a rollback by returning an error, all possible errors must
be handled in prepare() (the prepare_ordered() method returns void). In
case of some fatal error, a record of the error must be made internally
by the engine and returned from commit() later.
Note that for user-level XA SQL commands, no consistent ordering among
prepare_ordered() and commit_ordered() is guaranteed (as that would
require blocking all other commits for an indefinite time).
When 2-phase commit is not used (eg. only one engine (and no binlog) in
transaction), prepare() is not called and in such cases prepare_ordered()
also is not called.
*/
void
(
*
prepare_ordered
)(
handlerton
*
hton
,
THD
*
thd
,
bool
all
);
int
(
*
recover
)(
handlerton
*
hton
,
XID
*
xid_list
,
uint
len
);
int
(
*
commit_by_xid
)(
handlerton
*
hton
,
XID
*
xid
);
int
(
*
rollback_by_xid
)(
handlerton
*
hton
,
XID
*
xid
);
...
...
sql/log.cc
View file @
706d198f
This diff is collapsed.
Click to expand it.
sql/log.h
View file @
706d198f
...
...
@@ -33,11 +33,173 @@ class TC_LOG
virtual
int
open
(
const
char
*
opt_name
)
=
0
;
virtual
void
close
()
=
0
;
virtual
int
log_xid
(
THD
*
thd
,
my_xid
xid
)
=
0
;
virtual
int
log_and_order
(
THD
*
thd
,
my_xid
xid
,
bool
all
,
bool
need_prepare_ordered
,
bool
need_commit_ordered
)
=
0
;
virtual
void
unlog
(
ulong
cookie
,
my_xid
xid
)
=
0
;
protected:
/*
These methods are meant to be invoked from log_and_order() implementations
to run any prepare_ordered() respectively commit_ordered() methods in
participating handlers.
They must be called using suitable thread syncronisation to ensure that
they are each called in the correct commit order among all
transactions. However, it is only necessary to call them if the
corresponding flag passed to log_and_order is set (it is safe, but not
required, to call them when the flag is false).
The caller must be holding LOCK_prepare_ordered respectively
LOCK_commit_ordered when calling these methods.
*/
void
run_prepare_ordered
(
THD
*
thd
,
bool
all
);
void
run_commit_ordered
(
THD
*
thd
,
bool
all
);
};
/*
Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
and TC_LOG::run_commit_ordered(), or any other code that calls handler
prepare_ordered() or commit_ordered() methods.
*/
extern
pthread_mutex_t
LOCK_prepare_ordered
;
extern
pthread_mutex_t
LOCK_commit_ordered
;
extern
void
TC_init
();
extern
void
TC_destroy
();
/*
Base class for two TC implementations TC_LOG_unordered and
TC_LOG_group_commit that both use a queue of threads waiting for group
commit.
*/
class
TC_LOG_queued
:
public
TC_LOG
{
protected:
TC_LOG_queued
();
~
TC_LOG_queued
();
/* Structure used to link list of THDs waiting for group commit. */
struct
TC_group_commit_entry
{
struct
TC_group_commit_entry
*
next
;
THD
*
thd
;
/* This is the `all' parameter for ha_commit_trans() etc. */
bool
all
;
/*
Flag set true when it is time for this thread to wake up after group
commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered.
*/
bool
group_commit_ready
;
/*
Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and
cookie.
*/
int
xid_error
;
};
TC_group_commit_entry
*
reverse_queue
(
TC_group_commit_entry
*
queue
);
void
group_commit_wait_for_wakeup
(
TC_group_commit_entry
*
entry
);
void
group_commit_wakeup_other
(
TC_group_commit_entry
*
other
);
/*
This is a queue of threads waiting for being allowed to commit.
Access to the queue must be protected by LOCK_prepare_ordered.
*/
TC_group_commit_entry
*
group_commit_queue
;
};
class
TC_LOG_unordered
:
public
TC_LOG_queued
{
public:
TC_LOG_unordered
();
~
TC_LOG_unordered
();
int
log_and_order
(
THD
*
thd
,
my_xid
xid
,
bool
all
,
bool
need_prepare_ordered
,
bool
need_commit_ordered
);
protected:
virtual
int
log_xid
(
THD
*
thd
,
my_xid
xid
)
=
0
;
private:
/*
This flag and condition is used to reserve the queue while threads in it
each run the commit_ordered() methods one after the other. Only once the
last commit_ordered() in the queue is done can we start on a new queue
run.
Since we start this process in the first thread in the queue and finish in
the last (and possibly different) thread, we need a condition variable for
this (we cannot unlock a mutex in a different thread than the one who
locked it).
The condition is used together with the LOCK_prepare_ordered mutex.
*/
my_bool
group_commit_queue_busy
;
pthread_cond_t
COND_queue_busy
;
};
class
TC_LOG_group_commit
:
public
TC_LOG_queued
{
public:
TC_LOG_group_commit
();
~
TC_LOG_group_commit
();
int
log_and_order
(
THD
*
thd
,
my_xid
xid
,
bool
all
,
bool
need_prepare_ordered
,
bool
need_commit_ordered
);
protected:
/* Total number of committed transactions. */
ulonglong
num_commits
;
/* Number of group commits done. */
ulonglong
num_group_commits
;
/*
When using this class, this method is used instead of log_xid() to do
logging of a group of transactions all at once.
The transactions will be linked through THD::next_commit_ordered.
Additionally, when this method is used instead of log_xid(), the order in
which handler->prepare_ordered() and handler->commit_ordered() are called
is guaranteed to be the same as the order of calls and THD list elements
for group_log_xid().
This can be used to efficiently implement group commit that at the same
time preserves the order of commits among handlers and TC (eg. to get same
commit order in InnoDB and binary log).
For TCs that do not need this, it can be preferable to use plain log_xid()
with class TC_LOG_unordered instead, as it allows threads to run log_xid()
in parallel with each other. In contrast, group_log_xid() runs under a
global mutex, so it is guaranteed that only once call into it will be
active at once.
Since this call handles multiple threads/THDs at once, my_error() (and
other code that relies on thread local storage) cannot be used in this
method. Instead, the implementation must record any error and report it as
the return value from xid_log_after(), which will be invoked individually
for each thread.
In the success case, this method must set thd->xid_cookie for each thread
to the cookie that is normally returned from log_xid() (which must be
non-zero in the non-error case).
*/
virtual
void
group_log_xid
(
TC_group_commit_entry
*
first
)
=
0
;
/*
Called for each transaction (in corrent thread context) after
group_log_xid() has finished, but with no guarantee on ordering among
threads.
Can be used to do error reporting etc. */
virtual
int
xid_log_after
(
TC_group_commit_entry
*
entry
)
=
0
;
private:
/* Mutex used to serialise calls to group_log_xid(). */
pthread_mutex_t
LOCK_group_commit
;
};
class
TC_LOG_DUMMY
:
public
TC_LOG
// use it to disable the logging
class
TC_LOG_DUMMY
:
public
TC_LOG
_unordered
// use it to disable the logging
{
public:
TC_LOG_DUMMY
()
{}
...
...
@@ -48,7 +210,7 @@ class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
};
#ifdef HAVE_MMAP
class
TC_LOG_MMAP
:
public
TC_LOG
class
TC_LOG_MMAP
:
public
TC_LOG
_unordered
{
public:
// only to keep Sun Forte on sol9x86 happy
typedef
enum
{
...
...
@@ -227,12 +389,19 @@ class MYSQL_QUERY_LOG: public MYSQL_LOG
time_t
last_time
;
};
class
MYSQL_BIN_LOG
:
public
TC_LOG
,
private
MYSQL_LOG
class
binlog_trx_data
;
class
MYSQL_BIN_LOG
:
public
TC_LOG_group_commit
,
private
MYSQL_LOG
{
private:
/* LOCK_log and LOCK_index are inited by init_pthread_objects() */
pthread_mutex_t
LOCK_index
;
pthread_mutex_t
LOCK_prep_xids
;
/*
Mutex to protect the queue of transactions waiting to participate in group
commit. (Only used on platforms without native atomic operations).
*/
pthread_mutex_t
LOCK_queue
;
pthread_cond_t
COND_prep_xids
;
pthread_cond_t
update_cond
;
ulonglong
bytes_written
;
...
...
@@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
In 5.0 it's 0 for relay logs too!
*/
bool
no_auto_events
;
ulonglong
m_table_map_version
;
/* Queue of transactions queued up to participate in group commit. */
binlog_trx_data
*
group_commit_queue
;
int
write_to_file
(
IO_CACHE
*
cache
);
/*
...
...
@@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
*/
void
new_file_without_locking
();
void
new_file_impl
(
bool
need_lock
);
int
write_transaction
(
binlog_trx_data
*
trx_data
);
bool
write_transaction_to_binlog_events
(
binlog_trx_data
*
trx_data
);
void
trx_group_commit_participant
(
binlog_trx_data
*
trx_data
);
void
trx_group_commit_leader
(
TC_group_commit_entry
*
first
);
binlog_trx_data
*
atomic_enqueue_trx
(
binlog_trx_data
*
trx_data
);
binlog_trx_data
*
atomic_grab_trx_queue
();
void
mark_xid_done
();
void
mark_xids_active
(
uint
xid_count
);
public:
MYSQL_LOG
::
generate_name
;
...
...
@@ -310,18 +487,11 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
int
open
(
const
char
*
opt_name
);
void
close
();
int
log_xid
(
THD
*
thd
,
my_xid
xid
);
void
group_log_xid
(
TC_group_commit_entry
*
first
);
int
xid_log_after
(
TC_group_commit_entry
*
entry
);
void
unlog
(
ulong
cookie
,
my_xid
xid
);
int
recover
(
IO_CACHE
*
log
,
Format_description_log_event
*
fdle
);
#if !defined(MYSQL_CLIENT)
bool
is_table_mapped
(
TABLE
*
table
)
const
{
return
table
->
s
->
table_map_version
==
table_map_version
();
}
ulonglong
table_map_version
()
const
{
return
m_table_map_version
;
}
void
update_table_map_version
()
{
++
m_table_map_version
;
}
int
flush_and_set_pending_rows_event
(
THD
*
thd
,
Rows_log_event
*
event
);
int
remove_pending_rows_event
(
THD
*
thd
);
...
...
@@ -362,10 +532,12 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
void
new_file
();
bool
write
(
Log_event
*
event_info
);
// binary log write
bool
write
(
THD
*
thd
,
IO_CACHE
*
cache
,
Log_event
*
commit_event
,
bool
incident
);
bool
write_incident
(
THD
*
thd
,
bool
lock
);
bool
write_transaction_to_binlog
(
THD
*
thd
,
binlog_trx_data
*
trx_data
,
Log_event
*
end_ev
);
bool
trx_group_commit_finish
(
binlog_trx_data
*
trx_data
);
bool
write_incident
(
THD
*
thd
);
int
write_cache
(
IO_CACHE
*
cache
,
bool
lock_log
,
bool
flush_and_sync
);
int
write_cache
(
IO_CACHE
*
cache
);
void
set_write_error
(
THD
*
thd
);
bool
check_write_error
(
THD
*
thd
);
...
...
@@ -420,6 +592,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
inline
void
unlock_index
()
{
pthread_mutex_unlock
(
&
LOCK_index
);}
inline
IO_CACHE
*
get_index_file
()
{
return
&
index_file
;}
inline
uint32
get_open_count
()
{
return
open_count
;
}
void
set_status_variables
();
};
class
Log_event_handler
...
...
sql/log_event.h
View file @
706d198f
...
...
@@ -463,10 +463,9 @@ struct sql_ex_info
#define LOG_EVENT_SUPPRESS_USE_F 0x8
/*
The table map version internal to the log should be increased after
the event has been written to the binary log.
This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused.
*/
#define LOG_EVENT_U
PDATE_TABLE_MAP_VERSION
_F 0x10
#define LOG_EVENT_U
NUSED1
_F 0x10
/**
@def LOG_EVENT_ARTIFICIAL_F
...
...
sql/mysqld.cc
View file @
706d198f
...
...
@@ -1333,6 +1333,7 @@ void clean_up(bool print_message)
ha_end
();
if
(
tc_log
)
tc_log
->
close
();
TC_destroy
();
xid_cache_free
();
wt_end
();
delete_elements
(
&
key_caches
,
(
void
(
*
)(
const
char
*
,
uchar
*
))
free_key_cache
);
...
...
@@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name);
if
(
!
errmesg
[
0
][
0
])
unireg_abort
(
1
);
TC_init
();
/* We have to initialize the storage engines before CSV logging */
if
(
ha_init
())
{
...
...
sql/sql_class.cc
View file @
706d198f
...
...
@@ -673,6 +673,8 @@ THD::THD()
active_vio
=
0
;
#endif
pthread_mutex_init
(
&
LOCK_thd_data
,
MY_MUTEX_INIT_FAST
);
pthread_mutex_init
(
&
LOCK_commit_ordered
,
MY_MUTEX_INIT_FAST
);
pthread_cond_init
(
&
COND_commit_ordered
,
0
);
/* Variables with default values */
proc_info
=
"login"
;
...
...
@@ -999,6 +1001,8 @@ THD::~THD()
free_root
(
&
transaction
.
mem_root
,
MYF
(
0
));
#endif
mysys_var
=
0
;
// Safety (shouldn't be needed)
pthread_cond_destroy
(
&
COND_commit_ordered
);
pthread_mutex_destroy
(
&
LOCK_commit_ordered
);
pthread_mutex_destroy
(
&
LOCK_thd_data
);
#ifndef DBUG_OFF
dbug_sentry
=
THD_SENTRY_GONE
;
...
...
@@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end)
if
(
stmt_end
)
{
pending
->
set_flags
(
Rows_log_event
::
STMT_END_F
);
pending
->
flags
|=
LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
;
binlog_table_maps
=
0
;
}
...
...
@@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
{
Query_log_event
qinfo
(
this
,
query_arg
,
query_len
,
is_trans
,
suppress_use
,
errcode
);
qinfo
.
flags
|=
LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
;
/*
Binlog table maps will be irrelevant after a Query_log_event
(they are just removed on the slave side) so after the query
...
...
sql/sql_class.h
View file @
706d198f
...
...
@@ -1438,6 +1438,10 @@ class THD :public Statement,
/* container for handler's private per-connection data */
Ha_data
ha_data
[
MAX_HA
];
/* Mutex and condition for waking up threads after group commit. */
pthread_mutex_t
LOCK_commit_ordered
;
pthread_cond_t
COND_commit_ordered
;
#ifndef MYSQL_CLIENT
int
binlog_setup_trx_data
();
...
...
sql/sql_load.cc
View file @
706d198f
...
...
@@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
else
{
Delete_file_log_event
d
(
thd
,
db
,
transactional_table
);
d
.
flags
|=
LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
;
(
void
)
mysql_bin_log
.
write
(
&
d
);
}
}
...
...
@@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
(
duplicates
==
DUP_REPLACE
)
?
LOAD_DUP_REPLACE
:
(
ignore
?
LOAD_DUP_IGNORE
:
LOAD_DUP_ERROR
),
transactional_table
,
FALSE
,
errcode
);
e
.
flags
|=
LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F
;
return
mysql_bin_log
.
write
(
&
e
);
}
...
...
sql/table.cc
View file @
706d198f
...
...
@@ -296,13 +296,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,
share
->
version
=
refresh_version
;
/*
This constant is used to mark that no table map version has been
assigned. No arithmetic is done on the value: it will be
overwritten with a value taken from MYSQL_BIN_LOG.
*/
share
->
table_map_version
=
~
(
ulonglong
)
0
;
/*
Since alloc_table_share() can be called without any locking (for
example, ha_create_table... functions), we do not assign a table
...
...
@@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
share
->
frm_version
=
FRM_VER_TRUE_VARCHAR
;
/*
Temporary tables are not replicated, but we set up th
ese
fields
Temporary tables are not replicated, but we set up th
is
fields
anyway to be able to catch errors.
*/
share
->
table_map_version
=
~
(
ulonglong
)
0
;
share
->
cached_row_logging_check
=
-
1
;
/*
...
...
sql/table.h
View file @
706d198f
...
...
@@ -433,7 +433,6 @@ typedef struct st_table_share
bool
waiting_on_cond
;
/* Protection against free */
bool
deleting
;
/* going to delete this table */
ulong
table_map_id
;
/* for row-based replication */
ulonglong
table_map_version
;
/*
Cache for row-based replication table share checks that does not
...
...
storage/xtradb/handler/ha_innodb.cc
View file @
706d198f
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment