Commit 0ee1082b authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-28495 InnoDB corruption due to lack of file locking

Starting with commit da094188 (MDEV-24393),
MariaDB will no longer acquire advisory file locks on InnoDB data
files by default, because it would create a large number of
entries in Linux /proc/locks.

The motivation for acquiring the file locks is to prevent accidental
concurrent startup of multiple server processes on the same data files.
Such mistake still turns out to be relatively common, based on
corruption bug reports from the community.

To prevent corruption due to concurrent startup attempts, the
Aria storage engine would unconditionally acquire an advisory lock
on one of its log files.

Solution: InnoDB will always lock its system tablespace files.
(Ever since commit 685d958e
the InnoDB log file will not necessarily be open while the
server is running, because it can be accessed via memory-mapped I/O.)

If more protection is desired, then the option --external-locking
can be used.

The mandatory advisory lock also fixes intermittent failures of
some crash recovery tests. It turns out that when the mtr test harness
kills and restarts the server, it will not actually ensure that the
old process has terminated before starting the new one.
parent bd935a41
......@@ -157,6 +157,10 @@ INSERT INTO global_suppressions VALUES
("InnoDB: Error: table `test`.`t[123]` .*does not exist in the InnoDB internal"),
("InnoDB: Warning: semaphore wait:"),
/* MDEV-28976: Tests that kill the server do not ensure that the
old process has terminated before starting a new one */
("InnoDB: Unable to lock"),
/*
BUG#32080 - Excessive warnings on Solaris: setrlimit could not
change the size of core files
......
......@@ -1744,15 +1744,13 @@ database server shutdown. This should be called at a server startup after the
space objects for the log and the system tablespace have been created. The
purpose of this operation is to make sure we never run out of file descriptors
if we need to read from the insert buffer or to write to the log. */
void
fil_open_log_and_system_tablespace_files(void)
/*==========================================*/
dberr_t fil_open_log_and_system_tablespace_files()
{
fil_space_t* space;
dberr_t err = DB_SUCCESS;
mutex_enter(&fil_system.mutex);
for (space = UT_LIST_GET_FIRST(fil_system.space_list);
for (fil_space_t* space = UT_LIST_GET_FIRST(fil_system.space_list);
space != NULL;
space = UT_LIST_GET_NEXT(space_list, space)) {
......@@ -1767,17 +1765,29 @@ fil_open_log_and_system_tablespace_files(void)
node != NULL;
node = UT_LIST_GET_NEXT(chain, node)) {
if (!node->is_open()) {
if (node->is_open()) {
continue;
}
if (!fil_node_open_file(node)) {
/* This func is called during server's
startup. If some file of log or system
tablespace is missing, the server
can't start successfully. So we should
assert for it. */
ut_a(0);
err = DB_ERROR;
#ifndef _WIN32
} else if (!space->id && my_disable_locking
&& !srv_read_only_mode
&& os_file_lock(node->handle, node->name)) {
/* Retry for 60 seconds. */
for (int i = 60; i--;) {
os_thread_sleep(1000000);
if (!os_file_lock(node->handle,
node->name)) {
goto got_lock;
}
}
err = DB_ERROR;
#endif
}
#ifndef _WIN32
got_lock:
#endif
if (srv_max_n_open_files < 10 + fil_system.n_open) {
ib::warn() << "You must raise the value of"
......@@ -1799,6 +1809,7 @@ fil_open_log_and_system_tablespace_files(void)
}
mutex_exit(&fil_system.mutex);
return err;
}
/*******************************************************************//**
......
/*****************************************************************************
Copyright (c) 2013, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2021, MariaDB Corporation.
Copyright (c) 2021, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -445,12 +445,27 @@ SysTablespace::create_file(
case SRV_NOT_RAW:
err = file.open_or_create(
m_ignore_read_only ? false : srv_read_only_mode);
!m_ignore_read_only && srv_read_only_mode);
break;
}
if (err != DB_SUCCESS) {
return err;
}
if (err == DB_SUCCESS && file.m_type != SRV_OLD_RAW) {
switch (file.m_type) {
case SRV_OLD_RAW:
break;
case SRV_NOT_RAW:
#ifndef _WIN32
if (!space_id() && my_disable_locking
&& os_file_lock(file.m_handle, file.m_filepath)) {
err = DB_ERROR;
break;
}
#endif
/* fall through */
case SRV_NEW_RAW:
err = set_size(file);
}
......@@ -491,7 +506,7 @@ SysTablespace::open_file(
case SRV_NOT_RAW:
err = file.open_or_create(
m_ignore_read_only ? false : srv_read_only_mode);
!m_ignore_read_only && srv_read_only_mode);
if (err != DB_SUCCESS) {
return(err);
......@@ -506,6 +521,14 @@ SysTablespace::open_file(
break;
case SRV_NOT_RAW:
#ifndef _WIN32
if (!space_id() && (m_ignore_read_only || !srv_read_only_mode)
&& my_disable_locking
&& os_file_lock(file.m_handle, file.m_filepath)) {
err = DB_ERROR;
break;
}
#endif
/* Check file size for existing file. */
err = check_size(file);
break;
......
/*****************************************************************************
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2013, 2021, MariaDB Corporation.
Copyright (c) 2013, 2022, MariaDB Corporation.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
......@@ -794,9 +794,8 @@ database server shutdown. This should be called at a server startup after the
space objects for the log and the system tablespace have been created. The
purpose of this operation is to make sure we never run out of file descriptors
if we need to read from the insert buffer or to write to the log. */
void
fil_open_log_and_system_tablespace_files(void);
/*==========================================*/
dberr_t fil_open_log_and_system_tablespace_files()
MY_ATTRIBUTE((warn_unused_result));
/*******************************************************************//**
Closes all open files. There must not be any pending i/o's or not flushed
modifications in the files. */
......
......@@ -2,7 +2,7 @@
Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2013, 2021, MariaDB Corporation.
Copyright (c) 2013, 2022, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
......@@ -640,6 +640,14 @@ os_file_set_nocache(
const char* operation_name);
#endif
#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
/** Obtain an exclusive lock on a file.
@param fd file descriptor
@param name file name
@return 0 on success */
int os_file_lock(int fd, const char *name);
#endif
/** NOTE! Use the corresponding macro os_file_create(), not directly
this function!
Opens an existing file or creates a new.
......
......@@ -1090,26 +1090,13 @@ os_aio_validate_skip()
}
#endif /* UNIV_DEBUG */
#undef USE_FILE_LOCK
#ifndef _WIN32
/* On Windows, mandatory locking is used */
# define USE_FILE_LOCK
#endif
#ifdef USE_FILE_LOCK
#ifndef _WIN32 /* On Microsoft Windows, mandatory locking is used */
/** Obtain an exclusive lock on a file.
@param[in] fd file descriptor
@param[in] name file name
@param fd file descriptor
@param name file name
@return 0 on success */
static
int
os_file_lock(
int fd,
const char* name)
int os_file_lock(int fd, const char *name)
{
if (my_disable_locking) {
return 0;
}
struct flock lk;
lk.l_type = F_WRLCK;
......@@ -1135,7 +1122,7 @@ os_file_lock(
return(0);
}
#endif /* USE_FILE_LOCK */
#endif /* !_WIN32 */
/** Calculates local segment number and aio array from global segment number.
@param[out] array aio wait array
......@@ -2741,17 +2728,18 @@ os_file_create_simple_func(
os_file_set_nocache(file, name, mode_str);
}
#ifdef USE_FILE_LOCK
#ifndef _WIN32
if (!read_only
&& *success
&& (access_type == OS_FILE_READ_WRITE)
&& access_type == OS_FILE_READ_WRITE
&& !my_disable_locking
&& os_file_lock(file, name)) {
*success = false;
close(file);
file = -1;
}
#endif /* USE_FILE_LOCK */
#endif /* !_WIN32 */
return(file);
}
......@@ -3050,10 +3038,11 @@ os_file_create_func(
os_file_set_nocache(file, name, mode_str);
}
#ifdef USE_FILE_LOCK
#ifndef _WIN32
if (!read_only
&& *success
&& create_mode != OS_FILE_OPEN_RAW
&& !my_disable_locking
&& os_file_lock(file, name)) {
if (create_mode == OS_FILE_OPEN_RETRY) {
......@@ -3078,7 +3067,7 @@ os_file_create_func(
close(file);
file = -1;
}
#endif /* USE_FILE_LOCK */
#endif /* !_WIN32 */
return(file);
}
......@@ -3151,10 +3140,11 @@ os_file_create_simple_no_error_handling_func(
*success = (file != -1);
#ifdef USE_FILE_LOCK
#ifndef _WIN32
if (!read_only
&& *success
&& access_type == OS_FILE_READ_WRITE
&& !my_disable_locking
&& os_file_lock(file, name)) {
*success = false;
......@@ -3162,7 +3152,7 @@ os_file_create_simple_no_error_handling_func(
file = -1;
}
#endif /* USE_FILE_LOCK */
#endif /* !_WIN32 */
return(file);
}
......
......@@ -481,7 +481,9 @@ create_log_files(
log_sys.log.create(srv_n_log_files);
fil_open_log_and_system_tablespace_files();
if (dberr_t err = fil_open_log_and_system_tablespace_files()) {
return err;
}
/* Create a log checkpoint. */
log_mutex_enter();
......@@ -568,8 +570,9 @@ create_log_files_rename(
DBUG_EXECUTE_IF("innodb_log_abort_10", err = DB_ERROR;);
if (err == DB_SUCCESS) {
fil_open_log_and_system_tablespace_files();
err = fil_open_log_and_system_tablespace_files();
ib::info() << "New log files created, LSN=" << lsn;
ut_a(err == DB_SUCCESS);
}
return(err);
......@@ -1889,10 +1892,12 @@ dberr_t srv_start(bool create_new_db)
tablespace: we keep them open until database
shutdown */
fil_open_log_and_system_tablespace_files();
err = fil_open_log_and_system_tablespace_files();
ut_d(fil_system.sys_space->recv_size = srv_sys_space_size_debug);
if (err == DB_SUCCESS) {
err = srv_undo_tablespaces_init(create_new_db);
}
/* If the force recovery is set very high then we carry on regardless
of all errors. Basically this is fingers crossed mode. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment