Commit 41d17ea8 authored by Sergey Vojtovich's avatar Sergey Vojtovich

InnoDB redo log IO methods

normal - conventional IO using read()/write()/fdatasync()
mmap - memory mapped IO, expected to be faster but less secure
parent cc276f17
......@@ -1257,6 +1257,18 @@ NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST NULL
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LOG_IO_METHOD
SESSION_VALUE NULL
DEFAULT_VALUE normal
VARIABLE_SCOPE GLOBAL
VARIABLE_TYPE ENUM
VARIABLE_COMMENT InnoDB redo log IO method: normal (default), mmap.
NUMERIC_MIN_VALUE NULL
NUMERIC_MAX_VALUE NULL
NUMERIC_BLOCK_SIZE NULL
ENUM_VALUE_LIST normal,mmap
READ_ONLY YES
COMMAND_LINE_ARGUMENT REQUIRED
VARIABLE_NAME INNODB_LOG_OPTIMIZE_DDL
SESSION_VALUE NULL
DEFAULT_VALUE ON
......
......@@ -155,7 +155,8 @@ MYSQL_ADD_PLUGIN(innobase ${INNOBASE_SOURCES} STORAGE_ENGINE
${CRC32_LIBRARY}
${NUMA_LIBRARY}
${LIBSYSTEMD}
${LINKER_SCRIPT})
${LINKER_SCRIPT}
${LIBPMEM})
IF(NOT TARGET innobase)
RETURN()
......
......@@ -406,6 +406,21 @@ static TYPELIB innodb_change_buffering_typelib = {
NULL
};
static const char *innodb_log_io_method_names[]=
{
"normal",
"mmap",
NullS
};
static TYPELIB innodb_log_io_method_typelib=
{
array_elements(innodb_log_io_method_names) - 1,
"innodb_log_io_method_typelib",
innodb_log_io_method_names,
NULL
};
/** Retrieve the FTS Relevance Ranking result for doc with doc_id
of m_prebuilt->fts_doc_id
@param[in,out] fts_hdl FTS handler
......@@ -19413,6 +19428,12 @@ static MYSQL_SYSVAR_BOOL(log_optimize_ddl, innodb_log_optimize_ddl,
" allows concurrent backup.",
NULL, NULL, TRUE);
static MYSQL_SYSVAR_ENUM(log_io_method, innodb_log_io_method,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"InnoDB redo log IO method: normal (default), mmap.",
NULL, NULL, 0,
&innodb_log_io_method_typelib);
static MYSQL_SYSVAR_ULONG(autoextend_increment,
sys_tablespace_auto_extend_increment,
PLUGIN_VAR_RQCMDARG,
......@@ -20303,6 +20324,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_group_home_dir),
MYSQL_SYSVAR(log_compressed_pages),
MYSQL_SYSVAR(log_optimize_ddl),
MYSQL_SYSVAR(log_io_method),
MYSQL_SYSVAR(max_dirty_pages_pct),
MYSQL_SYSVAR(max_dirty_pages_pct_lwm),
MYSQL_SYSVAR(adaptive_flushing_lwm),
......
......@@ -40,6 +40,7 @@ Created 12/9/1995 Heikki Tuuri
#include "os0file.h"
#include "span.h"
#include <atomic>
#include <memory>
using st_::span;
......@@ -449,6 +450,7 @@ typedef ib_mutex_t LogSysMutex;
typedef ib_mutex_t FlushOrderMutex;
extern my_bool srv_read_only_mode;
extern ulong innodb_log_io_method;
/** Redo log buffer */
struct log_t{
......@@ -516,6 +518,22 @@ struct log_t{
/** Log files. Protected by mutex or write_mutex. */
struct files {
class file_io
{
protected:
bool durable_writes;
public:
virtual ~file_io() {}
virtual dberr_t open(const char *path)= 0;
virtual dberr_t close()= 0;
virtual dberr_t read(os_offset_t offset, span<byte> buf)= 0;
virtual dberr_t write(const char *path, os_offset_t offset,
span<byte> buf)= 0;
virtual dberr_t flush_data_only()= 0;
bool writes_are_durable() const { return durable_writes; }
};
/** number of files */
ulint n_files;
/** format of the redo log: e.g., FORMAT_10_4 */
......@@ -530,14 +548,14 @@ struct log_t{
lsn_t lsn;
/** the byte offset of the above lsn */
lsn_t lsn_offset;
/** file descriptors for all log files */
std::vector<std::unique_ptr<file_io>> files;
public:
/** used only in recovery: recovery scan succeeded up to this
lsn in this log group */
lsn_t scanned_lsn;
/** file descriptors for all log files */
std::vector<pfs_os_file_t> files;
/** file names for all log files */
std::vector<std::string> file_names;
......@@ -553,6 +571,11 @@ struct log_t{
@param[in] total_offset offset in log files treated as a single file
@param[in] buf buffer from which to write */
void write(size_t total_offset, span<byte> buf);
/** checks whether flush_data_only() is needed to make data persistend */
bool writes_are_durable() const
{
return files.front()->writes_are_durable();
}
/** flushes OS page cache (excluding metadata!) for all log files */
void flush_data_only();
/** closes all log files */
......
......@@ -84,6 +84,8 @@ reduce the size of the log.
/** Redo log system */
log_t log_sys;
ulong innodb_log_io_method;
/* Next log block number to do dummy record filling if no log records written
for a while */
static ulint next_lbn_to_pad = 0;
......@@ -587,6 +589,112 @@ void log_t::create()
}
}
class file_os_io final: public log_t::files::file_io
{
pfs_os_file_t fd;
public:
dberr_t open(const char *path) final
{
bool success;
fd= os_file_create(innodb_log_file_key, path,
OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
OS_FILE_NORMAL, OS_LOG_FILE,
srv_read_only_mode, &success);
durable_writes= srv_file_flush_method == SRV_O_DSYNC;
return success ? DB_SUCCESS : DB_ERROR;
}
dberr_t close() final { return os_file_close(fd) ? DB_SUCCESS : DB_ERROR; }
dberr_t read(os_offset_t offset, span<byte> buf) final
{
return os_file_read(IORequestRead, fd, buf.data(), offset, buf.size());
}
dberr_t write(const char *path, os_offset_t offset, span<byte> buf) final
{
return os_file_write(IORequestWrite, path, fd, buf.data(), offset,
buf.size());
}
dberr_t flush_data_only() final
{
return os_file_flush_data(fd) ? DB_SUCCESS : DB_ERROR;
}
};
#ifdef HAVE_PMEM
#include <libpmem.h>
#endif
class file_mmap_io final: public log_t::files::file_io
{
File fd;
void *addr;
size_t length;
public:
dberr_t open(const char *path) final
{
fd= mysql_file_open(innodb_log_file_key, path,
srv_read_only_mode ? O_RDONLY : O_RDWR, MYF(MY_WME));
if (fd >= 0)
{
MY_STAT sb;
if (!mysql_file_fstat(fd, &sb, MYF(0)))
{
int prot= srv_read_only_mode ? PROT_READ : PROT_READ | PROT_WRITE;
length= sb.st_size;
addr= my_mmap(0, length, prot, MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
if (addr != MAP_FAILED)
{
#ifdef HAVE_PMEM
durable_writes= true;
ib::info() << "The redo log file is located on a DAX storage. "
"Writes are durable, sync disabled.";
#else
durable_writes= false;
ib::info() << "The redo log file is located on a DAX storage, "
"but persistent memory features were disabled "
"(WITH_PMEM=OFF). Page cache is bypassed, sync is "
"required to make writes durable.";
#endif
return DB_SUCCESS;
}
addr= my_mmap(0, length, prot, MAP_SHARED, fd, 0);
if (addr != MAP_FAILED)
{
durable_writes= false;
return DB_SUCCESS;
}
}
mysql_file_close(fd, MYF(MY_WME));
}
return DB_ERROR;
}
dberr_t close() final
{
int err= my_munmap(addr, length);
return (!mysql_file_close(fd, MYF(MY_WME)) && !err) ? DB_SUCCESS : DB_ERROR;
}
dberr_t read(os_offset_t offset, span<byte> buf) final
{
memcpy(buf.data(), (char*) addr + offset, buf.size());
return DB_SUCCESS;
}
dberr_t write(const char *, os_offset_t offset, span<byte> buf) final
{
#ifdef HAVE_PMEM
pmem_memcpy_persist((char*) addr + offset, buf.data(), buf.size());
#else
memcpy((char*) addr + offset, buf.data(), buf.size());
#endif
return DB_SUCCESS;
}
dberr_t flush_data_only() final
{
ut_ad(!durable_writes);
return my_msync(fd, addr, length, MS_SYNC) ? DB_ERROR : DB_SUCCESS;
}
};
void log_t::files::set_file_names(std::vector<std::string> names)
{
file_names= std::move(names);
......@@ -598,15 +706,18 @@ void log_t::files::open_files()
files.reserve(file_names.size());
for (const auto &name : file_names)
{
bool success;
files.push_back(os_file_create(innodb_log_file_key, name.c_str(),
OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT,
OS_FILE_NORMAL, OS_LOG_FILE,
srv_read_only_mode, &success));
if (!success)
file_io *io;
switch (innodb_log_io_method)
{
ib::fatal() << "os_file_create(" << name << ") failed";
case 1: io= new file_mmap_io; break;
default: io= new file_os_io;
}
ut_a(io);
if (io->open(name.c_str()))
ib::fatal() << "open(" << name << ") failed";
files.emplace_back(io);
}
}
......@@ -617,12 +728,8 @@ void log_t::files::read(size_t total_offset, span<byte> buf)
const size_t file_idx= total_offset / static_cast<size_t>(file_size);
const size_t offset= total_offset % static_cast<size_t>(file_size);
if (const dberr_t err= os_file_read(IORequestRead, files[file_idx],
buf.data(), offset, buf.size()))
{
ib::fatal() << "os_file_read(" << file_names[file_idx] << ") returned "
<< err;
}
if (const dberr_t err= files[file_idx]->read(offset, buf))
ib::fatal() << "read(" << file_names[file_idx] << ") returned " << err;
}
void log_t::files::write(size_t total_offset, span<byte> buf)
......@@ -632,13 +739,9 @@ void log_t::files::write(size_t total_offset, span<byte> buf)
const size_t file_idx= total_offset / static_cast<size_t>(file_size);
const size_t offset= total_offset % static_cast<size_t>(file_size);
if (const dberr_t err=
os_file_write(IORequestWrite, file_names[file_idx].c_str(),
files[file_idx], buf.data(), offset, buf.size()))
{
ib::fatal() << "os_file_write(" << file_names[file_idx] << ") returned "
<< err;
}
if (const dberr_t err= files[file_idx]->write(file_names[file_idx].c_str(),
offset, buf))
ib::fatal() << "write(" << file_names[file_idx] << ") returned " << err;
}
void log_t::files::flush_data_only()
......@@ -648,10 +751,10 @@ void log_t::files::flush_data_only()
log_sys.pending_flushes.fetch_add(1, std::memory_order_acquire);
for (auto it= files.begin(), end= files.end(); it != end; ++it)
{
if (!os_file_flush_data(*it))
if ((*it)->flush_data_only())
{
const auto idx= std::distance(files.begin(), it);
ib::fatal() << "os_file_flush_data(" << file_names[idx] << ") failed";
ib::fatal() << "flush_data_only(" << file_names[idx] << ") failed";
}
}
log_sys.pending_flushes.fetch_sub(1, std::memory_order_release);
......@@ -662,10 +765,10 @@ void log_t::files::close_files()
{
for (auto it= files.begin(), end= files.end(); it != end; ++it)
{
if (!os_file_close(*it))
if ((*it)->close())
{
const auto idx= std::distance(files.begin(), it);
ib::fatal() << "os_file_close(" << file_names[idx] << ") failed";
ib::fatal() << "close(" << file_names[idx] << ") failed";
}
}
files.clear();
......@@ -934,6 +1037,11 @@ void log_write_up_to(lsn_t lsn, bool flush_to_disk, bool rotate_key)
return;
}
/* FIXME!!! */
if (log_sys.log.writes_are_durable()) {
flush_to_disk= false;
}
loop:
ut_ad(++loop_count < 128);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment