Commit 39e3ca8b authored by Marko Mäkelä's avatar Marko Mäkelä

MDEV-31826 InnoDB may fail to recover after being killed in fil_delete_tablespace()

InnoDB was violating the write-ahead-logging protocol when a file
was being deleted, like this:

1. fil_delete_tablespace() set the fil_space_t::STOPPING flag
2. The buf_flush_page_cleaner() thread discards some changed pages for
this tablespace advances the log checkpoint a little.
3. The server process is killed before fil_delete_tablespace() wrote
a FILE_DELETE record.
4. Recovery will try to apply log to pages of the tablespace, because
there was no FILE_DELETE record. This will fail, because some pages
that had been modified since the latest checkpoint had not been written
by the page cleaner.

Page writes must not be stopped before a FILE_DELETE record has been
durably written.

fil_space_t::drop(): Replaces fil_space_t::check_pending_operations().
Add the parameter detached_handle, and return a tablespace pointer
if this thread was the first one to stop I/O on the tablespace.

mtr_t::commit_file(): Remove the parameter detached_handle, and
move some handling to fil_space_t::drop().

fil_space_t: STOPPING_READS, STOPPING_WRITES: Separate flags for STOPPING.
We want to stop reads (and encryption) before stopping page writes.

fil_space_t::is_stopping_writes(), fil_space_t::get_for_write():
Special accessors for the write path.

fil_space_t::flush_low(): Ignore the STOPPING_READS flag and only
stop if STOPPING_WRITES is set, to avoid an infinite loop in
fil_flush_file_spaces(), which was occasionally repeated by
running the test encryption.create_or_replace.

Reviewed by: Vladislav Lesin
Tested by: Matthias Leich
parent ec2574fd
......@@ -530,7 +530,7 @@ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s)
static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page)
{
if (fil_space_t *space= fil_space_t::get(b.id().space()))
if (fil_space_t *space= fil_space_t::get_for_write(b.id().space()))
{
buf_dblwr_check_page_lsn(page, *space);
space->release();
......
......@@ -1110,7 +1110,7 @@ static ulint buf_flush_try_neighbors(fil_space_t *space,
for (ulint id_fold= id.fold(); id < high; ++id, ++id_fold)
{
if (UNIV_UNLIKELY(space->is_stopping()))
if (UNIV_UNLIKELY(space->is_stopping_writes()))
{
if (bpage)
bpage->lock.u_unlock(true);
......@@ -1215,12 +1215,31 @@ static ulint buf_free_from_unzip_LRU_list_batch()
return(count);
}
/** Acquire a tablespace reference for writing.
@param id tablespace identifier
@return tablespace
@retval nullptr if the tablespace is missing or inaccessible */
fil_space_t *fil_space_t::get_for_write(ulint id)
{
mysql_mutex_lock(&fil_system.mutex);
fil_space_t *space= fil_space_get_by_id(id);
const uint32_t n= space ? space->acquire_low(STOPPING_WRITES) : 0;
if (n & STOPPING_WRITES)
space= nullptr;
else if ((n & CLOSING) && !space->prepare_acquired())
space= nullptr;
mysql_mutex_unlock(&fil_system.mutex);
return space;
}
/** Start writing out pages for a tablespace.
@param id tablespace identifier
@return tablespace and number of pages written */
static std::pair<fil_space_t*, uint32_t> buf_flush_space(const uint32_t id)
{
if (fil_space_t *space= fil_space_t::get(id))
if (fil_space_t *space= fil_space_t::get_for_write(id))
return {space, space->flush_freed(true)};
return {nullptr, 0};
}
......@@ -1349,7 +1368,7 @@ static void buf_flush_LRU_list_batch(ulint max, bool evict,
goto no_space;
}
}
else if (space->is_stopping())
else if (space->is_stopping_writes())
{
space->release();
space= nullptr;
......@@ -1501,7 +1520,7 @@ static ulint buf_do_flush_list_batch(ulint max_n, lsn_t lsn)
else
ut_ad(!space);
}
else if (space->is_stopping())
else if (space->is_stopping_writes())
{
space->release();
space= nullptr;
......@@ -1633,7 +1652,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
ulint max_n_flush= srv_io_capacity;
ulint n_flush= 0;
bool acquired= space->acquire();
bool acquired= space->acquire_for_write();
{
const uint32_t written{space->flush_freed(acquired)};
mysql_mutex_lock(&buf_pool.mutex);
......@@ -1676,7 +1695,7 @@ bool buf_flush_list_space(fil_space_t *space, ulint *n_flushed)
buf_flush_discard_page(bpage);
else
{
if (space->is_stopping())
if (space->is_stopping_writes())
{
space->release();
acquired= false;
......
......@@ -99,11 +99,9 @@ bool fil_space_t::try_to_close(bool print_info)
if (!node->is_open())
continue;
/* Other thread is trying to do fil_delete_tablespace()
concurrently for the same tablespace. So ignore this
tablespace and try to close the other one */
const auto n= space.set_closing();
if (n & STOPPING)
/* Let fil_space_t::drop() in another thread handle this. */
continue;
if (n & (PENDING | NEEDS_FSYNC))
{
......@@ -508,7 +506,7 @@ void fil_space_t::flush_low()
std::memory_order_relaxed))
{
ut_ad(n & PENDING);
if (n & STOPPING)
if (n & STOPPING_WRITES)
return;
if (n & NEEDS_FSYNC)
break;
......@@ -1582,7 +1580,7 @@ fil_name_write(
mtr->log_file_op(FILE_MODIFY, space_id, name);
}
fil_space_t *fil_space_t::check_pending_operations(ulint id)
fil_space_t *fil_space_t::drop(ulint id, pfs_os_file_t *detached_handle)
{
ut_a(!is_system_tablespace(id));
mysql_mutex_lock(&fil_system.mutex);
......@@ -1596,7 +1594,6 @@ fil_space_t *fil_space_t::check_pending_operations(ulint id)
if (space->pending() & STOPPING)
{
being_deleted:
/* A thread executing DDL and another thread executing purge may
be executing fil_delete_tablespace() concurrently for the same
tablespace. Wait for the other thread to complete the operation. */
......@@ -1615,34 +1612,73 @@ fil_space_t *fil_space_t::check_pending_operations(ulint id)
mysql_mutex_lock(&fil_system.mutex);
}
}
else
{
if (space->crypt_data)
{
space->reacquire();
/* We must be the first one to set either STOPPING flag on the .ibd file,
because the flags are only being set here, within a critical section of
fil_system.mutex. */
unsigned pending;
ut_d(pending=)
space->n_pending.fetch_add(STOPPING_READS + 1, std::memory_order_relaxed);
ut_ad(!(pending & STOPPING));
mysql_mutex_unlock(&fil_system.mutex);
if (space->crypt_data)
fil_space_crypt_close_tablespace(space);
mysql_mutex_lock(&fil_system.mutex);
space->release();
}
if (space->set_stopping_check())
goto being_deleted;
if (space->purpose == FIL_TYPE_TABLESPACE)
{
/* Before deleting the file, persistently write a log record. */
mtr_t mtr;
mtr.start();
mtr.log_file_op(FILE_DELETE, id, space->chain.start->name);
mtr.commit_file(*space, nullptr);
if (FSP_FLAGS_HAS_DATA_DIR(space->flags))
RemoteDatafile::delete_link_file(space->name());
os_file_delete(innodb_data_file_key, space->chain.start->name);
}
else
ut_ad(space->purpose == FIL_TYPE_IMPORT);
mysql_mutex_unlock(&fil_system.mutex);
if (char *cfg_name= fil_make_filepath(space->chain.start->name,
fil_space_t::name_type{}, CFG, false))
{
os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr);
ut_free(cfg_name);
}
mysql_mutex_lock(&fil_system.mutex);
ut_ad(space == fil_space_get_by_id(id));
pending=
space->n_pending.fetch_add(STOPPING_WRITES - 1, std::memory_order_relaxed);
ut_ad((pending & STOPPING) == STOPPING_READS);
ut_ad(pending & PENDING);
pending&= PENDING;
if (--pending)
{
for (ulint count= 0;; count++)
{
const unsigned pending= space->referenced();
ut_ad(space == fil_space_get_by_id(id));
pending= space->n_pending.load(std::memory_order_relaxed) & PENDING;
if (!pending)
return space;
break;
mysql_mutex_unlock(&fil_system.mutex);
/* Issue a warning every 10.24 seconds, starting after 2.56 seconds */
if ((count & 511) == 128)
sql_print_warning("InnoDB: Trying to delete tablespace '%s' "
"but there are %u pending operations",
space->chain.start->name, id);
space->chain.start->name, pending);
std::this_thread::sleep_for(std::chrono::milliseconds(20));
mysql_mutex_lock(&fil_system.mutex);
}
}
pfs_os_file_t handle= fil_system.detach(space, true);
mysql_mutex_unlock(&fil_system.mutex);
if (detached_handle)
*detached_handle = handle;
return space;
}
/** Close a single-table tablespace on failed IMPORT TABLESPACE.
......@@ -1651,34 +1687,22 @@ Free all pages used by the tablespace. */
void fil_close_tablespace(ulint id)
{
ut_ad(!is_system_tablespace(id));
fil_space_t* space = fil_space_t::check_pending_operations(id);
fil_space_t* space = fil_space_t::drop(id, nullptr);
if (!space) {
return;
}
space->x_lock();
ut_ad(space->is_stopping());
/* Invalidate in the buffer pool all pages belonging to the
tablespace. Since we have invoked space->set_stopping(), readahead
tablespace. Since space->is_stopping() holds, readahead
can no longer read more pages of this tablespace to buf_pool.
Thus we can clean the tablespace out of buf_pool
completely and permanently. */
while (buf_flush_list_space(space));
ut_ad(space->is_stopping());
/* If it is a delete then also delete any generated files, otherwise
when we drop the database the remove directory will fail. */
if (char* cfg_name = fil_make_filepath(space->chain.start->name,
fil_space_t::name_type{},
CFG, false)) {
os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL);
ut_free(cfg_name);
}
/* If the free is successful, the wrlock will be released before
the space memory data structure is freed. */
/* If the free is successful, the latch will be released there. */
if (!fil_space_free(id, true)) {
space->x_unlock();
}
......@@ -1692,16 +1716,8 @@ pfs_os_file_t fil_delete_tablespace(ulint id)
{
ut_ad(!is_system_tablespace(id));
pfs_os_file_t handle= OS_FILE_CLOSED;
if (fil_space_t *space= fil_space_t::check_pending_operations(id))
{
/* Before deleting the file(s), persistently write a log record. */
mtr_t mtr;
mtr.start();
mtr.log_file_op(FILE_DELETE, id, space->chain.start->name);
mtr.commit_file(*space, nullptr, &handle);
if (fil_space_t *space= fil_space_t::drop(id, &handle))
fil_space_free_low(space);
}
return handle;
}
......
......@@ -397,16 +397,22 @@ struct fil_space_t final
/** Number of pending operations on the file.
The tablespace cannot be freed while (n_pending & PENDING) != 0. */
std::atomic<uint32_t> n_pending;
/** Flag in n_pending that indicates that the tablespace is about to be
deleted, and no further operations should be performed */
static constexpr uint32_t STOPPING_READS= 1U << 31;
/** Flag in n_pending that indicates that the tablespace is being
deleted, and no further operations should be performed */
static constexpr uint32_t STOPPING= 1U << 31;
static constexpr uint32_t STOPPING_WRITES= 1U << 30;
/** Flags in n_pending that indicate that the tablespace is being
deleted, and no further operations should be performed */
static constexpr uint32_t STOPPING= STOPPING_READS | STOPPING_WRITES;
/** Flag in n_pending that indicates that the tablespace is a candidate
for being closed, and fil_node_t::is_open() can only be trusted after
acquiring fil_system.mutex and resetting the flag */
static constexpr uint32_t CLOSING= 1U << 30;
static constexpr uint32_t CLOSING= 1U << 29;
/** Flag in n_pending that indicates that the tablespace needs fsync().
This must be the least significant flag bit; @see release_flush() */
static constexpr uint32_t NEEDS_FSYNC= 1U << 29;
static constexpr uint32_t NEEDS_FSYNC= 1U << 28;
/** The reference count */
static constexpr uint32_t PENDING= ~(STOPPING | CLOSING | NEEDS_FSYNC);
/** latch protecting all page allocation bitmap pages */
......@@ -517,20 +523,19 @@ struct fil_space_t final
/** Close each file. Only invoked on fil_system.temp_space. */
void close();
/** Note that operations on the tablespace must stop.
@return whether the operations were already stopped */
inline bool set_stopping_check();
/** Note that operations on the tablespace must stop. */
inline void set_stopping();
/** Note that operations on the tablespace can resume after truncation */
inline void clear_stopping();
/** Look up the tablespace and wait for pending operations to cease
/** Drop the tablespace and wait for any pending operations to cease
@param id tablespace identifier
@return tablespace
@retval nullptr if no tablespace was found */
static fil_space_t *check_pending_operations(ulint id);
@param detached_handle pointer to file to be closed later, or nullptr
@return tablespace to invoke fil_space_free() on
@retval nullptr if no tablespace was found, or it was deleted by
another concurrent thread */
static fil_space_t *drop(ulint id, pfs_os_file_t *detached_handle);
private:
MY_ATTRIBUTE((warn_unused_result))
......@@ -554,13 +559,19 @@ struct fil_space_t final
MY_ATTRIBUTE((warn_unused_result))
/** Acquire a tablespace reference for I/O.
@param avoid when these flags are set, nothing will be acquired
@return whether the file is usable */
bool acquire()
bool acquire(uint32_t avoid= STOPPING | CLOSING)
{
const auto flags= acquire_low(STOPPING | CLOSING) & (STOPPING | CLOSING);
const auto flags= acquire_low(avoid) & (avoid);
return UNIV_LIKELY(!flags) || (flags == CLOSING && acquire_and_prepare());
}
/** Acquire a tablespace reference for writing.
@param avoid when these flags are set, nothing will be acquired
@return whether the file is writable */
bool acquire_for_write() { return acquire(STOPPING_WRITES | CLOSING); }
/** Acquire another tablespace reference for I/O. */
inline void reacquire();
......@@ -577,12 +588,12 @@ struct fil_space_t final
void clear_flush()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
__asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
__asm__ __volatile__("lock btrl $28, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(NEEDS_FSYNC == 1U << 29, "compatibility");
static_assert(NEEDS_FSYNC == 1U << 28, "compatibility");
_interlockedbittestandreset(reinterpret_cast<volatile long*>
(&n_pending), 29);
(&n_pending), 28);
#else
n_pending.fetch_and(~NEEDS_FSYNC, std::memory_order_release);
#endif
......@@ -593,12 +604,12 @@ struct fil_space_t final
void clear_closing()
{
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(CLOSING == 1U << 30, "compatibility");
__asm__ __volatile__("lock btrl $30, %0" : "+m" (n_pending));
static_assert(CLOSING == 1U << 29, "compatibility");
__asm__ __volatile__("lock btrl $29, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(CLOSING == 1U << 30, "compatibility");
static_assert(CLOSING == 1U << 29, "compatibility");
_interlockedbittestandreset(reinterpret_cast<volatile long*>
(&n_pending), 30);
(&n_pending), 29);
#else
n_pending.fetch_and(~CLOSING, std::memory_order_relaxed);
#endif
......@@ -609,8 +620,10 @@ struct fil_space_t final
public:
/** @return whether close() of the file handle has been requested */
bool is_closing() const { return pending() & CLOSING; }
/** @return whether the tablespace is going to be dropped */
/** @return whether the tablespace is about to be dropped */
bool is_stopping() const { return pending() & STOPPING; }
/** @return whether the tablespace is going to be dropped */
bool is_stopping_writes() const { return pending() & STOPPING_WRITES; }
/** @return number of pending operations */
bool is_ready_to_close() const
{ return (pending() & (PENDING | CLOSING)) == CLOSING; }
......@@ -619,7 +632,7 @@ struct fil_space_t final
/** @return whether fsync() or similar is needed, and the tablespace is
not being dropped */
bool needs_flush_not_stopping() const
{ return (pending() & (NEEDS_FSYNC | STOPPING)) == NEEDS_FSYNC; }
{ return (pending() & (NEEDS_FSYNC | STOPPING_WRITES)) == NEEDS_FSYNC; }
uint32_t referenced() const { return pending() & PENDING; }
private:
......@@ -657,7 +670,7 @@ struct fil_space_t final
std::memory_order_relaxed))
{
ut_ad(n & PENDING);
if (n & (NEEDS_FSYNC | STOPPING))
if (n & (NEEDS_FSYNC | STOPPING_WRITES))
return false;
}
......@@ -969,6 +982,11 @@ struct fil_space_t final
@return tablespace
@retval nullptr if the tablespace is missing or inaccessible */
static fil_space_t *get(ulint id);
/** Acquire a tablespace reference for writing.
@param id tablespace identifier
@return tablespace
@retval nullptr if the tablespace is missing or inaccessible */
static fil_space_t *get_for_write(ulint id);
/** Add/remove the free page in the freed ranges list.
@param[in] offset page number to be added
......@@ -1592,52 +1610,27 @@ inline void fil_space_t::reacquire()
#endif /* SAFE_MUTEX */
}
/** Note that operations on the tablespace must stop.
@return whether the operations were already stopped */
inline bool fil_space_t::set_stopping_check()
{
mysql_mutex_assert_owner(&fil_system.mutex);
#if (defined __clang_major__ && __clang_major__ < 10) || defined __APPLE_CC__
/* Only clang-10 introduced support for asm goto */
return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(STOPPING == 1U << 31, "compatibility");
__asm__ goto("lock btsl $31, %0\t\njnc %l1" : : "m" (n_pending)
: "cc", "memory" : not_stopped);
return true;
not_stopped:
return false;
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(STOPPING == 1U << 31, "compatibility");
return _interlockedbittestandset(reinterpret_cast<volatile long*>
(&n_pending), 31);
#else
return n_pending.fetch_or(STOPPING, std::memory_order_relaxed) & STOPPING;
#endif
}
/** Note that operations on the tablespace must stop.
@return whether the operations were already stopped */
/** Note that operations on the tablespace must stop. */
inline void fil_space_t::set_stopping()
{
mysql_mutex_assert_owner(&fil_system.mutex);
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
static_assert(STOPPING == 1U << 31, "compatibility");
__asm__ __volatile__("lock btsl $31, %0" : "+m" (n_pending));
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
__asm__ __volatile__("lock btsl $30, %0" : "+m" (n_pending));
#elif defined _MSC_VER && (defined _M_IX86 || defined _M_X64)
static_assert(STOPPING == 1U << 31, "compatibility");
_interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 31);
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
_interlockedbittestandset(reinterpret_cast<volatile long*>(&n_pending), 30);
#else
n_pending.fetch_or(STOPPING, std::memory_order_relaxed);
n_pending.fetch_or(STOPPING_WRITES, std::memory_order_relaxed);
#endif
}
inline void fil_space_t::clear_stopping()
{
mysql_mutex_assert_owner(&fil_system.mutex);
static_assert(STOPPING == 1U << 31, "compatibility");
ut_d(auto n=) n_pending.fetch_sub(STOPPING, std::memory_order_relaxed);
ut_ad(n & STOPPING);
static_assert(STOPPING_WRITES == 1U << 30, "compatibility");
ut_d(auto n=) n_pending.fetch_sub(STOPPING_WRITES, std::memory_order_relaxed);
ut_ad((n & STOPPING) == STOPPING_WRITES);
}
/** Flush pending writes from the file system cache to the file. */
......
......@@ -95,14 +95,8 @@ struct mtr_t {
/** Commit a mini-transaction that is deleting or renaming a file.
@param space tablespace that is being renamed or deleted
@param name new file name (nullptr=the file will be deleted)
@param detached_handle if detached_handle != nullptr and if space is detached
during the function execution the file handle if its
node will be set to OS_FILE_CLOSED, and the previous
value of the file handle will be assigned to the
address, pointed by detached_handle.
@return whether the operation succeeded */
ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name,
pfs_os_file_t *detached_handle= nullptr);
ATTRIBUTE_COLD bool commit_file(fil_space_t &space, const char *name);
/** Commit a mini-transaction that did not modify any pages,
but generated some redo log on a higher level, such as
......
......@@ -327,7 +327,7 @@ void mtr_t::commit_shrink(fil_space_t &space)
mysql_mutex_lock(&fil_system.mutex);
ut_ad(space.is_being_truncated);
ut_ad(space.is_stopping());
ut_ad(space.is_stopping_writes());
space.clear_stopping();
space.is_being_truncated= false;
mysql_mutex_unlock(&fil_system.mutex);
......@@ -340,14 +340,8 @@ void mtr_t::commit_shrink(fil_space_t &space)
/** Commit a mini-transaction that is deleting or renaming a file.
@param space tablespace that is being renamed or deleted
@param name new file name (nullptr=the file will be deleted)
@param detached_handle if detached_handle != nullptr and if space is detached
during the function execution the file handle if its
node will be set to OS_FILE_CLOSED, and the previous
value of the file handle will be assigned to the
address, pointed by detached_handle.
@return whether the operation succeeded */
bool mtr_t::commit_file(fil_space_t &space, const char *name,
pfs_os_file_t *detached_handle)
bool mtr_t::commit_file(fil_space_t &space, const char *name)
{
ut_ad(is_active());
ut_ad(!is_inside_ibuf());
......@@ -377,7 +371,7 @@ bool mtr_t::commit_file(fil_space_t &space, const char *name,
log_write_and_flush();
char *old_name= space.chain.start->name;
bool success;
bool success= true;
if (name)
{
......@@ -391,37 +385,6 @@ bool mtr_t::commit_file(fil_space_t &space, const char *name,
mysql_mutex_unlock(&fil_system.mutex);
ut_free(old_name);
}
else
{
/* Remove any additional files. */
if (char *cfg_name= fil_make_filepath(old_name,
fil_space_t::name_type{}, CFG,
false))
{
os_file_delete_if_exists(innodb_data_file_key, cfg_name, nullptr);
ut_free(cfg_name);
}
if (FSP_FLAGS_HAS_DATA_DIR(space.flags))
RemoteDatafile::delete_link_file(space.name());
/* Remove the directory entry. The file will actually be deleted
when our caller closes the handle. */
os_file_delete(innodb_data_file_key, old_name);
mysql_mutex_lock(&fil_system.mutex);
/* Sanity checks after reacquiring fil_system.mutex */
ut_ad(&space == fil_space_get_by_id(space.id));
ut_ad(!space.referenced());
ut_ad(space.is_stopping());
pfs_os_file_t handle = fil_system.detach(&space, true);
if (detached_handle)
*detached_handle = handle;
mysql_mutex_unlock(&fil_system.mutex);
success= true;
}
mysql_mutex_unlock(&buf_pool.flush_list_mutex);
release_resources();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment