Commit 80aefe16 authored by Marko Mäkelä

Introduce btr_set_instant() and a new root page format

For instant DROP COLUMN and ROW_FORMAT=COMPACT or ROW_FORMAT=REDUNDANT,
we must store the n_core_null_bytes in the root page, so that the
chain of node pointer records can be followed in order to reach the
leftmost leaf page where the metadata record is located.

If the PRIMARY KEY contains any variable-length column and some
nullable columns were instantly dropped, the dict_index_t::n_nullable
in the data dictionary could be smaller than it actually is in the
non-leaf pages. Because of this, the non-leaf pages could use more
bytes for the null flags than the data dictionary expects, and we
could be reading the lengths of the variable-length columns from the
wrong offset, and thus reading the child page number from the wrong place.
This is the result of two design mistakes that involve unnecessary
storage of data: First, it is nonsense to store any data fields for
the leftmost node pointer records, because the comparisons would be
resolved by the MIN_REC_FLAG alone. Second, there cannot be any null
fields in the clustered index node pointer fields, but we nevertheless
reserve space for all the null flags.

btr_set_instant(): Convert a root page into "instant ALTER TABLE"
format. This replaces page_set_instant().

btr_cur_instant_init_metadata(): Do not read any lengths from the
metadata record header before reading the BLOB. At this point, we
would not actually know how many nullable fields the metadata record
contains.

btr_cur_instant_init_low(): Minimize differences from the 10.3 version.

btr_cur_instant_root_init(): Initialize n_core_null_bytes in one
of two possible ways.

FIXME: innobase_instant_try(): When needed, convert the root page
from the ADD COLUMN format to the generic instant ALTER TABLE format.
parent 7eb67d41
......@@ -1895,6 +1895,42 @@ btr_page_empty(
}
}
/** Convert a clustered index root page to the instant ALTER TABLE format.
The page type is changed from FIL_PAGE_INDEX to FIL_PAGE_TYPE_INSTANT and
index.n_core_fields is stored in the PAGE_INSTANT header field, above the
3 low-order bits that are already present there. If index.table->instant
is set, the 8 bytes at the infimum record and the first 7 bytes at the
supremum record are additionally zero-filled, and the 8th byte at the
supremum record is set to index.n_core_null_bytes.
Every page modification is made through mlog_write_ulint() or
mlog_memset() with the supplied mini-transaction.
@param[in,out]	root	clustered index root page
@param[in]	index	clustered index with instant ALTER TABLE
@param[in,out]	mtr	mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr)
{
	byte* const	frame = root->frame;

	ut_ad(index.n_core_fields > 0);
	ut_ad(index.n_core_fields < REC_MAX_N_FIELDS);
	ut_ad(index.is_instant());
	ut_ad(page_is_root(frame));
	ut_ad(!page_is_comp(frame) || !page_get_instant(frame));

	/* The page is expected to still carry the original
	"infimum" and "supremum" strings at this point. */
	rec_t* const	inf = page_get_infimum_rec(frame);
	rec_t* const	sup = page_get_supremum_rec(frame);
	ut_ad(!memcmp(inf, "infimum", 8));
	ut_ad(!memcmp(sup, "supremum", 8));

	/* Change the page type. */
	byte* const	type_field = frame + FIL_PAGE_TYPE;
	ut_ad(mach_read_from_2(type_field) == FIL_PAGE_INDEX);
	mlog_write_ulint(type_field, FIL_PAGE_TYPE_INSTANT, MLOG_2BYTES, mtr);

	/* Keep the existing low-order bits of PAGE_INSTANT (asserted to be
	at most PAGE_NO_DIRECTION) and store n_core_fields above them. */
	uint16_t	page_instant = page_header_get_field(frame,
							     PAGE_INSTANT);
	ut_ad(page_instant <= PAGE_NO_DIRECTION);
	page_instant |= index.n_core_fields << 3;
	mlog_write_ulint(frame + (PAGE_HEADER + PAGE_INSTANT), page_instant,
			 MLOG_2BYTES, mtr);

	if (!index.table->instant) {
		/* The infimum and supremum strings are left intact. */
		return;
	}

	/* Replace the infimum and supremum strings with zero bytes,
	and store n_core_null_bytes in the last byte of "supremum". */
	mlog_memset(root, inf - frame, 8, 0, mtr);
	mlog_memset(root, sup - frame, 7, 0, mtr);
	mlog_write_ulint(sup + 7, index.n_core_null_bytes, MLOG_1BYTE, mtr);
}
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
......@@ -2080,11 +2116,7 @@ btr_root_raise_and_insert(
if (index->is_instant()) {
ut_ad(!root_page_zip);
byte* page_type = root_block->frame + FIL_PAGE_TYPE;
ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX);
mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT,
MLOG_2BYTES, mtr);
page_set_instant(root_block->frame, index->n_core_fields, mtr);
btr_set_instant(root_block, *index, mtr);
}
/* Set the next node and previous node fields, although
......@@ -3569,12 +3601,7 @@ btr_lift_page_up(
if (page_level == 0 && index->is_instant()) {
ut_ad(!father_page_zip);
byte* page_type = father_block->frame + FIL_PAGE_TYPE;
ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX);
mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT,
MLOG_2BYTES, mtr);
page_set_instant(father_block->frame,
index->n_core_fields, mtr);
btr_set_instant(father_block, *index, mtr);
}
page_level++;
......
......@@ -399,18 +399,19 @@ btr_cur_instant_init_metadata(
const rec_t* rec,
dict_index_t* index)
{
ulint* offsets;
ulint len;
mem_heap_t* heap = mem_heap_create(UNIV_PAGE_SIZE_MAX);
ulint trx_id_offset = index->trx_id_offset;
if (!trx_id_offset) {
ulint* offsets = rec_get_offsets(
rec, index, NULL, true, index->n_uniq, &heap);
trx_id_offset = rec_offs_size(offsets);
mem_heap_empty(heap);
/* The PRIMARY KEY contains variable-length columns.
For the metadata record, variable-length columns are
always written with zero length. The DB_TRX_ID will
start right after any fixed-length columns. */
for (uint i = index->n_uniq; i--; ) {
trx_id_offset += index->fields[0].fixed_len;
}
}
mem_heap_t* heap = mem_heap_create(UNIV_PAGE_SIZE_MAX);
ulint len;
const byte* data = btr_copy_externally_stored_field(
&len, rec + trx_id_offset
+ (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
......@@ -426,10 +427,10 @@ btr_cur_instant_init_metadata(
/* FIXME: Do we really need rec_get_offsets() here?
Better read the metadata record header directly. */
offsets = rec_get_offsets(rec, index, NULL, true,
ULINT_UNDEFINED, &heap);
ulint* offsets = rec_get_offsets(rec, index, NULL, true,
ULINT_UNDEFINED, &heap);
for (unsigned i = index->n_uniq + DATA_ROLL_PTR;
for (unsigned i = index->first_user_field();
i < index->n_fields; i++) {
const byte* data = rec_get_nth_field(
rec, offsets, i + 1, &len);
......@@ -477,79 +478,6 @@ btr_cur_instant_init_metadata(
return DB_SUCCESS;
}
/** Initialize the clustered index from the ADD COLUMN metadata
and store the default value for the instantly added columns.
For every field from index->n_core_fields up to index->n_fields, the value
found in the metadata record is stored in the def_val of the column of that
field. On failure, an error is logged and index->table->corrupted is set.
@param[in] rec metadata record
@param[in,out] index clustered index definition
@return error code
@retval DB_SUCCESS if no error occurred
@retval DB_CORRUPTION if any corruption was noticed */
static
dberr_t
btr_cur_instant_init_add_column(
const rec_t* rec,
dict_index_t* index)
{
/* Temporary heap handed to rec_get_offsets(); it is freed on both
the success path and the error path below. */
mem_heap_t* heap = NULL;
ulint* offsets = rec_get_offsets(rec, index, NULL, true,
ULINT_UNDEFINED, &heap);
/* NOTE(review): rec_offs_any_default() presumably reports fields that
are not physically present in the record; the metadata record is
rejected in that case -- confirm against rec_get_offsets(). */
if (rec_offs_any_default(offsets)) {
/* Common error exit, also reached by the goto statements further
down in this function. */
inconsistent:
mem_heap_free(heap);
ib::error() << "Table " << index->table->name
<< " contains unrecognizable "
"instant ALTER metadata";
index->table->corrupted = true;
return DB_CORRUPTION;
}
/* In fact, because we only ever append fields to the metadata
record, it is also OK to perform READ UNCOMMITTED and
then ignore any extra fields, provided that
trx_sys.is_registered(DB_TRX_ID). */
/* A record with more fields than the index definition is rejected
when the DB_TRX_ID of the record is not registered. */
if (rec_offs_n_fields(offsets) > index->n_fields
&& !trx_sys.is_registered(current_trx(),
row_get_rec_trx_id(rec, index,
offsets))) {
goto inconsistent;
}
/* Store the default value of each field that follows the core fields. */
for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
ulint len;
const byte* data = rec_get_nth_field(rec, offsets, i, &len);
dict_col_t* col = index->fields[i].col;
ut_ad(!col->is_added());
ut_ad(!col->def_val.data);
col->def_val.len = len;
switch (len) {
case UNIV_SQL_NULL:
/* NULL default: only the length is recorded. */
continue;
case 0:
/* Empty default: point to the shared zero-filled buffer. */
col->def_val.data = field_ref_zero;
continue;
}
ut_ad(len != UNIV_SQL_DEFAULT);
if (!rec_offs_nth_extern(offsets, i)) {
/* Stored in the record: copy the value to the table heap. */
col->def_val.data = mem_heap_dup(
index->table->heap, data, len);
} else if (len < BTR_EXTERN_FIELD_REF_SIZE
|| !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
field_ref_zero,
BTR_EXTERN_FIELD_REF_SIZE)) {
/* Flagged as externally stored, but the field is shorter than
an external field reference or the reference is all zero. */
col->def_val.len = UNIV_SQL_DEFAULT;
goto inconsistent;
} else {
/* Externally stored: fetch the value into the table heap;
def_val.len is passed as the output length argument. */
col->def_val.data = btr_copy_externally_stored_field(
&col->def_val.len, data,
dict_table_page_size(index->table),
len, index->table->heap);
}
}
mem_heap_free(heap);
return DB_SUCCESS;
}
/** Load the instant ALTER TABLE metadata from the clustered index
when loading a table definition.
@param[in,out] index clustered index definition
......@@ -575,10 +503,10 @@ btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
return DB_CORRUPTION;
}
index->n_core_null_bytes = UT_BITS_IN_BYTES(unsigned(index->n_nullable));
ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
if (!index->is_instant()) {
if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
ut_ad(!index->is_instant());
return DB_SUCCESS;
}
......@@ -596,26 +524,24 @@ btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
page_cur_move_to_next(&cur.page_cur);
const rec_t* rec = cur.page_cur.rec;
const ulint comp = dict_table_is_comp(index->table);
const ulint info_bits = rec_get_info_bits(rec, comp);
if (page_rec_is_supremum(rec) || !rec_is_metadata(rec, index)) {
if (page_rec_is_supremum(rec)
|| !(info_bits & REC_INFO_MIN_REC_FLAG)) {
ib::error() << "Table " << index->table->name
<< " is missing instant ALTER metadata";
index->table->corrupted = true;
return DB_CORRUPTION;
}
if (dict_table_is_comp(index->table)) {
if (rec_get_info_bits(rec, true) != REC_INFO_MIN_REC_FLAG
&& rec_get_status(rec) != REC_STATUS_INSTANT) {
if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
|| (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
incompatible:
ib::error() << "Table " << index->table->name
<< " contains unrecognizable "
"instant ALTER metadata";
index->table->corrupted = true;
return DB_CORRUPTION;
}
} else if (rec_get_info_bits(rec, false) != REC_INFO_MIN_REC_FLAG) {
goto incompatible;
ib::error() << "Table " << index->table->name
<< " contains unrecognizable instant ALTER metadata";
index->table->corrupted = true;
return DB_CORRUPTION;
}
/* Read the metadata. We can get here on server restart
......@@ -628,14 +554,64 @@ btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
concurrent operations on the table, including table eviction
from the cache. */
if (rec_is_alter_metadata(rec, index)) {
if (info_bits & REC_INFO_DELETED_FLAG) {
return btr_cur_instant_init_metadata(rec, index);
}
index->n_core_null_bytes = UT_BITS_IN_BYTES(
index->get_n_nullable(index->n_core_fields));
mem_heap_t* heap = NULL;
ulint* offsets = rec_get_offsets(rec, index, NULL, true,
ULINT_UNDEFINED, &heap);
if (rec_offs_any_default(offsets)) {
inconsistent:
mem_heap_free(heap);
goto incompatible;
}
/* In fact, because we only ever append fields to the metadata
record, it is also OK to perform READ UNCOMMITTED and
then ignore any extra fields, provided that
trx_sys.is_registered(DB_TRX_ID). */
if (rec_offs_n_fields(offsets) > index->n_fields
&& !trx_sys.is_registered(current_trx(),
row_get_rec_trx_id(rec, index,
offsets))) {
goto inconsistent;
}
for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
ulint len;
const byte* data = rec_get_nth_field(rec, offsets, i, &len);
dict_col_t* col = index->fields[i].col;
ut_ad(!col->is_added());
ut_ad(!col->def_val.data);
col->def_val.len = len;
switch (len) {
case UNIV_SQL_NULL:
continue;
case 0:
col->def_val.data = field_ref_zero;
continue;
}
ut_ad(len != UNIV_SQL_DEFAULT);
if (!rec_offs_nth_extern(offsets, i)) {
col->def_val.data = mem_heap_dup(
index->table->heap, data, len);
} else if (len < BTR_EXTERN_FIELD_REF_SIZE
|| !memcmp(data + len - BTR_EXTERN_FIELD_REF_SIZE,
field_ref_zero,
BTR_EXTERN_FIELD_REF_SIZE)) {
col->def_val.len = UNIV_SQL_DEFAULT;
goto inconsistent;
} else {
col->def_val.data = btr_copy_externally_stored_field(
&col->def_val.len, data,
dict_table_page_size(index->table),
len, index->table->heap);
}
}
return btr_cur_instant_init_add_column(rec, index);
mem_heap_free(heap);
return DB_SUCCESS;
}
/** Load the instant ALTER TABLE metadata from the clustered index
......@@ -693,18 +669,46 @@ btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
break;
}
uint16_t n = page_get_instant(page);
const uint16_t n = page_get_instant(page);
if (n < index->n_uniq + DATA_ROLL_PTR) {
/* The PRIMARY KEY (or hidden DB_ROW_ID) and
DB_TRX_ID,DB_ROLL_PTR columns must always be present
as 'core' fields. All fields, including those for
instantly added columns, must be present in the data
dictionary. */
as 'core' fields. */
return true;
}
const rec_t* infimum = page_get_infimum_rec(page);
const rec_t* supremum = page_get_supremum_rec(page);
if (!memcmp(infimum, "infimum", 8)
&& !memcmp(supremum, "supremum", 8)) {
if (n > index->n_fields) {
/* All fields, including those for instantly
added columns, must be present in the
data dictionary. */
return true;
}
index->n_core_fields = n;
ut_ad(!index->is_dummy);
ut_d(index->is_dummy = true);
index->n_core_null_bytes = UT_BITS_IN_BYTES(
index->get_n_nullable(n));
ut_d(index->is_dummy = false);
return false;
}
if (memcmp(infimum, field_ref_zero, 8)
|| memcmp(supremum, field_ref_zero, 7)) {
/* The infimum and supremum records must either contain
the original strings, or they must be filled with zero
bytes, except for the bytes that we have repurposed. */
return true;
}
index->n_core_fields = n;
return false;
index->n_core_null_bytes = supremum[7];
return index->n_core_null_bytes > 128;
}
/** Optimistically latches the leaf page or pages requested.
......@@ -4738,7 +4742,8 @@ btr_cur_pessimistic_update(
&n_ext, entry_heap,
update->info_bits);
ut_ad(new_entry->n_fields
== index->n_fields + update->is_alter_metadata());
== ulint(index->n_fields)
+ update->is_alter_metadata());
} else {
new_entry = row_rec_to_index_entry(rec, index, *offsets,
&n_ext, entry_heap);
......
......@@ -1212,7 +1212,7 @@ void dict_index_t::reconstruct_fields()
if (col_no == 0) {
/* Dropped Column */
temp_fields[i].col = &table->instant->dropped[j++];
ut_ad(i == unsigned(temp_fields[i].col->ind));
ut_ad(temp_fields[i].col->is_dropped());
} else {
field = fields[o++];
temp_fields[i].col = &table->cols[col_no - 1];
......
......@@ -5044,22 +5044,19 @@ static bool innobase_instant_try(
goto func_exit;
}
/* Convert the table to the instant ADD COLUMN format. */
/* Convert the table to the instant ALTER TABLE format. */
ut_ad(user_table->is_instant());
mtr.commit();
mtr.start();
index->set_modified(mtr);
if (page_t* root = btr_root_get(index, &mtr)) {
if (fil_page_get_type(root) != FIL_PAGE_INDEX) {
if (buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, &mtr)) {
if (root->page.encrypted
|| fil_page_get_type(root->frame) != FIL_PAGE_INDEX) {
DBUG_ASSERT(!"wrong page type");
goto err_exit;
}
DBUG_ASSERT(!page_is_comp(root) || !page_get_instant(root));
mlog_write_ulint(root + FIL_PAGE_TYPE,
FIL_PAGE_TYPE_INSTANT, MLOG_2BYTES,
&mtr);
page_set_instant(root, index->n_core_fields, &mtr);
btr_set_instant(root, *index, &mtr);
mtr.commit();
mtr.start();
index->set_modified(mtr);
......
......@@ -421,6 +421,12 @@ void
btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset = false)
MY_ATTRIBUTE((nonnull));
/** Write instant ALTER TABLE metadata to a root page.
This replaces page_set_instant(); the page type is changed to
FIL_PAGE_TYPE_INSTANT as part of the call.
@param[in,out] root clustered index root page
@param[in] index clustered index with instant ALTER TABLE
@param[in,out] mtr mini-transaction */
void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr);
/*************************************************************//**
Makes tree one level higher by splitting the root, and inserts
the tuple. It is assumed that mtr contains an x-latch on the tree.
......
......@@ -1025,13 +1025,6 @@ page_get_direction(const page_t* page)
inline
uint16_t
page_get_instant(const page_t* page);
/** Assign the PAGE_INSTANT field.
@param[in,out] page clustered index root page
@param[in] n original number of clustered index fields
@param[in,out] mtr mini-transaction */
inline
void
page_set_instant(page_t* page, unsigned n, mtr_t* mtr);
/**********************************************************//**
Create an uncompressed B-tree index page.
......
......@@ -1098,24 +1098,6 @@ page_get_instant(const page_t* page)
#endif /* UNIV_DEBUG */
return(i >> 3);
}
/** Store the original number of clustered index fields in PAGE_INSTANT.
The page type must already be FIL_PAGE_TYPE_INSTANT. The field count is
stored above the 3 low-order bits of the header field, whose current
contents are preserved.
@param[in,out]	page	clustered index root page
@param[in]	n	original number of clustered index fields
@param[in,out]	mtr	mini-transaction */
inline
void
page_set_instant(page_t* page, unsigned n, mtr_t* mtr)
{
	ut_ad(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT);
	ut_ad(n > 0);
	ut_ad(n < REC_MAX_N_FIELDS);

	page_t* const	field = page + (PAGE_HEADER + PAGE_INSTANT);
	uint16_t	value = page_header_get_field(page, PAGE_INSTANT);

	/* Only the low-order bits may be in use before this call. */
	ut_ad(value <= PAGE_NO_DIRECTION);
	value |= n << 3;

	mlog_write_ulint(field, value, MLOG_2BYTES, mtr);
}
#endif /* !UNIV_INNOCHECKSUM */
#ifdef UNIV_MATERIALIZE
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment