Commit a1661efb authored by marko

branches/zip: Initial steps towards disk-based storage of compressed pages.

dict_mem_table_create(): Account for DICT_TF_COMPRESSED in a debug assertion.

btr_store_big_rec_extern_fields(), btr_free_externally_stored_field(),
btr_copy_externally_stored_field(): Implement the disk format for
compressed BLOB pages.

btr_copy_externally_stored_field(): Improve error reporting and handling
when decompressing BLOB pages.

buf_flush_init_for_writing(), buf_page_is_corrupted(), buf_page_print():
Account for compressed BLOB pages (FIL_PAGE_TYPE_ZBLOB).

buf_calc_zblob_page_checksum(): New function.
parent 7b57df22
......@@ -3575,9 +3575,6 @@ btr_store_big_rec_extern_fields(
return(DB_OUT_OF_FILE_SPACE);
}
mlog_write_ulint(page + FIL_PAGE_TYPE,
FIL_PAGE_TYPE_BLOB, MLOG_2BYTES, &mtr);
page_no = buf_frame_get_page_no(page);
if (prev_page_no != FIL_NULL) {
......@@ -3592,7 +3589,7 @@ btr_store_big_rec_extern_fields(
#endif /* UNIV_SYNC_DEBUG */
if (UNIV_LIKELY_NULL(page_zip)) {
next_ptr = prev_page;
next_ptr = prev_page + FIL_PAGE_NEXT;
} else {
next_ptr = prev_page + FIL_PAGE_DATA
+ BTR_BLOB_HDR_NEXT_PAGE_NO;
......@@ -3605,8 +3602,13 @@ btr_store_big_rec_extern_fields(
if (UNIV_LIKELY_NULL(page_zip)) {
int err;
c_stream.next_out = page + 4;
c_stream.avail_out = UNIV_PAGE_SIZE - 4;
mach_write_to_4(page + FIL_PAGE_TYPE,
FIL_PAGE_TYPE_ZBLOB);
c_stream.next_out = page
+ FIL_PAGE_FILE_FLUSH_LSN;
c_stream.avail_out = page_zip->size
- FIL_PAGE_FILE_FLUSH_LSN;
err = deflate(&c_stream, Z_FINISH);
ut_a(err == Z_OK || err == Z_STREAM_END);
......@@ -3614,12 +3616,14 @@ btr_store_big_rec_extern_fields(
|| c_stream.avail_out == 0);
/* Write the "next BLOB page" pointer */
mach_write_to_4(page, FIL_NULL);
mlog_write_ulint(page + FIL_PAGE_NEXT,
FIL_NULL, MLOG_4BYTES, &mtr);
/* Zero out the unused part of the page. */
memset(page + UNIV_PAGE_SIZE
memset(page + page_zip->size
- c_stream.avail_out,
0, c_stream.avail_out);
mlog_log_string(page, UNIV_PAGE_SIZE, &mtr);
mlog_log_string(page + FIL_PAGE_TYPE,
page_zip->size - FIL_PAGE_TYPE, &mtr);
if (err == Z_OK && prev_page_no != FIL_NULL) {
......@@ -3663,7 +3667,7 @@ btr_store_big_rec_extern_fields(
mlog_write_ulint(field_ref
+ BTR_EXTERN_OFFSET,
FIL_PAGE_DATA,
FIL_PAGE_NEXT,
MLOG_4BYTES, &mtr);
}
......@@ -3680,6 +3684,10 @@ btr_store_big_rec_extern_fields(
break;
}
} else {
mlog_write_ulint(page + FIL_PAGE_TYPE,
FIL_PAGE_TYPE_BLOB,
MLOG_2BYTES, &mtr);
if (extern_len > (UNIV_PAGE_SIZE
- FIL_PAGE_DATA
- BTR_BLOB_HDR_SIZE
......@@ -3851,7 +3859,7 @@ btr_free_externally_stored_field(
if (dict_table_is_zip(index->table)) {
/* Note that page_zip will be NULL
in row_purge_upd_exist_or_extern(). */
next_page_no = mach_read_from_4(page);
next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
btr_page_free_low(index->tree, page,
space_id, page_no, 0, &mtr);
......@@ -4015,7 +4023,8 @@ btr_copy_externally_stored_field(
byte* data, /* in: 'internally' stored part of the
field containing also the reference to
the external part */
ibool zip, /* in: TRUE=compressed BLOB */
ulint zip_size,/* in: nonzero=compressed BLOB page size,
zero for uncompressed BLOBs */
ulint local_len,/* in: length of data */
mem_heap_t* heap) /* in: mem heap */
{
......@@ -4055,7 +4064,7 @@ btr_copy_externally_stored_field(
return(buf);
}
if (UNIV_UNLIKELY(zip)) {
if (UNIV_UNLIKELY(zip_size)) {
int err;
d_stream.zalloc = (alloc_func) 0;
d_stream.zfree = (free_func) 0;
......@@ -4076,12 +4085,36 @@ btr_copy_externally_stored_field(
#ifdef UNIV_SYNC_DEBUG
buf_page_dbg_add_level(page, SYNC_EXTERN_STORAGE);
#endif /* UNIV_SYNC_DEBUG */
if (UNIV_UNLIKELY(zip)) {
if (UNIV_UNLIKELY(zip_size)) {
int err;
d_stream.next_in = page + 4;
d_stream.avail_in = UNIV_PAGE_SIZE - 4;/* TODO */
page_no = mach_read_from_4(page);
if (UNIV_UNLIKELY(fil_page_get_type(page)
!= FIL_PAGE_TYPE_ZBLOB)) {
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: Unknown type %lu of compressed BLOB page %lu space %lu\n",
(ulong) fil_page_get_type(page),
(ulong) page_no, (ulong) space_id);
}
page_no = mach_read_from_4(page + offset);
if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
/* When the BLOB begins at page header,
the compressed data payload does not
immediately follow the next page pointer. */
offset = FIL_PAGE_FILE_FLUSH_LSN;
} else {
offset += 4;
}
d_stream.next_in = page + offset;
d_stream.avail_in = zip_size - offset;
/* On all BLOB pages except the first one,
the BLOB header is always at the page header: */
offset = FIL_PAGE_NEXT;
err = inflate(&d_stream, Z_NO_FLUSH);
switch (err) {
......@@ -4095,16 +4128,23 @@ btr_copy_externally_stored_field(
default:
mtr_commit(&mtr);
inflateEnd(&d_stream);
ut_error;/* TODO: report error */
inflate_error:
ut_print_timestamp(stderr);
fprintf(stderr,
" InnoDB: inflate() of compressed BLOB page %lu space %lu returned %d\n",
(ulong) page_no, (ulong) space_id,
err);
*len = 0;
return(buf);
}
if (page_no == FIL_NULL) {
err = inflate(&d_stream, Z_FINISH);
/* TODO: report error instead of
assertion failure? */
ut_a(err == Z_STREAM_END);
end_of_blob:
if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
goto inflate_error;
}
end_of_blob:
mtr_commit(&mtr);
ut_a(!d_stream.avail_out);
......@@ -4160,8 +4200,9 @@ btr_rec_copy_externally_stored_field(
ulint* len, /* out: length of the field */
mem_heap_t* heap) /* in: mem heap */
{
ulint local_len;
byte* data;
ulint local_len;
byte* data;
page_zip_des_t* page_zip;
ut_ad(rec_offs_validate(rec, NULL, offsets));
ut_a(rec_offs_nth_extern(offsets, no));
......@@ -4177,7 +4218,9 @@ btr_rec_copy_externally_stored_field(
data = rec_get_nth_field(rec, offsets, no, &local_len);
page_zip = buf_block_get_page_zip(buf_block_align(rec));
return(btr_copy_externally_stored_field(len, data,
!!buf_block_get_page_zip(buf_block_align(rec)),
page_zip ? page_zip->size : 0,
local_len, heap));
}
......@@ -232,6 +232,22 @@ ibool buf_debug_prints = FALSE; /* If this is set TRUE,
the program prints info whenever
read-ahead or flush occurs */
#endif /* UNIV_DEBUG */
/************************************************************************
Calculates a compressed BLOB page checksum which is stored to the page
when it is written to a file. Note that we must be careful to calculate
the same value on 32-bit and 64-bit architectures.

The 4-byte checksum field at FIL_PAGE_SPACE_OR_CHKSUM is excluded from
the computation. The result of this function is stored into that very
field after it has been computed, so folding the field into the checksum
would make the stored value differ from the value that is recomputed
when the page is later validated. */

ulint
buf_calc_zblob_page_checksum(
/*=========================*/
				/* out: checksum; only the low 32 bits
				are significant */
	const byte*	page,	/* in: compressed BLOB page */
	ulint		zip_size)/* in: size of the page, in bytes;
				must be greater than the offset of the
				first byte after the checksum field */
{
	/* Width in bytes of the stored checksum field. */
	const ulint	checksum_field_len = 4;
	/* First byte covered by the checksum: the one right after
	the checksum field. */
	const ulint	start = FIL_PAGE_SPACE_OR_CHKSUM + checksum_field_len;

	/* Mask the result so that 64-bit builds yield the same value
	as 32-bit builds. */
	return(ut_fold_binary(page + start, zip_size - start)
	       & 0xFFFFFFFFUL);
}
/************************************************************************
Calculates a page checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value on
......@@ -293,18 +309,19 @@ ibool
buf_page_is_corrupted(
/*==================*/
/* out: TRUE if corrupted */
byte* read_buf) /* in: a database page */
byte* read_buf, /* in: a database page */
ulint zip_size) /* in: size of compressed page;
0 for uncompressed pages */
{
ulint checksum;
ulint old_checksum;
ulint checksum_field;
ulint old_checksum_field;
#ifndef UNIV_HOTBACKUP
dulint current_lsn;
#endif
if (mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
!= mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4)) {
if (UNIV_LIKELY(!zip_size)
&& memcmp(read_buf + FIL_PAGE_LSN + 4,
read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
/* Stored log sequence numbers at the start and the end
of page do not match */
......@@ -341,8 +358,16 @@ buf_page_is_corrupted(
BUF_NO_CHECKSUM_MAGIC which might be stored by InnoDB with checksums
disabled. Otherwise, skip checksum calculation and return FALSE */
if (srv_use_checksums) {
old_checksum = buf_calc_page_old_checksum(read_buf);
if (UNIV_LIKELY(srv_use_checksums)) {
checksum_field = mach_read_from_4(read_buf
+ FIL_PAGE_SPACE_OR_CHKSUM);
if (UNIV_UNLIKELY(zip_size)) {
return(checksum_field != BUF_NO_CHECKSUM_MAGIC
&& checksum_field
!= buf_calc_zblob_page_checksum(
read_buf, zip_size));
}
old_checksum_field = mach_read_from_4(read_buf + UNIV_PAGE_SIZE
- FIL_PAGE_END_LSN_OLD_CHKSUM);
......@@ -357,21 +382,20 @@ buf_page_is_corrupted(
if (old_checksum_field != mach_read_from_4(read_buf
+ FIL_PAGE_LSN)
&& old_checksum_field != old_checksum
&& old_checksum_field != BUF_NO_CHECKSUM_MAGIC) {
&& old_checksum_field != BUF_NO_CHECKSUM_MAGIC
&& old_checksum_field
!= buf_calc_page_old_checksum(read_buf)) {
return(TRUE);
}
checksum = buf_calc_page_new_checksum(read_buf);
checksum_field = mach_read_from_4(read_buf +
FIL_PAGE_SPACE_OR_CHKSUM);
/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
if (checksum_field != 0 && checksum_field != checksum
&& checksum_field != BUF_NO_CHECKSUM_MAGIC) {
if (checksum_field != 0
&& checksum_field != BUF_NO_CHECKSUM_MAGIC
&& checksum_field
!= buf_calc_page_new_checksum(read_buf)) {
return(TRUE);
}
......@@ -398,6 +422,7 @@ buf_page_print(
ut_print_buf(stderr, read_buf, UNIV_PAGE_SIZE);
fputs("InnoDB: End of page dump\n", stderr);
/* TODO: print zipped pages differently, esp. BLOB pages */
checksum = srv_use_checksums ?
buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
old_checksum = srv_use_checksums ?
......@@ -489,6 +514,10 @@ buf_page_print(
fputs("InnoDB: Page may be a BLOB page\n",
stderr);
break;
case FIL_PAGE_TYPE_ZBLOB:
fputs("InnoDB: Page may be a compressed BLOB page\n",
stderr);
break;
}
}
......@@ -1885,7 +1914,8 @@ buf_page_io_complete(
/* From version 3.23.38 up we store the page checksum
to the 4 first bytes of the page end lsn field */
if (buf_page_is_corrupted(block->frame)) {
if (buf_page_is_corrupted(block->frame,
block->space ? 16384 : 0/* TODO */)) {
fprintf(stderr,
"InnoDB: Database page corruption on disk or a failed\n"
"InnoDB: file read of page %lu.\n", (ulong) block->offset);
......
......@@ -455,6 +455,25 @@ buf_flush_init_for_writing(
{
page_zip_des_t* page_zip = page_zip_;
if (space/* TODO: space_is_zip */) {
switch (fil_page_get_type(page)) {
case FIL_PAGE_TYPE_ZBLOB:
ut_ad(!page_zip);
mach_write_to_4(page + FIL_PAGE_OFFSET, page_no);
mach_write_to_4(page + FIL_PAGE_PREV, space);
mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
srv_use_checksums
? buf_calc_zblob_page_checksum(
page, 16384/* TODO */)
: BUF_NO_CHECKSUM_MAGIC);
return;
case FIL_PAGE_INDEX:
/* TODO: special handling */
break;
}
}
/* Write the newest modification lsn to the page header and trailer */
mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
......
......@@ -42,7 +42,7 @@ dict_mem_table_create(
mem_heap_t* heap;
ut_ad(name);
ut_ad(!(flags & ~DICT_TF_COMPACT));
ut_ad(!(flags & ~(DICT_TF_COMPACT | DICT_TF_COMPRESSED)));
heap = mem_heap_create(DICT_HEAP_SIZE);
......
......@@ -207,8 +207,7 @@ btr_page_reorganize(
/* out: TRUE on success, FALSE on failure */
page_t* page, /* in: page to be reorganized */
dict_index_t* index, /* in: record descriptor */
mtr_t* mtr) /* in: mtr */
__attribute__((nonnull, warn_unused_result));
mtr_t* mtr); /* in: mtr */
/*****************************************************************
Decides if the page should be split at the convergence point of
inserts converging to left. */
......
......@@ -390,6 +390,17 @@ buf_block_get_modify_clock(
/* out: value */
buf_block_t* block); /* in: block */
/************************************************************************
Calculates a compressed BLOB page checksum which is stored to the page
when it is written to a file. Note that we must be careful to calculate
the same value on 32-bit and 64-bit architectures; for that reason the
result is masked to its low 32 bits. */
ulint
buf_calc_zblob_page_checksum(
/*=========================*/
/* out: checksum, masked to 32 bits */
const byte* page, /* in: compressed BLOB page */
ulint zip_size); /* in: size of the page, in bytes; the checksum
covers the page contents up to this size */
/************************************************************************
Calculates a page checksum which is stored to the page when it is written
to a file. Note that we must be careful to calculate the same value
on 32-bit and 64-bit architectures. */
......@@ -419,7 +430,9 @@ ibool
buf_page_is_corrupted(
/*==================*/
/* out: TRUE if corrupted */
byte* read_buf); /* in: a database page */
byte* read_buf, /* in: a database page */
ulint zip_size); /* in: size of compressed page;
0 for uncompressed pages */
/**************************************************************************
Gets the page number of a pointer pointing within a buffer frame containing
a file page. */
......
......@@ -105,6 +105,7 @@ extern fil_addr_t fil_addr_null;
#define FIL_PAGE_TYPE_FSP_HDR 8 /* File space header */
#define FIL_PAGE_TYPE_XDES 9 /* Extent descriptor page */
#define FIL_PAGE_TYPE_BLOB 10 /* Uncompressed BLOB page */
#define FIL_PAGE_TYPE_ZBLOB 11 /* Compressed BLOB page */
/* Space types */
#define FIL_TABLESPACE 501
......
......@@ -968,7 +968,8 @@ page_zip_fields_decode(
return(NULL);
}
table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n, TRUE);
table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
DICT_TF_COMPACT | DICT_TF_COMPRESSED);
index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
DICT_HDR_SPACE, 0, n);
index->table = table;
......
......@@ -465,13 +465,21 @@ trx_sys_doublewrite_init_or_restore_pages(
/* It is an unwritten doublewrite buffer page:
do nothing */
} else {
ulint zip_size;
/* Read in the actual page from the data files */
fil_io(OS_FILE_READ, TRUE, space_id, page_no, 0,
UNIV_PAGE_SIZE, read_buf, NULL);
/* Check if the page is corrupt */
if (buf_page_is_corrupted(read_buf)) {
if (space_id && fil_page_get_type(read_buf)
== FIL_PAGE_TYPE_ZBLOB) {
zip_size = 16384; /* TODO */
} else {
zip_size = 0;
}
if (buf_page_is_corrupted(read_buf, zip_size)) {
fprintf(stderr,
"InnoDB: Warning: database page corruption or a failed\n"
......@@ -479,7 +487,7 @@ trx_sys_doublewrite_init_or_restore_pages(
fprintf(stderr,
"InnoDB: Trying to recover it from the doublewrite buffer.\n");
if (buf_page_is_corrupted(page)) {
if (buf_page_is_corrupted(page, zip_size)) {
fprintf(stderr,
"InnoDB: Dump of the page:\n");
buf_page_print(read_buf);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment