Commit 87217329 authored by Zardosht Kasheff's avatar Zardosht Kasheff Committed by Yoni Fogel

refs #5842, merge Direct I/O to main

git-svn-id: file:///svn/toku/tokudb@52238 c7de825b-a66e-492c-adef-691d508d4ae1
parent dfe8a560
......@@ -693,6 +693,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
printf("int log_compare (const DB_LSN*, const DB_LSN *) %s;\n", VISIBLE);
printf("int toku_set_trace_file (const char *fname) %s;\n", VISIBLE);
printf("int toku_close_trace_file (void) %s;\n", VISIBLE);
printf("void db_env_set_direct_io (bool direct_io_on) %s;\n", VISIBLE);
printf("void db_env_set_func_fsync (int (*)(int)) %s;\n", VISIBLE);
printf("void db_env_set_func_free (void (*)(void*)) %s;\n", VISIBLE);
printf("void db_env_set_func_malloc (void *(*)(size_t)) %s;\n", VISIBLE);
......
......@@ -53,6 +53,9 @@ include(CheckSymbolExists)
check_symbol_exists(M_MMAP_THRESHOLD "malloc.h" HAVE_M_MMAP_THRESHOLD)
## check whether we have CLOCK_REALTIME
check_symbol_exists(CLOCK_REALTIME "time.h" HAVE_CLOCK_REALTIME)
## check how to do direct I/O
check_symbol_exists(O_DIRECT "fcntl.h" HAVE_O_DIRECT)
check_symbol_exists(F_NOCACHE "fcntl.h" HAVE_F_NOCACHE)
include(CheckFunctionExists)
......
......@@ -108,7 +108,7 @@ set_ldflags_if_supported(
## set extra debugging flags and preprocessor definitions
set(CMAKE_C_FLAGS_DEBUG "-g3 -O0 ${CMAKE_C_FLAGS_DEBUG}")
set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0 ${CMAKE_CXX_FLAGS_DEBUG}")
set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2)
#set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2)
## set extra release flags
if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)
......
......@@ -58,6 +58,7 @@ block_allocator_print (BLOCK_ALLOCATOR ba) {
void
create_block_allocator (BLOCK_ALLOCATOR *ba, uint64_t reserve_at_beginning, uint64_t alignment) {
assert(alignment>=512 && 0==(alignment%512)); // the alignment must be at least 512 and aligned with 512 to make DIRECT_IO happy.
BLOCK_ALLOCATOR XMALLOC(result);
result->reserve_at_beginning = reserve_at_beginning;
result->alignment = alignment;
......@@ -176,8 +177,9 @@ align (uint64_t value, BLOCK_ALLOCATOR ba)
return ((value+ba->alignment-1)/ba->alignment)*ba->alignment;
}
void
block_allocator_alloc_block (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset) {
void block_allocator_alloc_block(BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset)
// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
{
invariant(size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block.
grow_blocks_array(ba);
ba->n_bytes_in_use += size;
......
......@@ -459,9 +459,11 @@ pair_is_unallocated(struct block_translation_pair *pair) {
return pair->size == 0 && pair->u.diskoff == diskoff_unused;
}
// Purpose of this function is to figure out where to put the inprogress btt on disk, allocate space for it there.
static void
blocknum_alloc_translation_on_disk_unlocked (BLOCK_TABLE bt) {
static void blocknum_alloc_translation_on_disk_unlocked(BLOCK_TABLE bt)
// Effect: figure out where to put the inprogress btt on disk, allocate space for it there.
// The space must be 512-byte aligned (both the starting address and the size).
// As a result, the allocated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt.
{
toku_mutex_assert_locked(&bt->mutex);
struct translation *t = &bt->inprogress;
......@@ -479,24 +481,29 @@ PRNTF("blokAllokator", 1L, size, offset, bt);
t->block_translation[b.b].size = size;
}
//Fills wbuf with bt
//A clean shutdown runs checkpoint start so that current and inprogress are copies.
void
toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
int64_t *address, int64_t *size) {
void toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
int64_t *address, int64_t *size)
// Effect: Fills wbuf (which starts uninitialized) with bt
// A clean shutdown runs checkpoint start so that current and inprogress are copies.
// The resulting wbuf buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
// It *is* guaranteed that we can read up to the next 512-byte boundary, however
{
lock_for_blocktable(bt);
struct translation *t = &bt->inprogress;
BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
blocknum_alloc_translation_on_disk_unlocked(bt);
blocknum_alloc_translation_on_disk_unlocked(bt); // The allocated block must be 512-byte aligned to make O_DIRECT happy.
uint64_t size_translation = calculate_size_on_disk(t);
uint64_t size_aligned = roundup_to_multiple(512, size_translation);
assert((int64_t)size_translation==t->block_translation[b.b].size);
{
//Init wbuf
uint64_t size_translation = calculate_size_on_disk(t);
assert((int64_t)size_translation==t->block_translation[b.b].size);
if (0)
printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff);
wbuf_init(w, toku_malloc(size_translation), size_translation);
assert(w->size==size_translation);
char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros.
wbuf_init(w, buf, size_aligned);
}
wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum);
wbuf_BLOCKNUM(w, t->blocknum_freelist_head);
......@@ -510,9 +517,10 @@ toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
uint32_t checksum = x1764_finish(&w->checksum);
wbuf_int(w, checksum);
*address = t->block_translation[b.b].u.diskoff;
*size = t->block_translation[b.b].size;
*size = size_translation;
assert((*address)%512 == 0);
ensure_safe_write_unlocked(bt, fd, *size, *address);
ensure_safe_write_unlocked(bt, fd, size_aligned, *address);
unlock_for_blocktable(bt);
}
......
......@@ -3397,6 +3397,20 @@ int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *ft_handle_
return r;
}
// Module-wide switch consulted when ft files are opened.
// Starts out enabled; change it through toku_ft_set_direct_io().
static bool use_direct_io = true;

// Effect: Record whether files opened from now on should go through the
//  direct-I/O open path (true) or the ordinary open path (false).
// Note: this only stores the flag; it does nothing to files that are already open.
void
toku_ft_set_direct_io (bool direct_io_on)
{
    use_direct_io = direct_io_on;
}
// Effect: Open FILENAME with the given flags and mode, choosing the opener
//  according to the use_direct_io switch: toku_os_open_direct() when the
//  switch is on, toku_os_open() otherwise.
// Returns: whatever the chosen opener returns.
static inline int ft_open_maybe_direct(const char *filename, int oflag, int mode) {
    return use_direct_io ? toku_os_open_direct(filename, oflag, mode)
                         : toku_os_open(filename, oflag, mode);
}
// open a file for use by the brt
// Requires: File does not exist.
static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
......@@ -3404,12 +3418,12 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
int r;
int fd;
int er;
fd = open(fname, O_RDWR | O_BINARY, mode);
fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
assert(fd==-1);
if ((er = get_maybe_error_errno()) != ENOENT) {
return er;
}
fd = open(fname, O_RDWR | O_CREAT | O_BINARY, mode);
fd = ft_open_maybe_direct(fname, O_RDWR | O_CREAT | O_BINARY, mode);
if (fd==-1) {
r = get_error_errno();
return r;
......@@ -3426,7 +3440,7 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
static int ft_open_file(const char *fname, int *fdp) {
mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO;
int fd;
fd = open(fname, O_RDWR | O_BINARY, mode);
fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
if (fd==-1) {
return get_error_errno();
}
......
......@@ -257,4 +257,5 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);
extern bool garbage_collection_debug;
void toku_ft_set_direct_io(bool direct_io_on);
#endif
......@@ -39,8 +39,11 @@ void
toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset) {
// make the checksum
int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
int64_t size_aligned = roundup_to_multiple(512, size);
struct wbuf w;
wbuf_init(&w, toku_xmalloc(size), size);
char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
wbuf_init(&w, aligned_buf, size);
toku_serialize_descriptor_contents_to_wbuf(&w, desc);
{
//Add checksum
......@@ -50,7 +53,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF
lazy_assert(w.ndone==w.size);
{
//Actual Write translation table
toku_os_full_pwrite(fd, w.buf, size, offset);
toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
}
toku_free(w.buf);
}
......@@ -88,10 +91,12 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
if (size > 0) {
lazy_assert(size>=4); //4 for checksum
{
XMALLOC_N(size, dbuf);
ssize_t size_to_malloc = roundup_to_multiple(512, size);
XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
{
ssize_t sz_read = toku_os_pread(fd, dbuf, size, offset);
lazy_assert(sz_read==size);
ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
lazy_assert(sz_read==size_to_malloc);
}
{
// check the checksum
......@@ -118,9 +123,9 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
return r;
}
// We only deserialize brt header once and then share everything with all the brts.
int
deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
// Effect: Deserialize the ft header.
// We deserialize brt header only once and then share everything with all the brts.
{
int r;
FT ft = NULL;
......@@ -179,14 +184,16 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
//Load translation table
{
unsigned char *XMALLOC_N(translation_size_on_disk, tbuf);
size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
{
// This cast is messed up in 32-bits if the block translation
// table is ever more than 4GB. But in that case, the
// translation table itself won't fit in main memory.
ssize_t readsz = toku_os_pread(fd, tbuf, translation_size_on_disk,
ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
translation_address_on_disk);
lazy_assert(readsz == translation_size_on_disk);
assert(readsz >= translation_size_on_disk);
assert(readsz <= (ssize_t)size_to_read);
}
// Create table and read in data.
r = toku_blocktable_create_from_buffer(fd,
......@@ -427,28 +434,30 @@ serialize_ft_min_size (uint32_t version) {
return size;
}
// Simply reading the raw bytes of the header into an rbuf is insensitive
// to disk format version. If that ever changes, then modify this.
int deserialize_ft_from_fd_into_rbuf(int fd,
toku_off_t offset_of_header,
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
// Effect: Read and parse the header of a fractal tree
//
// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
// file AND the header is useless
int
deserialize_ft_from_fd_into_rbuf(int fd,
toku_off_t offset_of_header,
struct rbuf *rb,
uint64_t *checkpoint_count,
LSN *checkpoint_lsn,
uint32_t * version_p)
// Simply reading the raw bytes of the header into an rbuf is insensitive
// to disk format version. If that ever changes, then modify this.
//
// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
// file AND the header is useless
{
int r = 0;
const int64_t prefix_size = 8 + // magic ("tokudata")
4 + // version
4 + // build_id
4; // size
unsigned char prefix[prefix_size];
const int64_t read_size = roundup_to_multiple(512, prefix_size);
unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
rb->buf = NULL;
int64_t n = toku_os_pread(fd, prefix, prefix_size, offset_of_header);
if (n != prefix_size) {
int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
if (n != read_size) {
if (n==0) {
r = TOKUDB_DICTIONARY_NO_HEADER;
} else if (n<0) {
......@@ -504,16 +513,21 @@ deserialize_ft_from_fd_into_rbuf(int fd,
lazy_assert(rb->ndone==prefix_size);
rb->size = size;
XMALLOC_N(rb->size, rb->buf);
n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header);
if (n != rb->size) {
if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
{
toku_free(rb->buf);
uint32_t size_to_read = roundup_to_multiple(512, size);
XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
assert(offset_of_header%512==0);
n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
if (n != size_to_read) {
if (n < 0) {
r = get_error_errno();
} else {
r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
}
goto exit;
}
goto exit;
}
//It's version 14 or later. Magic looks OK.
//We have an rbuf that represents the header.
......@@ -549,9 +563,7 @@ deserialize_ft_from_fd_into_rbuf(int fd,
exit:
if (r != 0 && rb->buf != NULL) {
if (rb->buf != prefix) { // don't free prefix, it's stack alloc'd
toku_free(rb->buf);
}
toku_free(rb->buf);
rb->buf = NULL;
}
return r;
......@@ -718,16 +730,23 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
toku_serialize_translation_to_wbuf(blocktable, fd, &w_translation,
&address_translation,
&size_translation);
lazy_assert(size_translation == w_translation.size);
assert(size_translation == w_translation.ndone); // the bytes written are the size
assert(w_translation.size % 512 == 0); // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
struct wbuf w_main;
size_t size_main = toku_serialize_ft_size(h);
wbuf_init(&w_main, toku_xmalloc(size_main), size_main);
size_t size_main = toku_serialize_ft_size(h);
size_t size_main_aligned = roundup_to_multiple(512, size_main);
assert(size_main_aligned<BLOCK_ALLOCATOR_HEADER_RESERVE);
char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
wbuf_init(&w_main, mainbuf, size_main);
toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
lazy_assert(w_main.ndone == size_main);
//Actual Write translation table
toku_os_full_pwrite(fd, w_translation.buf, size_translation, address_translation);
// Actually write translation table
// This write is guaranteed to read good data at the end of the buffer, since the
// w_translation.buf is padded with zeros to a 512-byte boundary.
toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
//Everything but the header MUST be on disk before header starts.
//Otherwise we will think the header is good and some blocks might not
......@@ -746,7 +765,7 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
// Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
toku_off_t main_offset;
main_offset = (h->checkpoint_count & 0x1) ? 0 : BLOCK_ALLOCATOR_HEADER_RESERVE;
toku_os_full_pwrite(fd, w_main.buf, w_main.ndone, main_offset);
toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
toku_free(w_main.buf);
toku_free(w_translation.buf);
}
......@@ -134,7 +134,9 @@ toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int
to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width);
}
if (to_write > 0) {
char *XCALLOC_N(to_write, wbuf);
assert(to_write%512==0);
char *XMALLOC_N_ALIGNED(512, to_write, wbuf);
memset(wbuf, 0, to_write);
toku_off_t start_write = alignup64(file_size, stripe_width);
invariant(start_write >= file_size);
toku_os_full_pwrite(fd, wbuf, to_write, start_write);
......@@ -773,20 +775,21 @@ serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb,
st->compress_time += t2 - t1;
}
// Writes out each child to a separate malloc'd buffer, then compresses
// all of them, and writes the uncompressed header, to bytes_to_write,
// which is malloc'd.
int toku_serialize_ftnode_to_memory(FTNODE node,
FTNODE_DISK_DATA* ndd,
unsigned int basementnodesize,
enum toku_compression_method compression_method,
bool do_rebalancing,
bool in_parallel, // for loader is true, for toku_ftnode_flush_callback, is false
/*out*/ size_t *n_bytes_to_write,
/*out*/ size_t *n_uncompressed_bytes,
/*out*/ char **bytes_to_write)
// Effect: Writes out each child to a separate malloc'd buffer, then compresses
// all of them, and writes the uncompressed header, to bytes_to_write,
// which is malloc'd.
//
int
toku_serialize_ftnode_to_memory (FTNODE node,
FTNODE_DISK_DATA* ndd,
unsigned int basementnodesize,
enum toku_compression_method compression_method,
bool do_rebalancing,
bool in_parallel, // for loader is true, for toku_ftnode_flush_callback, is false
/*out*/ size_t *n_bytes_to_write,
/*out*/ size_t *n_uncompressed_bytes,
/*out*/ char **bytes_to_write)
// The resulting buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
// 512-byte padding is for O_DIRECT to work.
{
toku_assert_entire_node_in_memory(node);
......@@ -849,7 +852,9 @@ toku_serialize_ftnode_to_memory (FTNODE node,
total_uncompressed_size += sb[i].uncompressed_size + 4;
}
char *XMALLOC_N(total_node_size, data);
uint32_t total_buffer_size = roundup_to_multiple(512, total_node_size); // make the buffer be 512 bytes.
char *XMALLOC_N_ALIGNED(512, total_buffer_size, data);
char *curr_ptr = data;
// now create the final serialized node
......@@ -874,9 +879,14 @@ toku_serialize_ftnode_to_memory (FTNODE node,
*(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum);
curr_ptr += sizeof(sb[i].xsum);
}
// Zero the rest of the buffer
for (uint32_t i=total_node_size; i<total_buffer_size; i++) {
data[i]=0;
}
assert(curr_ptr - data == total_node_size);
*bytes_to_write = data;
*n_bytes_to_write = total_node_size;
*n_bytes_to_write = total_buffer_size;
*n_uncompressed_bytes = total_uncompressed_size;
//
......@@ -890,6 +900,8 @@ toku_serialize_ftnode_to_memory (FTNODE node,
toku_free(sb[i].uncompressed_ptr);
}
assert(0 == (*n_bytes_to_write)%512);
assert(0 == ((unsigned long long)(*bytes_to_write))%512);
toku_free(sb);
return 0;
}
......@@ -1152,11 +1164,13 @@ void read_block_from_fd_into_rbuf(
// get the file offset and block size for the block
DISKOFF offset, size;
toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, &size);
uint8_t *XMALLOC_N(size, raw_block);
DISKOFF size_aligned = roundup_to_multiple(512, size);
uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
rbuf_init(rb, raw_block, size);
// read the block
ssize_t rlen = toku_os_pread(fd, raw_block, size, offset);
lazy_assert((DISKOFF)rlen == size);
ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
assert((DISKOFF)rlen >= size);
assert((DISKOFF)rlen <= size_aligned);
}
static const int read_header_heuristic_max = 32*1024;
......@@ -1170,8 +1184,8 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN
{
DISKOFF offset, size;
toku_translate_blocknum_to_offset_size(ft->blocktable, blocknum, &offset, &size);
DISKOFF read_size = MIN(read_header_heuristic_max, size);
uint8_t *XMALLOC_N(size, raw_block);
DISKOFF read_size = roundup_to_multiple(512, MIN(read_header_heuristic_max, size));
uint8_t *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, size), raw_block);
rbuf_init(rb, raw_block, read_size);
// read the block
......@@ -2418,14 +2432,20 @@ toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, i
uint32_t curr_size = BP_SIZE (ndd, childnum);
struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};
uint8_t *XMALLOC_N(curr_size, raw_block);
rbuf_init(&rb, raw_block, curr_size);
uint32_t pad_at_beginning = (node_offset+curr_offset)%512;
uint32_t padded_size = roundup_to_multiple(512, pad_at_beginning + curr_size);
uint8_t *XMALLOC_N_ALIGNED(512, padded_size, raw_block);
rbuf_init(&rb, pad_at_beginning+raw_block, curr_size);
tokutime_t t0 = toku_time_now();
// read
ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset);
lazy_assert((DISKOFF)rlen == curr_size);
// read the block
assert(0==((unsigned long long)raw_block)%512); // for O_DIRECT
assert(0==(padded_size)%512);
assert(0==(node_offset+curr_offset-pad_at_beginning)%512);
ssize_t rlen = toku_os_pread(fd, raw_block, padded_size, node_offset+curr_offset-pad_at_beginning);
assert((DISKOFF)rlen >= pad_at_beginning + curr_size); // we read in at least enough to get what we wanted
assert((DISKOFF)rlen <= padded_size); // we didn't read in too much.
tokutime_t t1 = toku_time_now();
......@@ -2627,12 +2647,14 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
struct sub_block sub_block[/*n_sub_blocks*/],
enum toku_compression_method method,
/*out*/ size_t *n_bytes_to_write,
/*out*/ char **bytes_to_write) {
/*out*/ char **bytes_to_write)
// Guarantees that the malloc'd BYTES_TO_WRITE is 512-byte aligned (so that O_DIRECT will work)
{
// allocate space for the compressed uncompressed_buf
size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method);
size_t sub_block_header_len = sub_block_header_size(n_sub_blocks);
size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum
char *XMALLOC_N(header_len + compressed_len, compressed_buf);
char *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, header_len + compressed_len), compressed_buf);
// copy the header
memcpy(compressed_buf, uncompressed_buf, node_header_overhead);
......@@ -2662,7 +2684,12 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
uint32_t xsum = x1764_memory(compressed_buf, header_length);
*ptr = toku_htod32(xsum);
*n_bytes_to_write = header_len + compressed_len;
uint32_t padded_len = roundup_to_multiple(512, header_len + compressed_len);
// Zero out padding.
for (uint32_t i = header_len+compressed_len; i < padded_len; i++) {
compressed_buf[i] = 0;
}
*n_bytes_to_write = padded_len;
*bytes_to_write = compressed_buf;
}
......@@ -2933,11 +2960,13 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
int r = 0;
if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
uint8_t *XMALLOC_N(size, raw_block);
DISKOFF size_aligned = roundup_to_multiple(512, size);
uint8_t *XMALLOC_N_ALIGNED(512, size, raw_block);
{
// read the (partially compressed) block
ssize_t rlen = toku_os_pread(fd, raw_block, size, offset);
lazy_assert((DISKOFF)rlen == size);
ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
lazy_assert((DISKOFF)rlen >= size);
lazy_assert((DISKOFF)rlen <= size_aligned);
}
// get the layout_version
int layout_version;
......
......@@ -2102,6 +2102,7 @@ static void allocate_node (struct subtrees_info *sts, int64_t b) {
sts->n_subtrees++;
}
// dbuf will always contain a 512-byte aligned buffer, but the length might not be a multiple of 512 bytes. If that's what you want, then pad it.
struct dbuf {
unsigned char *buf;
int buflen;
......@@ -2225,7 +2226,7 @@ static void putbuf_bytes (struct dbuf *dbuf, const void *bytes, int nbytes) {
int oldbuflen = dbuf->buflen;
dbuf->buflen += dbuf->off + nbytes;
dbuf->buflen *= 2;
REALLOC_N(dbuf->buflen, dbuf->buf);
REALLOC_N_ALIGNED(512, dbuf->buflen, dbuf->buf);
if (dbuf->buf == NULL) {
dbuf->error = get_error_errno();
dbuf->buf = oldbuf;
......@@ -2905,9 +2906,17 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
}
unsigned int checksum = x1764_memory(ttable.buf, ttable.off);
putbuf_int32(&ttable, checksum);
// pad it with zeros up to the next multiple of 512 bytes
long long encoded_length = ttable.off;
{
int nbytes_to_add = roundup_to_multiple(512, ttable.off) - encoded_length;
char zeros[nbytes_to_add];
for (int i=0; i<nbytes_to_add; i++) zeros[i]=0;
putbuf_bytes(&ttable, zeros, nbytes_to_add);
}
int result = ttable.error;
if (result == 0) {
invariant(bt_size_on_disk==ttable.off);
invariant(bt_size_on_disk==encoded_length);
result = toku_os_pwrite(out->fd, ttable.buf, ttable.off, off_of_translation);
}
dbuf_destroy(&ttable);
......@@ -2919,18 +2928,22 @@ static int
write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) {
int result = 0;
size_t size = toku_serialize_ft_size(out->h->h);
size_t alloced_size = roundup_to_multiple(512, size);
struct wbuf wbuf;
char *MALLOC_N(size, buf);
char *MALLOC_N_ALIGNED(512, alloced_size, buf);
if (buf == NULL) {
result = get_error_errno();
} else {
wbuf_init(&wbuf, buf, size);
out->h->h->on_disk_stats = out->h->in_memory_stats;
toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk);
for (size_t i=size; i<alloced_size; i++) buf[i]=0; // initialize all those unused spots to zero
if (wbuf.ndone != size)
result = EINVAL;
else
result = toku_os_pwrite(out->fd, wbuf.buf, wbuf.ndone, 0);
else {
assert(wbuf.ndone <= alloced_size);
result = toku_os_pwrite(out->fd, wbuf.buf, alloced_size, 0);
}
toku_free(buf);
}
return result;
......
......@@ -8,19 +8,22 @@
static void ba_alloc_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset) {
block_allocator_validate(ba);
block_allocator_alloc_block_at(ba, size, offset);
block_allocator_alloc_block_at(ba, size*512, offset*512);
block_allocator_validate(ba);
}
static void ba_alloc (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *answer) {
block_allocator_validate(ba);
block_allocator_alloc_block(ba, size, answer);
uint64_t actual_answer;
block_allocator_alloc_block(ba, 512*size, &actual_answer);
block_allocator_validate(ba);
assert(actual_answer%512==0);
*answer = actual_answer/512;
}
static void ba_free (BLOCK_ALLOCATOR ba, uint64_t offset) {
block_allocator_validate(ba);
block_allocator_free_block(ba, offset);
block_allocator_free_block(ba, offset*512);
block_allocator_validate(ba);
}
......@@ -30,8 +33,8 @@ ba_check_l (BLOCK_ALLOCATOR ba, uint64_t blocknum_in_layout_order, uint64_t expe
uint64_t actual_offset, actual_size;
int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size);
assert(r==0);
assert(expected_offset == actual_offset);
assert(expected_size == actual_size);
assert(expected_offset*512 == actual_offset);
assert(expected_size *512 == actual_size);
}
static void
......@@ -48,10 +51,10 @@ static void
test_ba0 (void) {
BLOCK_ALLOCATOR ba;
uint64_t b0, b1;
create_block_allocator(&ba, 100, 1);
assert(block_allocator_allocated_limit(ba)==100);
create_block_allocator(&ba, 100*512, 1*512);
assert(block_allocator_allocated_limit(ba)==100*512);
ba_alloc_at(ba, 50, 100);
assert(block_allocator_allocated_limit(ba)==150);
assert(block_allocator_allocated_limit(ba)==150*512);
ba_alloc_at(ba, 25, 150);
ba_alloc (ba, 10, &b0);
ba_check_l (ba, 0, 0, 100);
......@@ -66,9 +69,9 @@ test_ba0 (void) {
assert(b0==160);
ba_alloc(ba, 10, &b0);
ba_alloc(ba, 113, &b1);
assert(113==block_allocator_block_size(ba, b1));
assert(10==block_allocator_block_size(ba, b0));
assert(50==block_allocator_block_size(ba, 100));
assert(113*512==block_allocator_block_size(ba, b1 *512));
assert(10 *512==block_allocator_block_size(ba, b0 *512));
assert(50 *512==block_allocator_block_size(ba, 100*512));
uint64_t b2, b3, b4, b5, b6, b7;
ba_alloc(ba, 100, &b2);
......@@ -103,7 +106,7 @@ test_ba0 (void) {
static void
test_ba1 (int n_initial) {
BLOCK_ALLOCATOR ba;
create_block_allocator(&ba, 0, 1);
create_block_allocator(&ba, 0*512, 1*512);
int i;
int n_blocks=0;
uint64_t blocks[1000];
......@@ -136,8 +139,8 @@ test_ba2 (void)
BLOCK_ALLOCATOR ba;
uint64_t b[6];
enum { BSIZE = 1024 };
create_block_allocator(&ba, 100, BSIZE);
assert(block_allocator_allocated_limit(ba)==100);
create_block_allocator(&ba, 100*512, BSIZE*512);
assert(block_allocator_allocated_limit(ba)==100*512);
ba_check_l (ba, 0, 0, 100);
ba_check_none (ba, 1);
......
......@@ -188,7 +188,7 @@ static void *my_malloc(size_t n) {
}
}
}
return malloc(n);
return os_malloc(n);
}
static int do_realloc_errors = 1;
......@@ -207,7 +207,7 @@ static void *my_realloc(void *p, size_t n) {
}
}
}
return realloc(p, n);
return os_realloc(p, n);
}
......
......@@ -28,7 +28,7 @@ static void *my_malloc(size_t n) {
errno = ENOSPC;
return NULL;
} else
return malloc(n);
return os_malloc(n);
}
static int my_compare(DB *UU(desc), const DBT *UU(akey), const DBT *UU(bkey)) {
......
......@@ -17,6 +17,7 @@ target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT
add_library(tokuportability_static_conv STATIC ${tokuportability_srcs})
set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON)
add_dependencies(tokuportability_static_conv build_jemalloc)
set(tokuportability_source_libs tokuportability_static_conv jemalloc ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}")
......
......@@ -196,6 +196,8 @@ toku_os_write (int fd, const void *buf, size_t len) {
void
toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
assert(0==((long long)buf)%512);
assert((len%512 == 0) && (off%512)==0); // to make pwrite work.
const char *bp = (const char *) buf;
while (len > 0) {
ssize_t r;
......@@ -218,6 +220,9 @@ toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
ssize_t
toku_os_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
assert(0==((long long)buf)%512); // these asserts are to ensure that direct I/O will work.
assert(0==len %512);
assert(0==off %512);
const char *bp = (const char *) buf;
ssize_t result = 0;
while (len > 0) {
......@@ -269,6 +274,25 @@ toku_os_open(const char *path, int oflag, int mode) {
return rval;
}
// Effect: Open PATH like toku_os_open(), additionally requesting direct
//  (uncached) I/O using whichever mechanism the build detected:
//   - HAVE_O_DIRECT:  O_DIRECT is or'ed into the open flags.
//   - HAVE_F_NOCACHE: the file is opened normally, then F_NOCACHE is set
//     with fcntl().  A failing fcntl() is reported with perror() but is not
//     treated as an error: the open descriptor is still returned.
//  If neither mechanism is available the build fails.
// Returns: the value returned by toku_os_open().
int
toku_os_open_direct(const char *path, int oflag, int mode) {
#if defined(HAVE_O_DIRECT)
    return toku_os_open(path, oflag | O_DIRECT, mode);
#elif defined(HAVE_F_NOCACHE)
    const int fd = toku_os_open(path, oflag, mode);
    if (fd < 0) {
        // the open itself failed; nothing to configure
        return fd;
    }
    if (fcntl(fd, F_NOCACHE, 1) == -1) {
        perror("setting F_NOCACHE");
    }
    return fd;
#else
# error "No direct I/O implementation found."
#endif
}
int
toku_os_fclose(FILE * stream) {
int rval = -1;
......@@ -310,6 +334,9 @@ toku_os_read(int fd, void *buf, size_t count) {
ssize_t
toku_os_pread (int fd, void *buf, size_t count, off_t offset) {
assert(0==((long long)buf)%512);
assert(0==count%512);
assert(0==offset%512);
ssize_t r;
if (t_pread) {
r = t_pread(fd, buf, count, offset);
......
......@@ -21,17 +21,17 @@
#include <portability/toku_atomic.h>
static malloc_fun_t t_malloc = 0;
static malloc_aligned_fun_t t_malloc_aligned = 0;
static malloc_fun_t t_xmalloc = 0;
static malloc_aligned_fun_t t_xmalloc_aligned = 0;
static free_fun_t t_free = 0;
static realloc_fun_t t_realloc = 0;
static realloc_aligned_fun_t t_realloc_aligned = 0;
static realloc_fun_t t_xrealloc = 0;
static LOCAL_MEMORY_STATUS_S status;
int toku_memory_do_stats = 0;
typedef size_t (*malloc_usable_size_fun_t)(const void *);
static malloc_usable_size_fun_t malloc_usable_size_f;
static bool memory_startup_complete;
int
......@@ -76,14 +76,6 @@ toku_memory_startup(void) {
}
}
malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_usable_size");
if (!malloc_usable_size_f) {
malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_size"); // darwin
if (!malloc_usable_size_f) {
result = EINVAL; // couldn't find a malloc size function
}
}
return result;
}
......@@ -105,7 +97,7 @@ toku_memory_get_status(LOCAL_MEMORY_STATUS s) {
// jemalloc's malloc_usable_size does not work with a NULL pointer, so we implement a version that works
static size_t
my_malloc_usable_size(void *p) {
return p == NULL ? 0 : malloc_usable_size_f(p);
return p == NULL ? 0 : os_malloc_usable_size(p);
}
// Note that max_in_use may be slightly off because use of max_in_use is not thread-safe.
......@@ -162,6 +154,23 @@ toku_malloc(size_t size) {
return p;
}
// Effect: Allocate SIZE bytes through the installed t_malloc_aligned hook if
//  there is one, otherwise through os_malloc_aligned(), passing ALIGNMENT
//  along to the allocator.
//  On success the new memory is annotated and, when toku_memory_do_stats is
//  set, the malloc counters in status are updated.
//  On failure only the malloc_fail counter is bumped.
// Returns: the allocator's result (NULL on failure).
void *toku_malloc_aligned(size_t alignment, size_t size) {
    void *p;
    if (t_malloc_aligned) {
        p = t_malloc_aligned(alignment, size);
    } else {
        p = os_malloc_aligned(alignment, size);
    }

    if (!p) {
        toku_sync_add_and_fetch(&status.malloc_fail, 1);
        return p;
    }

    TOKU_ANNOTATE_NEW_MEMORY(p, size); // see #4671 and https://bugs.kde.org/show_bug.cgi?id=297147
    if (toku_memory_do_stats) {
        size_t usable = my_malloc_usable_size(p);
        toku_sync_add_and_fetch(&status.malloc_count, 1);
        toku_sync_add_and_fetch(&status.requested, size);
        toku_sync_add_and_fetch(&status.used, usable);
        set_max(status.used, status.freed);
    }
    return p;
}
void *
toku_calloc(size_t nmemb, size_t size) {
size_t newsize = nmemb * size;
......@@ -189,6 +198,25 @@ toku_realloc(void *p, size_t size) {
return q;
}
// Effect: Resize p to size bytes through the installed aligned-realloc hook (t_realloc_aligned) if one is
//  set, otherwise through os_realloc_aligned().
//  The usable size of the old block is sampled before the resize so that, when toku_memory_do_stats is on,
//  it can be added to the "freed" counter after a successful resize.
//  On failure only the realloc_fail counter is bumped and NULL is returned.
void *toku_realloc_aligned(size_t alignment, void *p, size_t size) {
    size_t old_footprint = 0;
    if (p) {
        old_footprint = my_malloc_usable_size(p);
    }

    void *resized;
    if (t_realloc_aligned) {
        resized = t_realloc_aligned(alignment, p, size);
    } else {
        resized = os_realloc_aligned(alignment, p, size);
    }

    if (!resized) {
        toku_sync_add_and_fetch(&status.realloc_fail, 1);
        return resized;
    }

    if (toku_memory_do_stats) {
        size_t new_footprint = my_malloc_usable_size(resized);
        toku_sync_add_and_fetch(&status.realloc_count, 1);
        toku_sync_add_and_fetch(&status.requested, size);
        toku_sync_add_and_fetch(&status.used, new_footprint);
        toku_sync_add_and_fetch(&status.freed, old_footprint);
        set_max(status.used, status.freed);
    }
    return resized;
}
void *
toku_memdup(const void *v, size_t len) {
void *p = toku_malloc(len);
......@@ -232,6 +260,23 @@ toku_xmalloc(size_t size) {
return p;
}
void* toku_xmalloc_aligned(size_t alignment, size_t size)
// Effect: Allocate size bytes whose address is a multiple of ALIGNMENT, using the installed
//  t_xmalloc_aligned hook when one is set and os_malloc_aligned() otherwise.
//  Never returns NULL: a failed allocation trips resource_assert instead of producing an error code.
//  When toku_memory_do_stats is on, the malloc statistics are updated.
// Requires: alignment is a power of two.
{
    void *block;
    if (t_xmalloc_aligned) {
        block = t_xmalloc_aligned(alignment, size);
    } else {
        block = os_malloc_aligned(alignment, size);
    }
    resource_assert(block);

    if (!toku_memory_do_stats) {
        return block;
    }
    size_t footprint = my_malloc_usable_size(block);
    toku_sync_add_and_fetch(&status.malloc_count, 1);
    toku_sync_add_and_fetch(&status.requested, size);
    toku_sync_add_and_fetch(&status.used, footprint);
    set_max(status.used, status.freed);
    return block;
}
void *
toku_xcalloc(size_t nmemb, size_t size) {
size_t newsize = nmemb * size;
......
......@@ -8,11 +8,174 @@
#include <toku_portability.h>
#include <stdlib.h>
#include <jemalloc/include/jemalloc/jemalloc.h>
#if defined(HAVE_MALLOC_H)
# include <malloc.h>
#elif defined(HAVE_SYS_MALLOC_H)
# include <sys/malloc.h>
#endif
#include <dlfcn.h>
#include <string.h>
// #define this to use a version of os_malloc that helps to debug certain features.
// This version uses the real malloc (so that valgrind should still work) but it forces things to be slightly
// misaligned (in particular, avoiding 512-byte alignment if possible) to find situations where O_DIRECT will fail.
// #define USE_DEBUGGING_MALLOCS
#ifdef USE_DEBUGGING_MALLOCS
#include <pthread.h>
// Make things misaligned on 512-byte boundaries
static size_t malloced_now_count=0, malloced_now_size=0;
// One record per live allocation made through the debugging allocator.
// Records are appended by push_to_malloced_memory() and looked up by returned_pointer.
struct malloc_pair {
// The address that was handed back to the caller; this is the lookup key used by find_malloced_pair().
void *returned_pointer;
// The address obtained from the underlying allocator; this is what os_free() passes to free().
void *true_pointer;
// The size the caller asked for; reported by os_malloc_usable_size() and used to bound the copy in os_realloc_aligned().
size_t requested_size = 0;
};
static struct malloc_pair *malloced_now;
static pthread_mutex_t malloc_mutex = PTHREAD_MUTEX_INITIALIZER;
// Effect: Acquire malloc_mutex, which guards the table of live debugging allocations.
//  Asserts that the lock operation succeeded.
static void malloc_lock(void) {
    const int lock_rc = pthread_mutex_lock(&malloc_mutex);
    assert(lock_rc == 0);
}
// Effect: Release malloc_mutex, which guards the table of live debugging allocations.
//  Asserts that the unlock operation succeeded.
static void malloc_unlock(void) {
    const int unlock_rc = pthread_mutex_unlock(&malloc_mutex);
    assert(unlock_rc == 0);
}
// Effect: Record a live allocation in the malloced_now table, growing the table when it is full.
//  returned_pointer is the address given to the caller, true_pointer is the address that must eventually
//  be passed to free(), and requested_size is the size the caller asked for.
//  Takes and releases malloc_mutex internally, so the caller must NOT hold it.
//  If the table itself cannot be grown the process aborts: this allocator exists only for debugging and
//  cannot track allocations without the table.
static void push_to_malloced_memory(void *returned_pointer, void *true_pointer, size_t requested_size) {
    malloc_lock();
    if (malloced_now_count == malloced_now_size) {
        size_t new_capacity = 2*malloced_now_size + 1;
        // Keep the old table pointer until we know the resize succeeded.
        struct malloc_pair *grown = (struct malloc_pair *)realloc(malloced_now, new_capacity * sizeof(*malloced_now));
        if (grown == NULL) {
            abort(); // cannot track allocations any more
        }
        malloced_now = grown;
        malloced_now_size = new_capacity;
    }
    malloced_now[malloced_now_count].returned_pointer = returned_pointer;
    malloced_now[malloced_now_count].true_pointer     = true_pointer;
    malloced_now[malloced_now_count].requested_size   = requested_size;
    malloced_now_count++;
    malloc_unlock();
}
static struct malloc_pair *find_malloced_pair(const void *p)
// Requires: Lock must be held before calling.
{
for (size_t i=0; i<malloced_now_count; i++) {
if (malloced_now[i].returned_pointer==p) return &malloced_now[i];
}
return 0;
}
// Debugging version of os_malloc.
// Effect: Allocate size bytes with the real malloc, deliberately avoiding a 512-byte-aligned result:
//  16 spare bytes are requested so that, if malloc happens to return a 512-byte-aligned address, the
//  caller is handed the address 16 bytes further in.  The allocation is recorded in the tracking table.
//  Returns NULL (and records nothing) if the underlying malloc fails.
void *os_malloc(size_t size) {
    void *raw_ptr = malloc(size+16); // allocate 16 extra bytes
    if (raw_ptr == NULL) {
        return NULL; // do not record or offset a failed allocation
    }
    size_t raw_ptr_i = (size_t) raw_ptr;
    if (raw_ptr_i%512==0) {
        push_to_malloced_memory(16+(char*)raw_ptr, raw_ptr, size);
        return 16+(char*)raw_ptr;
    } else {
        push_to_malloced_memory(raw_ptr, raw_ptr, size);
        return raw_ptr;
    }
}
void *os_malloc_aligned(size_t alignment, size_t size)
// Debugging version of os_malloc_aligned.
// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  Every successful allocation is recorded in the tracking table so that os_free(), os_realloc_aligned()
//  and os_malloc_usable_size() can find it.
//  If the caller asked for an alignment that is a multiple of 512, the block is returned as allocated.
//  Otherwise the block is deliberately kept off 512-byte boundaries (to expose code that relies on
//  accidental alignment): one extra alignment unit is allocated and skipped when needed.
//  On failure errno is set from posix_memalign's return code and a null pointer is returned.
// Requires: alignment is a power of two.
{
    if (alignment%512==0) {
        void *raw_ptr;
        int r = posix_memalign(&raw_ptr, alignment, size);
        if (r != 0) {
            errno = r;
            return nullptr;
        }
        push_to_malloced_memory(raw_ptr, raw_ptr, size);
        return raw_ptr;
    } else {
        // posix_memalign rejects alignments that are not a multiple of sizeof(void*).  Rounding a small
        // power of two up to sizeof(void*) still yields a multiple of the requested alignment, and the
        // result stays below 512 so the skip below still lands off a 512-byte boundary.
        size_t effective_alignment = alignment < sizeof(void*) ? sizeof(void*) : alignment;
        // Make sure it isn't 512-byte aligned
        void *raw_ptr;
        int r = posix_memalign(&raw_ptr, effective_alignment, size+effective_alignment);
        if (r != 0) {
            errno = r;
            return nullptr;
        }
        size_t raw_ptr_i = (size_t) raw_ptr;
        if (raw_ptr_i%512==0) {
            push_to_malloced_memory(effective_alignment+(char*)raw_ptr, raw_ptr, size);
            return effective_alignment+(char*)raw_ptr;
        } else {
            push_to_malloced_memory(raw_ptr, raw_ptr, size);
            return raw_ptr;
        }
    }
}
// Effect: Return the smaller of a and b.
static size_t min(size_t a, size_t b) {
    return (a < b) ? a : b;
}
// Debugging version of os_realloc.
// Effect: Resize p to size bytes by delegating to os_realloc_aligned() with an alignment chosen from the
//  requested size: 1 for sizes below 4, 4 for sizes below 8, 8 for sizes below 16, and 16 otherwise.
void *os_realloc(void *p, size_t size) {
    const size_t size_based_alignment =
        (size < 4)  ? 1 :
        (size < 8)  ? 4 :
        (size < 16) ? 8 :
                      16;
    return os_realloc_aligned(size_based_alignment, p, size);
}
void * os_realloc_aligned(size_t alignment, void *p, size_t size)
// Debugging version of os_realloc_aligned.
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  Always allocates a fresh block, copies min(size, originally requested size) bytes from p, and then
//  frees p.  A NULL p behaves like os_malloc_aligned().
//  If the new block cannot be allocated, NULL is returned and p is left allocated and untouched.
// Requires: alignment is a power of two.
// Requires: p is NULL or a pointer previously returned by this debugging allocator (asserted).
{
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    } else {
        void *result = os_malloc_aligned(alignment, size);
        if (result==NULL) {
            return NULL; // allocation failed; the old block is still valid
        }
        malloc_lock();
        struct malloc_pair *mp = find_malloced_pair(p);
        assert(mp);
        // now copy all the good stuff from p to result
        memcpy(result, p, min(size, mp->requested_size));
        malloc_unlock();
        // os_free takes the lock itself, so it must be called after the unlock above.
        os_free(p);
        return result;
    }
}
// Debugging version of os_free.
// Effect: Look up p in the tracking table, free the underlying allocation (true_pointer), and remove the
//  record by overwriting it with the last record in the table.
// Requires: p was returned by this debugging allocator and has not been freed yet (asserted).
void os_free(void* p) {
    malloc_lock();
    struct malloc_pair *entry = find_malloced_pair(p);
    assert(entry);
    free(entry->true_pointer);
    // Swap-remove: shrink the table by one and move its former last record into the vacated slot.
    malloced_now_count -= 1;
    *entry = malloced_now[malloced_now_count];
    malloc_unlock();
}
size_t os_malloc_usable_size(const void *p) {
malloc_lock();
struct malloc_pair *mp = find_malloced_pair(p);
assert(mp);
size_t size = mp->requested_size;
malloc_unlock();
return size;
}
#else
void *
os_malloc(size_t size)
......@@ -20,14 +183,81 @@ os_malloc(size_t size)
return malloc(size);
}
void *os_malloc_aligned(size_t alignment, size_t size)
// Effect: Allocate size bytes at an address that is a multiple of ALIGNMENT, using posix_memalign().
//  On failure, errno is set to the error code posix_memalign() returned and a null pointer is returned.
// Requires: alignment is a power of two.
{
    void *block;
    const int err = posix_memalign(&block, alignment, size);
    if (err == 0) {
        return block;
    }
    errno = err;
    return nullptr;
}
// Effect: Resize the allocation at p to size bytes by delegating directly to the C library realloc(),
//  returning whatever realloc() returns.  No alignment beyond what realloc() itself provides is requested.
void *
os_realloc(void *p, size_t size)
{
return realloc(p, size);
}
void * os_realloc_aligned(size_t alignment, void *p, size_t size)
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
//  A NULL p behaves like os_malloc_aligned().  Otherwise the block is first resized with realloc(); if
//  the result happens not to be aligned, the contents are moved into a freshly allocated aligned block.
// Returns: the resized block, or NULL on failure.
//  - If realloc() itself fails, p is left untouched (normal realloc semantics).
//  - If realloc() succeeded but the aligned replacement block could not be allocated, the resized block
//    is released and NULL is returned; in that case the original block no longer exists, because realloc()
//    had already taken ownership of it.
// Requires: alignment is a power of two.
{
#if 1
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    } else {
        void *newp = realloc(p, size);
        if (newp==NULL) {
            return NULL;
        }
        if (0!=((size_t)newp%alignment)) {
            // it's not aligned, so align it ourselves.
            void *newp2 = os_malloc_aligned(alignment, size);
            if (newp2==NULL) {
                // Cannot produce an aligned block.  Do not copy into a null pointer; release the
                // unaligned block rather than leak it (errno was set by os_malloc_aligned).
                free(newp);
                return NULL;
            }
            memcpy(newp2, newp, size);
            free(newp);
            newp = newp2;
        }
        return newp;
    }
#else
    // THIS STUFF SEEMS TO FAIL VALGRIND
    if (p==NULL) {
        return os_malloc_aligned(alignment, size);
    } else {
        size_t ignore;
        int r = rallocm(&p,        // returned pointer
                        &ignore,   // actual size of returned object.
                        size,      // the size we want
                        0,         // extra bytes to "try" to allocate at the end
                        ALLOCM_ALIGN(alignment));
        if (r!=0) return NULL;
        else return p;
    }
#endif
}
// Effect: Release the allocation at p by delegating directly to the C library free().
void
os_free(void* p)
{
free(p);
}
typedef size_t (*malloc_usable_size_fun_t)(const void *);
static malloc_usable_size_fun_t malloc_usable_size_f = NULL;

// Effect: Return the allocator's estimate of the usable size of the block at p, or 0 when p is NULL.
//  The size function is resolved with dlsym() on first use and cached in malloc_usable_size_f:
//  "malloc_usable_size" is tried first, then "malloc_size" (darwin).  If neither symbol can be found
//  the process aborts.
size_t os_malloc_usable_size(const void *p) {
    if (p == NULL) {
        return 0;
    }
    if (malloc_usable_size_f == NULL) {
        void *symbol = dlsym(RTLD_DEFAULT, "malloc_usable_size");
        if (symbol == NULL) {
            symbol = dlsym(RTLD_DEFAULT, "malloc_size"); // darwin
        }
        if (symbol == NULL) {
            abort(); // couldn't find a malloc size function
        }
        malloc_usable_size_f = (malloc_usable_size_fun_t) symbol;
    }
    return malloc_usable_size_f(p);
}
#endif
......@@ -50,9 +50,8 @@ int test_main(int UU(argc), char *const argv[] UU()) {
}
{
struct unpackedsevenbytestruct *usevenbytestructs;
int r = posix_memalign((void **) &usevenbytestructs, cachelinesize, sizeof(unpackedsevenbytestruct) * 10);
if (r) {
struct unpackedsevenbytestruct *MALLOC_N_ALIGNED(cachelinesize, 10, usevenbytestructs);
if (usevenbytestructs == NULL) {
// this test is supposed to crash, so exiting cleanly is a failure
perror("posix_memalign");
exit(EXIT_FAILURE);
......@@ -65,8 +64,9 @@ int test_main(int UU(argc), char *const argv[] UU()) {
toku_free(usevenbytestructs);
}
int r = posix_memalign((void **) &psevenbytestructs, cachelinesize, sizeof(packedsevenbytestruct) * 10);
if (r) {
MALLOC_N_ALIGNED(cachelinesize, 10, psevenbytestructs);
if (psevenbytestructs == NULL) {
// this test is supposed to crash, so exiting cleanly is a failure
perror("posix_memalign");
exit(EXIT_FAILURE);
......
......@@ -7,6 +7,7 @@
#include <test.h>
#include <fcntl.h>
#include <toku_assert.h>
#include <memory.h>
#include <string.h>
#include <stdio.h>
......@@ -28,8 +29,9 @@ int test_main(int argc, char *const argv[]) {
unlink(fname);
int fd = open(fname, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
assert(fd>=0);
char buf[] = "hello";
int64_t offset = (1LL<<32) + 100;
char *XMALLOC_N_ALIGNED(512, 512, buf);
strcpy(buf, "hello");
int64_t offset = (1LL<<32) + 512;
toku_os_full_pwrite(fd, buf, sizeof buf, offset);
char newbuf[sizeof buf];
r = pread(fd, newbuf, sizeof newbuf, 100);
......@@ -42,6 +44,7 @@ int test_main(int argc, char *const argv[]) {
r = toku_os_get_file_size(fd, &fsize);
assert(r == 0);
assert(fsize > 100 + (signed)sizeof(buf));
toku_free(buf);
r = close(fd);
assert(r==0);
return 0;
......
......@@ -4,6 +4,7 @@
db_env_create;
db_strerror;
db_version;
db_env_set_direct_io;
db_env_set_func_fsync;
db_env_set_func_malloc;
db_env_set_func_realloc;
......
......@@ -27,6 +27,10 @@ void * checkpoint_callback2_extra = NULL;
bool engine_status_enable = true; // if false, suppress engine status output on failed assert, for test programs only
// Exported entry point for switching direct I/O on or off.
// Forwards direct_io_on unchanged to toku_ft_set_direct_io(); the effect of the flag is decided there.
// NOTE(review): presumably this must be called before any files are opened for it to apply -- confirm against toku_ft_set_direct_io.
void db_env_set_direct_io (bool direct_io_on) {
toku_ft_set_direct_io(direct_io_on);
}
// Exported entry point for installing a replacement fsync function.
// Forwards fsync_function (which takes an int and returns an int) unchanged to toku_set_func_fsync().
void db_env_set_func_fsync (int (*fsync_function)(int)) {
toku_set_func_fsync(fsync_function);
}
......
......@@ -44,6 +44,8 @@
#cmakedefine HAVE_M_MMAP_THRESHOLD 1
#cmakedefine HAVE_CLOCK_REALTIME 1
#cmakedefine HAVE_O_DIRECT 1
#cmakedefine HAVE_F_NOCACHE 1
#cmakedefine HAVE_MALLOC_SIZE 1
#cmakedefine HAVE_MALLOC_USABLE_SIZE 1
......
......@@ -18,17 +18,28 @@ void toku_memory_shutdown(void) __attribute__((destructor));
/* Generally: errno is set to 0 or a value to indicate problems. */
/* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */
// Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc()
// That way the tests can, e.g., replace the malloc function using toku_set_func_malloc().
void *toku_calloc(size_t nmemb, size_t size) __attribute__((__visibility__("default")));
void *toku_xcalloc(size_t nmemb, size_t size) __attribute__((__visibility__("default")));
void *toku_malloc(size_t size) __attribute__((__visibility__("default")));
void *toku_malloc_aligned(size_t alignment, size_t size) __attribute__((__visibility__("default")));
// xmalloc aborts instead of return NULL if we run out of memory
void *toku_xmalloc(size_t size);
void *toku_xmalloc(size_t size) __attribute__((__visibility__("default")));
void *toku_xrealloc(void*, size_t size) __attribute__((__visibility__("default")));
void *toku_xmalloc_aligned(size_t alignment, size_t size) __attribute__((__visibility__("default")));
// Effect: Perform a os_malloc_aligned(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
// Fail with a resource_assert if the allocation fails (don't return an error code).
// If a replacement xmalloc_aligned function has been set then call it instead.
// Requires: alignment is a power of two.
void toku_free(void*) __attribute__((__visibility__("default")));
void *toku_realloc(void *, size_t size) __attribute__((__visibility__("default")));
void *toku_realloc_aligned(size_t alignment, void *p, size_t size) __attribute__((__visibility__("default")));
// Effect: Perform a os_realloc_aligned(alignment, p, size) which has the additional property that the returned pointer is a multiple of ALIGNMENT.
// If a replacement realloc_aligned function has been set then call it instead.
// Requires: alignment is a power of two.
size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")));
......@@ -50,6 +61,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
* to make an array of 5 integers.
*/
#define MALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_malloc((n)*sizeof(*v)))
#define MALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n)*sizeof(*v)))
//CALLOC_N is like calloc with auto-figuring out size of members
#define CALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
......@@ -57,6 +70,7 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
#define CALLOC(v) CALLOC_N(1,v)
#define REALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_realloc(v, (n)*sizeof(*v)))
#define REALLOC_N_ALIGNED(align, n,v) CAST_FROM_VOIDP(v, toku_realloc_aligned((align), v, (n)*sizeof(*v)))
// XMALLOC macros are like MALLOC except they abort if the operation fails
#define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
......@@ -66,6 +80,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
#define XREALLOC(v,s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
#define XREALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n)*sizeof(*v)))
#define XMALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n)*sizeof(*v)))
#define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
#define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))
......@@ -94,6 +110,8 @@ void toku_do_memory_check(void);
typedef void *(*malloc_fun_t)(size_t);
typedef void (*free_fun_t)(void*);
typedef void *(*realloc_fun_t)(void*,size_t);
typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void */*pointer*/, size_t /*size*/);
void toku_set_func_malloc(malloc_fun_t f);
void toku_set_func_xmalloc_only(malloc_fun_t f);
......
......@@ -247,8 +247,26 @@ extern void *realloc(void*, size_t) __THROW __attribute__((__deprecat
#endif
void *os_malloc(size_t) __attribute__((__visibility__("default")));
// Effect: See man malloc(3)
void *os_malloc_aligned(size_t /*alignment*/, size_t /*size*/) __attribute__((__visibility__("default")));
// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
// Requires: alignment is a power of two.
void *os_realloc(void*,size_t) __attribute__((__visibility__("default")));
// Effect: See man realloc(3)
void *os_realloc_aligned(size_t/*alignment*/, void*,size_t) __attribute__((__visibility__("default")));
// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
// Requires: alignment is a power of two.
void os_free(void*) __attribute__((__visibility__("default")));
// Effect: See man free(3)
size_t os_malloc_usable_size(const void *p) __attribute__((__visibility__("default")));
// Effect: Return an estimate of the usable size of the allocation that p points to. If this function is not defined, memory.cc will
// look for the jemalloc, libc, or darwin versions of the function for computing memory footprint.
// full_pwrite and full_write perform a pwrite or write, respectively, and check for errors. They don't return unless all the data was written.
void toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) __attribute__((__visibility__("default")));
......@@ -262,6 +280,7 @@ int toku_os_write (int fd, const void *buf, size_t len) __attribute__((__visibil
FILE * toku_os_fdopen(int fildes, const char *mode);
FILE * toku_os_fopen(const char *filename, const char *mode);
int toku_os_open(const char *path, int oflag, int mode);
int toku_os_open_direct(const char *path, int oflag, int mode);
int toku_os_close(int fd);
int toku_os_fclose(FILE * stream);
ssize_t toku_os_read(int fd, void *buf, size_t count);
......@@ -293,4 +312,17 @@ void toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t));
int toku_portability_init(void);
void toku_portability_destroy(void);
static inline uint64_t roundup_to_multiple(uint64_t alignment, uint64_t v)
// Effect: Return X, where X the smallest multiple of ALIGNMENT such that X>=V.
// Requires: ALIGNMENT is a power of two (in particular, nonzero).
// Requires: the rounded-up value fits in a uint64_t (asserted).
{
    // alignment must be a nonzero power of two; zero would otherwise slip through the bit test
    // and reach a modulo-by-zero below.
    assert(alignment!=0 && 0==(alignment&(alignment-1)));
    uint64_t result = (v+alignment-1)&~(alignment-1);
    assert(result>=v);                // The result is >=V (also catches wrap-around of v+alignment-1).
    assert(result%alignment==0);      // The result is a multiple of alignment.
    // The result is the smallest such multiple of alignment.  Written as a difference so the check
    // itself cannot wrap around when v is within ALIGNMENT of UINT64_MAX.
    assert(result-v<alignment);
    return result;
}
#endif /* TOKU_PORTABILITY_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment