refs #5842, merge Direct I/O to main

git-svn-id: file:///svn/toku/tokudb@52238 c7de825b-a66e-492c-adef-691d508d4ae1

refs #5842, merge Direct I/O to main
git-svn-id: file:///svn/toku/tokudb@52238 c7de825b-a66e-492c-adef-691d508d4ae1
87217329 · Zardosht Kasheff · Yoni Fogel · dfe8a560 · 87217329 · 87217329
Commit 87217329 authored Jan 18, 2013 by Zardosht Kasheff Committed by Yoni Fogel Apr 17, 2013
24 changed files
--- a/buildheader/make_tdb.cc
+++ b/buildheader/make_tdb.cc
@@ -693,6 +693,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
    printf("int log_compare (const DB_LSN*, const DB_LSN *) %s;\n", VISIBLE);
    printf("int toku_set_trace_file (const char *fname) %s;\n", VISIBLE);
    printf("int toku_close_trace_file (void) %s;\n", VISIBLE);
+    printf("void db_env_set_direct_io (bool direct_io_on) %s;\n", VISIBLE);
    printf("void db_env_set_func_fsync (int (*)(int)) %s;\n", VISIBLE);
    printf("void db_env_set_func_free (void (*)(void*)) %s;\n", VISIBLE);
    printf("void db_env_set_func_malloc (void *(*)(size_t)) %s;\n", VISIBLE);

--- a/cmake_modules/TokuFeatureDetection.cmake
+++ b/cmake_modules/TokuFeatureDetection.cmake
@@ -53,6 +53,9 @@ include(CheckSymbolExists)
 check_symbol_exists(M_MMAP_THRESHOLD "malloc.h" HAVE_M_MMAP_THRESHOLD)
 ## check whether we have CLOCK_REALTIME
 check_symbol_exists(CLOCK_REALTIME "time.h" HAVE_CLOCK_REALTIME)
+## check how to do direct I/O
+check_symbol_exists(O_DIRECT "fcntl.h" HAVE_O_DIRECT)
+check_symbol_exists(F_NOCACHE "fcntl.h" HAVE_F_NOCACHE)

 include(CheckFunctionExists)


--- a/cmake_modules/TokuSetupCompiler.cmake
+++ b/cmake_modules/TokuSetupCompiler.cmake
@@ -108,7 +108,7 @@ set_ldflags_if_supported(
 ## set extra debugging flags and preprocessor definitions
 set(CMAKE_C_FLAGS_DEBUG "-g3 -O0 ${CMAKE_C_FLAGS_DEBUG}")
 set(CMAKE_CXX_FLAGS_DEBUG "-g3 -O0 ${CMAKE_CXX_FLAGS_DEBUG}")
-set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2)
+#set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS_DEBUG _FORTIFY_SOURCE=2)

 ## set extra release flags
 if (APPLE AND CMAKE_CXX_COMPILER_ID STREQUAL Clang)

--- a/ft/block_allocator.cc
+++ b/ft/block_allocator.cc
@@ -58,6 +58,7 @@ block_allocator_print (BLOCK_ALLOCATOR ba) {

 void
 create_block_allocator (BLOCK_ALLOCATOR *ba, uint64_t reserve_at_beginning, uint64_t alignment) {
+    assert(alignment>=512 && 0==(alignment%512)); // the alignment must be at least 512 and aligned with 512 to make DIRECT_IO happy.
    BLOCK_ALLOCATOR XMALLOC(result);
    result->reserve_at_beginning = reserve_at_beginning;
    result->alignment = alignment;
@@ -176,8 +177,9 @@ align (uint64_t value, BLOCK_ALLOCATOR ba)
    return ((value+ba->alignment-1)/ba->alignment)*ba->alignment;
 }

-void
-block_allocator_alloc_block (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset) {
+void block_allocator_alloc_block(BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *offset)
+// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
+{
    invariant(size > 0); //Allocator does not support size 0 blocks. See block_allocator_free_block.
    grow_blocks_array(ba);
    ba->n_bytes_in_use += size;

--- a/ft/block_table.cc
+++ b/ft/block_table.cc
@@ -459,9 +459,11 @@ pair_is_unallocated(struct block_translation_pair *pair) {
    return pair->size == 0 && pair->u.diskoff == diskoff_unused;
 }

-// Purpose of this function is to figure out where to put the inprogress btt on disk, allocate space for it there.
-static void
-blocknum_alloc_translation_on_disk_unlocked (BLOCK_TABLE bt) {
+static void blocknum_alloc_translation_on_disk_unlocked(BLOCK_TABLE bt)
+// Effect: figure out where to put the inprogress btt on disk, allocate space for it there.
+//   The space must be 512-byte aligned (both the starting address and the size).
+//   As a result, the allocated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt.
+{
    toku_mutex_assert_locked(&bt->mutex);

    struct translation *t = &bt->inprogress;
@@ -479,24 +481,29 @@ PRNTF("blokAllokator", 1L, size, offset, bt);
    t->block_translation[b.b].size      = size;
 }

-//Fills wbuf with bt
-//A clean shutdown runs checkpoint start so that current and inprogress are copies.
-void
-toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
-                                            int64_t *address, int64_t *size) {
+void toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
+                                        int64_t *address, int64_t *size) 
+// Effect: Fills wbuf (which starts uninitialized) with bt
+//   A clean shutdown runs checkpoint start so that current and inprogress are copies.
+//   The resulting wbuf buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
+//   The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
+//   It *is* guaranteed that we can read up to the next 512-byte boundary, however
+{
    lock_for_blocktable(bt);
    struct translation *t = &bt->inprogress;

    BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
-    blocknum_alloc_translation_on_disk_unlocked(bt);
-    {
-        //Init wbuf
+    blocknum_alloc_translation_on_disk_unlocked(bt); // The allocated block must be 512-byte aligned to make O_DIRECT happy.
    uint64_t size_translation = calculate_size_on_disk(t);
+    uint64_t size_aligned     = roundup_to_multiple(512, size_translation);
    assert((int64_t)size_translation==t->block_translation[b.b].size);
+    {
+        //Init wbuf
        if (0)
            printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff);
-        wbuf_init(w, toku_malloc(size_translation), size_translation);
-        assert(w->size==size_translation);
+        char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
+        for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros.
+        wbuf_init(w, buf, size_aligned);
    }
    wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); 
    wbuf_BLOCKNUM(w, t->blocknum_freelist_head); 
@@ -510,9 +517,10 @@ toku_serialize_translation_to_wbuf(BLOCK_TABLE bt, int fd, struct wbuf *w,
    uint32_t checksum = x1764_finish(&w->checksum);
    wbuf_int(w, checksum);
    *address = t->block_translation[b.b].u.diskoff;
-    *size    = t->block_translation[b.b].size;
+    *size    = size_translation;
+    assert((*address)%512 == 0);

-    ensure_safe_write_unlocked(bt, fd, *size, *address);
+    ensure_safe_write_unlocked(bt, fd, size_aligned, *address);
    unlock_for_blocktable(bt);
 }


--- a/ft/ft-ops.cc
+++ b/ft/ft-ops.cc
@@ -3397,6 +3397,20 @@ int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *ft_handle_
    return r;
 }

+static bool use_direct_io = true; // file-scope switch consulted by ft_open_maybe_direct(); starts out enabled
+
+void toku_ft_set_direct_io (bool direct_io_on) { // Effect: store direct_io_on in the file-scope use_direct_io flag.
+    use_direct_io = direct_io_on;
+}
+
+static inline int ft_open_maybe_direct(const char *filename, int oflag, int mode) { // Effect: open filename with the opener selected by use_direct_io and return that opener's result unchanged.
+    if (use_direct_io) {
+        return toku_os_open_direct(filename, oflag, mode); // direct I/O requested
+    } else {
+        return toku_os_open(filename, oflag, mode); // ordinary open
+    }
+}
+
 // open a file for use by the brt
 // Requires:  File does not exist.
 static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
@@ -3404,12 +3418,12 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
    int r;
    int fd;
    int er;
-    fd = open(fname, O_RDWR | O_BINARY, mode);
+    fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
    assert(fd==-1);
    if ((er = get_maybe_error_errno()) != ENOENT) {
        return er;
    }
-    fd = open(fname, O_RDWR | O_CREAT | O_BINARY, mode);
+    fd = ft_open_maybe_direct(fname, O_RDWR | O_CREAT | O_BINARY, mode);
    if (fd==-1) {
        r = get_error_errno();
        return r;
@@ -3426,7 +3440,7 @@ static int ft_create_file(FT_HANDLE UU(brt), const char *fname, int *fdp) {
 static int ft_open_file(const char *fname, int *fdp) {
    mode_t mode = S_IRWXU|S_IRWXG|S_IRWXO;
    int fd;
-    fd = open(fname, O_RDWR | O_BINARY, mode);
+    fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, mode);
    if (fd==-1) {
        return get_error_errno();
    }

--- a/ft/ft-ops.h
+++ b/ft/ft-ops.h
@@ -257,4 +257,5 @@ int toku_ft_strerror_r(int error, char *buf, size_t buflen);

 extern bool garbage_collection_debug;

+void toku_ft_set_direct_io(bool direct_io_on);
 #endif
--- a/ft/ft-serialize.cc
+++ b/ft/ft-serialize.cc
@@ -39,8 +39,11 @@ void
 toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF offset) {
    // make the checksum
    int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
+    int64_t size_aligned = roundup_to_multiple(512, size);
    struct wbuf w;
-    wbuf_init(&w, toku_xmalloc(size), size);
+    char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
+    for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
+    wbuf_init(&w, aligned_buf, size);
    toku_serialize_descriptor_contents_to_wbuf(&w, desc);
    {
        //Add checksum
@@ -50,7 +53,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF
    lazy_assert(w.ndone==w.size);
    {
        //Actual Write translation table
-        toku_os_full_pwrite(fd, w.buf, size, offset);
+        toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
    }
    toku_free(w.buf);
 }
@@ -88,10 +91,12 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
    if (size > 0) {
        lazy_assert(size>=4); //4 for checksum
        {
-            XMALLOC_N(size, dbuf);
+            ssize_t size_to_malloc = roundup_to_multiple(512, size);
+            XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
            {
-                ssize_t sz_read = toku_os_pread(fd, dbuf, size, offset);
-                lazy_assert(sz_read==size);
+
+                ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
+                lazy_assert(sz_read==size_to_malloc);
            }
            {
                // check the checksum
@@ -118,9 +123,9 @@ deserialize_descriptor_from(int fd, BLOCK_TABLE bt, DESCRIPTOR desc, int layout_
    return r;
 }

-// We only deserialize brt header once and then share everything with all the brts.
-int
-deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
+int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
+// Effect: Deserialize the ft header.
+//   We deserialize brt header only once and then share everything with all the brts.
 {
    int r;
    FT ft = NULL;
@@ -179,14 +184,16 @@ deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)

    //Load translation table
    {
-        unsigned char *XMALLOC_N(translation_size_on_disk, tbuf);
+        size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
+        unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
        {
            // This cast is messed up in 32-bits if the block translation
            // table is ever more than 4GB.  But in that case, the
            // translation table itself won't fit in main memory.
-            ssize_t readsz = toku_os_pread(fd, tbuf, translation_size_on_disk,
+            ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
                                           translation_address_on_disk);
-            lazy_assert(readsz == translation_size_on_disk);
+            assert(readsz >= translation_size_on_disk);
+            assert(readsz <= (ssize_t)size_to_read);
        }
        // Create table and read in data.
        r = toku_blocktable_create_from_buffer(fd,
@@ -427,28 +434,30 @@ serialize_ft_min_size (uint32_t version) {
    return size;
 }

-// Simply reading the raw bytes of the header into an rbuf is insensitive
-// to disk format version.  If that ever changes, then modify this.
-//
-// TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
-// file AND the header is useless
-int
-deserialize_ft_from_fd_into_rbuf(int fd,
+int deserialize_ft_from_fd_into_rbuf(int fd,
                                     toku_off_t offset_of_header,
                                     struct rbuf *rb,
                                     uint64_t *checkpoint_count,
                                     LSN *checkpoint_lsn,
                                     uint32_t * version_p)
+// Effect: Read and parse the header of a fractal tree
+//
+//  Simply reading the raw bytes of the header into an rbuf is insensitive
+//  to disk format version.  If that ever changes, then modify this.
+//
+//  TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
+//  file AND the header is useless
 {
    int r = 0;
    const int64_t prefix_size = 8 + // magic ("tokudata")
                                4 + // version
                                4 + // build_id
                                4;  // size
-    unsigned char prefix[prefix_size];
+    const int64_t read_size = roundup_to_multiple(512, prefix_size);
+    unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
    rb->buf = NULL;
-    int64_t n = toku_os_pread(fd, prefix, prefix_size, offset_of_header);
-    if (n != prefix_size) {
+    int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
+    if (n != read_size) {
        if (n==0) {
            r = TOKUDB_DICTIONARY_NO_HEADER;
        } else if (n<0) {
@@ -504,10 +513,14 @@ deserialize_ft_from_fd_into_rbuf(int fd,

    lazy_assert(rb->ndone==prefix_size);
    rb->size = size;
-    XMALLOC_N(rb->size, rb->buf);
+    {
+        toku_free(rb->buf);
+        uint32_t size_to_read = roundup_to_multiple(512, size);
+        XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);

-    n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header);
-    if (n != rb->size) {
+        assert(offset_of_header%512==0);
+        n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
+        if (n != size_to_read) {
            if (n < 0) {
                r = get_error_errno();
            } else {
@@ -515,6 +528,7 @@ deserialize_ft_from_fd_into_rbuf(int fd,
            }
            goto exit;
        }
+    }
    //It's version 14 or later.  Magic looks OK.
    //We have an rbuf that represents the header.
    //Size is within acceptable bounds.
@@ -549,9 +563,7 @@ deserialize_ft_from_fd_into_rbuf(int fd,

 exit:
    if (r != 0 && rb->buf != NULL) {
-        if (rb->buf != prefix) { // don't free prefix, it's stack alloc'd
        toku_free(rb->buf);
-        }
        rb->buf = NULL;
    }
    return r;
@@ -718,16 +730,23 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
    toku_serialize_translation_to_wbuf(blocktable, fd, &w_translation,
                                               &address_translation,
                                               &size_translation);
-    lazy_assert(size_translation == w_translation.size);
+    assert(size_translation == w_translation.ndone); // the bytes written are the size
+    assert(w_translation.size % 512 == 0);           // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.

    struct wbuf w_main;
    size_t size_main       = toku_serialize_ft_size(h);
-    wbuf_init(&w_main, toku_xmalloc(size_main), size_main);
+    size_t size_main_aligned = roundup_to_multiple(512, size_main);
+    assert(size_main_aligned<BLOCK_ALLOCATOR_HEADER_RESERVE);
+    char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
+    for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
+    wbuf_init(&w_main, mainbuf, size_main);
    toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
    lazy_assert(w_main.ndone == size_main);

-    //Actual Write translation table
-    toku_os_full_pwrite(fd, w_translation.buf, size_translation, address_translation);
+    // Actually write translation table
+    // This write is guaranteed to read good data at the end of the buffer, since the
+    // w_translation.buf is padded with zeros to a 512-byte boundary.
+    toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);

    //Everything but the header MUST be on disk before header starts.
    //Otherwise we will think the header is good and some blocks might not
@@ -746,7 +765,7 @@ void toku_serialize_ft_to (int fd, FT_HEADER h, BLOCK_TABLE blocktable, CACHEFIL
    //   Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
    toku_off_t main_offset;
    main_offset = (h->checkpoint_count & 0x1) ? 0 : BLOCK_ALLOCATOR_HEADER_RESERVE;
-    toku_os_full_pwrite(fd, w_main.buf, w_main.ndone, main_offset);
+    toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
    toku_free(w_main.buf);
    toku_free(w_translation.buf);
 }
--- a/ft/ft_node-serialize.cc
+++ b/ft/ft_node-serialize.cc
@@ -134,7 +134,9 @@ toku_maybe_preallocate_in_file (int fd, int64_t size, int64_t expected_size, int
        to_write += alignup64(min64(file_size + to_write, FILE_CHANGE_INCREMENT), stripe_width);
    }
    if (to_write > 0) {
-        char *XCALLOC_N(to_write, wbuf);
+        assert(to_write%512==0);
+        char *XMALLOC_N_ALIGNED(512, to_write, wbuf);
+        memset(wbuf, 0, to_write);
        toku_off_t start_write = alignup64(file_size, stripe_width);
        invariant(start_write >= file_size);
        toku_os_full_pwrite(fd, wbuf, to_write, start_write);
@@ -773,12 +775,7 @@ serialize_and_compress_sb_node_info(FTNODE node, struct sub_block *sb,
    st->compress_time += t2 - t1;
 }

-// Writes out each child to a separate malloc'd buffer, then compresses
-// all of them, and writes the uncompressed header, to bytes_to_write,
-// which is malloc'd.
-//
-int
-toku_serialize_ftnode_to_memory (FTNODE node,
+int toku_serialize_ftnode_to_memory(FTNODE node,
                                    FTNODE_DISK_DATA* ndd,
                                    unsigned int basementnodesize,
                                    enum toku_compression_method compression_method,
@@ -787,6 +784,12 @@ toku_serialize_ftnode_to_memory (FTNODE node,
                            /*out*/ size_t *n_bytes_to_write,
                            /*out*/ size_t *n_uncompressed_bytes,
                            /*out*/ char  **bytes_to_write)
+// Effect: Writes out each child to a separate malloc'd buffer, then compresses
+//   all of them, and writes the uncompressed header, to bytes_to_write,
+//   which is malloc'd.
+//
+//   The resulting buffer is guaranteed to be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needed).
+//   512-byte padding is for O_DIRECT to work.
 {
    toku_assert_entire_node_in_memory(node);

@@ -849,7 +852,9 @@ toku_serialize_ftnode_to_memory (FTNODE node,
        total_uncompressed_size += sb[i].uncompressed_size + 4;
    }

-    char *XMALLOC_N(total_node_size, data);
+    uint32_t total_buffer_size = roundup_to_multiple(512, total_node_size); // make the buffer be 512 bytes.
+    
+    char *XMALLOC_N_ALIGNED(512, total_buffer_size, data);
    char *curr_ptr = data;
    // now create the final serialized node

@@ -874,9 +879,14 @@ toku_serialize_ftnode_to_memory (FTNODE node,
        *(uint32_t *)curr_ptr = toku_htod32(sb[i].xsum);
        curr_ptr += sizeof(sb[i].xsum);
    }
+    // Zero the rest of the buffer
+    for (uint32_t i=total_node_size; i<total_buffer_size; i++) {
+        data[i]=0;
+    }
+            
    assert(curr_ptr - data == total_node_size);
    *bytes_to_write = data;
-    *n_bytes_to_write = total_node_size;
+    *n_bytes_to_write = total_buffer_size;
    *n_uncompressed_bytes = total_uncompressed_size;

    //
@@ -890,6 +900,8 @@ toku_serialize_ftnode_to_memory (FTNODE node,
        toku_free(sb[i].uncompressed_ptr);
    }

+    assert(0 == (*n_bytes_to_write)%512);
+    assert(0 == ((unsigned long long)(*bytes_to_write))%512);
    toku_free(sb);
    return 0;
 }
@@ -1152,11 +1164,13 @@ void read_block_from_fd_into_rbuf(
    // get the file offset and block size for the block
    DISKOFF offset, size;
    toku_translate_blocknum_to_offset_size(h->blocktable, blocknum, &offset, &size);
-    uint8_t *XMALLOC_N(size, raw_block);
+    DISKOFF size_aligned = roundup_to_multiple(512, size);
+    uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block);
    rbuf_init(rb, raw_block, size);
    // read the block
-    ssize_t rlen = toku_os_pread(fd, raw_block, size, offset);
-    lazy_assert((DISKOFF)rlen == size);
+    ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
+    assert((DISKOFF)rlen >= size);
+    assert((DISKOFF)rlen <= size_aligned);
 }

 static const int read_header_heuristic_max = 32*1024;
@@ -1170,8 +1184,8 @@ static void read_ftnode_header_from_fd_into_rbuf_if_small_enough (int fd, BLOCKN
 {
    DISKOFF offset, size;
    toku_translate_blocknum_to_offset_size(ft->blocktable, blocknum, &offset, &size);
-    DISKOFF read_size = MIN(read_header_heuristic_max, size);
-    uint8_t *XMALLOC_N(size, raw_block);
+    DISKOFF read_size = roundup_to_multiple(512, MIN(read_header_heuristic_max, size));
+    uint8_t *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, size), raw_block);
    rbuf_init(rb, raw_block, read_size);

    // read the block
@@ -2418,14 +2432,20 @@ toku_deserialize_bp_from_disk(FTNODE node, FTNODE_DISK_DATA ndd, int childnum, i
    uint32_t curr_size   = BP_SIZE (ndd, childnum);
    struct rbuf rb = {.buf = NULL, .size = 0, .ndone = 0};

-    uint8_t *XMALLOC_N(curr_size, raw_block);
-    rbuf_init(&rb, raw_block, curr_size);
+    uint32_t pad_at_beginning = (node_offset+curr_offset)%512;
+    uint32_t padded_size = roundup_to_multiple(512, pad_at_beginning + curr_size);

+    uint8_t *XMALLOC_N_ALIGNED(512, padded_size, raw_block);
+    rbuf_init(&rb, pad_at_beginning+raw_block, curr_size);
    tokutime_t t0 = toku_time_now();

-    // read
-    ssize_t rlen = toku_os_pread(fd, raw_block, curr_size, node_offset+curr_offset);
-    lazy_assert((DISKOFF)rlen == curr_size);
+    // read the block
+    assert(0==((unsigned long long)raw_block)%512); // for O_DIRECT
+    assert(0==(padded_size)%512);
+    assert(0==(node_offset+curr_offset-pad_at_beginning)%512);
+    ssize_t rlen = toku_os_pread(fd, raw_block, padded_size, node_offset+curr_offset-pad_at_beginning);
+    assert((DISKOFF)rlen >= pad_at_beginning + curr_size); // we read in at least enough to get what we wanted
+    assert((DISKOFF)rlen <= padded_size);                  // we didn't read in too much.

    tokutime_t t1 = toku_time_now();

@@ -2627,12 +2647,14 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
                                       struct sub_block sub_block[/*n_sub_blocks*/],
                                       enum toku_compression_method method,
                               /*out*/ size_t *n_bytes_to_write,
-                               /*out*/ char  **bytes_to_write) {
+                               /*out*/ char  **bytes_to_write)
+// Guarantees that the malloc'd BYTES_TO_WRITE is 512-byte aligned (so that O_DIRECT will work)
+{
    // allocate space for the compressed uncompressed_buf
    size_t compressed_len = get_sum_compressed_size_bound(n_sub_blocks, sub_block, method);
    size_t sub_block_header_len = sub_block_header_size(n_sub_blocks);
    size_t header_len = node_header_overhead + sub_block_header_len + sizeof (uint32_t); // node + sub_block + checksum
-    char *XMALLOC_N(header_len + compressed_len, compressed_buf);
+    char *XMALLOC_N_ALIGNED(512, roundup_to_multiple(512, header_len + compressed_len), compressed_buf);

    // copy the header
    memcpy(compressed_buf, uncompressed_buf, node_header_overhead);
@@ -2662,7 +2684,12 @@ serialize_uncompressed_block_to_memory(char * uncompressed_buf,
    uint32_t xsum = x1764_memory(compressed_buf, header_length);
    *ptr = toku_htod32(xsum);

-    *n_bytes_to_write = header_len + compressed_len;
+    uint32_t padded_len = roundup_to_multiple(512, header_len + compressed_len);
+    // Zero out padding.
+    for (uint32_t i = header_len+compressed_len; i < padded_len; i++) {
+        compressed_buf[i] = 0;
+    }
+    *n_bytes_to_write = padded_len;
    *bytes_to_write   = compressed_buf;
 }

@@ -2933,11 +2960,13 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
    int r = 0;
    if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);

-    uint8_t *XMALLOC_N(size, raw_block);
+    DISKOFF size_aligned = roundup_to_multiple(512, size); // direct I/O reads whole 512-byte units, so round the block size up
+    uint8_t *XMALLOC_N_ALIGNED(512, size_aligned, raw_block); // buffer must hold size_aligned bytes: the pread below fills up to that many
    {
        // read the (partially compressed) block
-        ssize_t rlen = toku_os_pread(fd, raw_block, size, offset);
-        lazy_assert((DISKOFF)rlen == size);
+        ssize_t rlen = toku_os_pread(fd, raw_block, size_aligned, offset);
+        lazy_assert((DISKOFF)rlen >= size);         // got at least the real block
+        lazy_assert((DISKOFF)rlen <= size_aligned); // and no more than the buffer holds
    }
    // get the layout_version
    int layout_version;

--- a/ft/ftloader.cc
+++ b/ft/ftloader.cc
@@ -2102,6 +2102,7 @@ static void allocate_node (struct subtrees_info *sts, int64_t b) {
    sts->n_subtrees++;
 }

+// dbuf will always contain a 512-byte aligned buffer, but the length might not be a multiple of 512 bytes.  If you need the length to be a multiple of 512, pad it yourself.
 struct dbuf {
    unsigned char *buf;
    int buflen;
@@ -2225,7 +2226,7 @@ static void putbuf_bytes (struct dbuf *dbuf, const void *bytes, int nbytes) {
        int oldbuflen = dbuf->buflen;
        dbuf->buflen += dbuf->off + nbytes;
        dbuf->buflen *= 2;
-        REALLOC_N(dbuf->buflen, dbuf->buf);
+        REALLOC_N_ALIGNED(512, dbuf->buflen, dbuf->buf);
        if (dbuf->buf == NULL) {
            dbuf->error = get_error_errno();
            dbuf->buf = oldbuf;
@@ -2905,9 +2906,17 @@ static int write_translation_table (struct dbout *out, long long *off_of_transla
    }
    unsigned int checksum = x1764_memory(ttable.buf, ttable.off);
    putbuf_int32(&ttable, checksum);
+    // pad it with zeros up to the next multiple of 512 bytes
+    long long encoded_length = ttable.off;
+    {
+        int nbytes_to_add = roundup_to_multiple(512, ttable.off) - encoded_length;
+        char zeros[nbytes_to_add];
+        for (int i=0; i<nbytes_to_add; i++) zeros[i]=0;
+        putbuf_bytes(&ttable, zeros, nbytes_to_add);
+    }
    int result = ttable.error;
    if (result == 0) {
-        invariant(bt_size_on_disk==ttable.off);
+        invariant(bt_size_on_disk==encoded_length);
        result = toku_os_pwrite(out->fd, ttable.buf, ttable.off, off_of_translation);
    }
    dbuf_destroy(&ttable);
@@ -2919,18 +2928,22 @@ static int
 write_header (struct dbout *out, long long translation_location_on_disk, long long translation_size_on_disk) {
    int result = 0;
    size_t size = toku_serialize_ft_size(out->h->h);
+    size_t alloced_size = roundup_to_multiple(512, size);
    struct wbuf wbuf;
-    char *MALLOC_N(size, buf);
+    char *MALLOC_N_ALIGNED(512, alloced_size, buf);
    if (buf == NULL) {
        result = get_error_errno();
    } else {
        wbuf_init(&wbuf, buf, size);
        out->h->h->on_disk_stats = out->h->in_memory_stats;
        toku_serialize_ft_to_wbuf(&wbuf, out->h->h, translation_location_on_disk, translation_size_on_disk);
+        for (size_t i=size; i<alloced_size; i++) buf[i]=0; // initialize all those unused spots to zero
        if (wbuf.ndone != size)
            result = EINVAL;
-        else
-            result = toku_os_pwrite(out->fd, wbuf.buf, wbuf.ndone, 0);
+        else {
+            assert(wbuf.ndone <= alloced_size);
+            result = toku_os_pwrite(out->fd, wbuf.buf, alloced_size, 0);
+        }
        toku_free(buf);
    }
    return result;

--- a/ft/tests/block_allocator_test.cc
+++ b/ft/tests/block_allocator_test.cc
@@ -8,19 +8,22 @@

 static void ba_alloc_at (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t offset) {
    block_allocator_validate(ba);
-    block_allocator_alloc_block_at(ba, size, offset);
+    block_allocator_alloc_block_at(ba, size*512, offset*512); // the helper's size and offset are in 512-byte units; scale both to bytes
    block_allocator_validate(ba);
 }

 static void ba_alloc (BLOCK_ALLOCATOR ba, uint64_t size, uint64_t *answer) {
    block_allocator_validate(ba);
-    block_allocator_alloc_block(ba, size, answer);
+    uint64_t actual_answer; // byte offset filled in by the allocator
+    block_allocator_alloc_block(ba, 512*size, &actual_answer); // the helper's size is in 512-byte units; scale to bytes
    block_allocator_validate(ba);
+    assert(actual_answer%512==0); // the allocator must hand back a multiple of 512
+    *answer = actual_answer/512; // report the offset to the caller in 512-byte units
 }

 static void ba_free (BLOCK_ALLOCATOR ba, uint64_t offset) {
    block_allocator_validate(ba);
-    block_allocator_free_block(ba, offset);
+    block_allocator_free_block(ba, offset*512); // the helper's offset is in 512-byte units; scale to bytes
    block_allocator_validate(ba);
 }

@@ -30,8 +33,8 @@ ba_check_l (BLOCK_ALLOCATOR ba, uint64_t blocknum_in_layout_order, uint64_t expe
    uint64_t actual_offset, actual_size;
    int r = block_allocator_get_nth_block_in_layout_order(ba, blocknum_in_layout_order, &actual_offset, &actual_size);
    assert(r==0);
-    assert(expected_offset == actual_offset);
-    assert(expected_size   == actual_size);
+    assert(expected_offset*512 == actual_offset);
+    assert(expected_size  *512 == actual_size);
 }

 static void
@@ -48,10 +51,10 @@ static void
 test_ba0 (void) {
    BLOCK_ALLOCATOR ba;
    uint64_t b0, b1;
-    create_block_allocator(&ba, 100, 1);
-    assert(block_allocator_allocated_limit(ba)==100);
+    create_block_allocator(&ba, 100*512, 1*512);
+    assert(block_allocator_allocated_limit(ba)==100*512);
    ba_alloc_at(ba, 50, 100);
-    assert(block_allocator_allocated_limit(ba)==150);
+    assert(block_allocator_allocated_limit(ba)==150*512);
    ba_alloc_at(ba, 25, 150);
    ba_alloc   (ba, 10, &b0);
    ba_check_l (ba, 0, 0,   100);
@@ -66,9 +69,9 @@ test_ba0 (void) {
    assert(b0==160);
    ba_alloc(ba, 10, &b0);
    ba_alloc(ba, 113, &b1);
-    assert(113==block_allocator_block_size(ba, b1));
-    assert(10==block_allocator_block_size(ba, b0));
-    assert(50==block_allocator_block_size(ba, 100));
+    assert(113*512==block_allocator_block_size(ba, b1 *512));
+    assert(10 *512==block_allocator_block_size(ba, b0 *512));
+    assert(50 *512==block_allocator_block_size(ba, 100*512));

    uint64_t b2, b3, b4, b5, b6, b7;
    ba_alloc(ba, 100, &b2);     
@@ -103,7 +106,7 @@ test_ba0 (void) {
 static void
 test_ba1 (int n_initial) {
    BLOCK_ALLOCATOR ba;
-    create_block_allocator(&ba, 0, 1);
+    create_block_allocator(&ba, 0*512, 1*512);
    int i;
    int n_blocks=0;
    uint64_t blocks[1000];
@@ -136,8 +139,8 @@ test_ba2 (void)
    BLOCK_ALLOCATOR ba;
    uint64_t b[6];
    enum { BSIZE = 1024 };
-    create_block_allocator(&ba, 100, BSIZE);
-    assert(block_allocator_allocated_limit(ba)==100);
+    create_block_allocator(&ba, 100*512, BSIZE*512);
+    assert(block_allocator_allocated_limit(ba)==100*512);
    ba_check_l    (ba, 0, 0, 100);
    ba_check_none (ba, 1);


--- a/ft/tests/ftloader-test-merge-files-dbufio.cc
+++ b/ft/tests/ftloader-test-merge-files-dbufio.cc
@@ -188,7 +188,7 @@ static void *my_malloc(size_t n) {
            }
        }
    }
-    return malloc(n);
+    return os_malloc(n);
 }

 static int do_realloc_errors = 1;
@@ -207,7 +207,7 @@ static void *my_realloc(void *p, size_t n) {
            }
        }
    }
-    return realloc(p, n);
+    return os_realloc(p, n);
 }



--- a/ft/tests/ftloader-test-open.cc
+++ b/ft/tests/ftloader-test-open.cc
@@ -28,7 +28,7 @@ static void *my_malloc(size_t n) {
        errno = ENOSPC;
        return NULL;
    } else
-        return malloc(n);
+        return os_malloc(n);
 }

 static int my_compare(DB *UU(desc), const DBT *UU(akey), const DBT *UU(bkey)) {

--- a/portability/CMakeLists.txt
+++ b/portability/CMakeLists.txt
@@ -17,6 +17,7 @@ target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT

 add_library(tokuportability_static_conv STATIC ${tokuportability_srcs})
 set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON)
+add_dependencies(tokuportability_static_conv build_jemalloc)
 set(tokuportability_source_libs tokuportability_static_conv jemalloc ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
 merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}")


--- a/portability/file.cc
+++ b/portability/file.cc
@@ -196,6 +196,8 @@ toku_os_write (int fd, const void *buf, size_t len) {

 void
 toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
+    assert(0==((long long)buf)%512);
+    assert((len%512 == 0) && (off%512)==0); // to make pwrite work.
    const char *bp = (const char *) buf;
    while (len > 0) {
        ssize_t r;
@@ -218,6 +220,9 @@ toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {

 ssize_t
 toku_os_pwrite (int fd, const void *buf, size_t len, toku_off_t off) {
+    assert(0==((long long)buf)%512); // these asserts are to ensure that direct I/O will work.
+    assert(0==len             %512);
+    assert(0==off             %512);
    const char *bp = (const char *) buf;
    ssize_t result = 0;
    while (len > 0) {
@@ -269,6 +274,25 @@ toku_os_open(const char *path, int oflag, int mode) {
    return rval;
 }

+// Open PATH the way toku_os_open() does, but request direct I/O on the resulting descriptor.
+// With HAVE_O_DIRECT the O_DIRECT flag is OR'ed into OFLAG; otherwise, with HAVE_F_NOCACHE,
+// the file is opened normally and F_NOCACHE is then set on the descriptor with fcntl().
+// Returns whatever toku_os_open() returned.
+int
+toku_os_open_direct(const char *path, int oflag, int mode) {
+    int rval;
+#if defined(HAVE_O_DIRECT)
+    rval = toku_os_open(path, oflag | O_DIRECT, mode);
+#elif defined(HAVE_F_NOCACHE)
+    rval = toku_os_open(path, oflag, mode);
+    if (rval >= 0) {
+        int r = fcntl(rval, F_NOCACHE, 1);
+        if (r == -1) {
+            // A failure to set F_NOCACHE is only reported; the open descriptor is still returned.
+            perror("setting F_NOCACHE");
+        }
+    }
+#else
+# error "No direct I/O implementation found."
+#endif
+    return rval;
+}
+
 int
 toku_os_fclose(FILE * stream) {  
    int rval = -1;
@@ -310,6 +334,9 @@ toku_os_read(int fd, void *buf, size_t count) {

 ssize_t
 toku_os_pread (int fd, void *buf, size_t count, off_t offset) {
+    assert(0==((long long)buf)%512);
+    assert(0==count%512);
+    assert(0==offset%512);
    ssize_t r;
    if (t_pread) {
 	r = t_pread(fd, buf, count, offset);

--- a/portability/memory.cc
+++ b/portability/memory.cc
@@ -21,17 +21,17 @@
 #include <portability/toku_atomic.h>

 static malloc_fun_t  t_malloc  = 0;
+static malloc_aligned_fun_t t_malloc_aligned = 0;
 static malloc_fun_t  t_xmalloc = 0;
+static malloc_aligned_fun_t t_xmalloc_aligned = 0;
 static free_fun_t    t_free    = 0;
 static realloc_fun_t t_realloc = 0;
+static realloc_aligned_fun_t t_realloc_aligned = 0;
 static realloc_fun_t t_xrealloc = 0;

 static LOCAL_MEMORY_STATUS_S status;
 int toku_memory_do_stats = 0;

-typedef size_t (*malloc_usable_size_fun_t)(const void *);
-static malloc_usable_size_fun_t malloc_usable_size_f;
-
 static bool memory_startup_complete;

 int
@@ -76,14 +76,6 @@ toku_memory_startup(void) {
        }
    }

-    malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_usable_size");
-    if (!malloc_usable_size_f) {
-        malloc_usable_size_f = (malloc_usable_size_fun_t) dlsym(RTLD_DEFAULT, "malloc_size"); // darwin
-        if (!malloc_usable_size_f) {
-            result = EINVAL; // couldn't find a malloc size function
-        }
-    }
-
    return result;
 }

@@ -105,7 +97,7 @@ toku_memory_get_status(LOCAL_MEMORY_STATUS s) {
 // jemalloc's malloc_usable_size does not work with a NULL pointer, so we implement a version that works
 static size_t
 my_malloc_usable_size(void *p) {
-    return p == NULL ? 0 : malloc_usable_size_f(p);
+    return p == NULL ? 0 : os_malloc_usable_size(p);
 }

 // Note that max_in_use may be slightly off because use of max_in_use is not thread-safe.
@@ -162,6 +154,23 @@ toku_malloc(size_t size) {
  return p;
 }

+// Allocate SIZE bytes at an address that is a multiple of ALIGNMENT.
+// Uses the t_malloc_aligned hook when one is installed, otherwise os_malloc_aligned().
+// Returns NULL, after counting the failure in status.malloc_fail, if the allocation fails.
+void *toku_malloc_aligned(size_t alignment, size_t size) {
+    void *p;
+    if (t_malloc_aligned) {
+        p = t_malloc_aligned(alignment, size);
+    } else {
+        p = os_malloc_aligned(alignment, size);
+    }
+    if (!p) {
+        toku_sync_add_and_fetch(&status.malloc_fail, 1);
+        return p;
+    }
+    TOKU_ANNOTATE_NEW_MEMORY(p, size); // see #4671 and https://bugs.kde.org/show_bug.cgi?id=297147
+    if (toku_memory_do_stats) {
+        // Account for both what was asked for and what the allocator actually handed out.
+        size_t usable = my_malloc_usable_size(p);
+        toku_sync_add_and_fetch(&status.malloc_count, 1);
+        toku_sync_add_and_fetch(&status.requested, size);
+        toku_sync_add_and_fetch(&status.used, usable);
+        set_max(status.used, status.freed);
+    }
+    return p;
+}
+
 void *
 toku_calloc(size_t nmemb, size_t size) {
    size_t newsize = nmemb * size;
@@ -189,6 +198,25 @@ toku_realloc(void *p, size_t size) {
    return q;
 }

+// Resize the block at P to SIZE bytes, returning an address that is a multiple of ALIGNMENT.
+// Uses the t_realloc_aligned hook when one is installed, otherwise os_realloc_aligned().
+// Returns NULL, after counting the failure in status.realloc_fail, if the reallocation fails.
+void *toku_realloc_aligned(size_t alignment, void *p, size_t size) {
+    // Measure the old block before it is resized so its usable size can be counted as freed.
+    size_t old_usable = 0;
+    if (p) {
+        old_usable = my_malloc_usable_size(p);
+    }
+    void *q;
+    if (t_realloc_aligned) {
+        q = t_realloc_aligned(alignment, p, size);
+    } else {
+        q = os_realloc_aligned(alignment, p, size);
+    }
+    if (!q) {
+        toku_sync_add_and_fetch(&status.realloc_fail, 1);
+        return q;
+    }
+    if (toku_memory_do_stats) {
+        size_t new_usable = my_malloc_usable_size(q);
+        toku_sync_add_and_fetch(&status.realloc_count, 1);
+        toku_sync_add_and_fetch(&status.requested, size);
+        toku_sync_add_and_fetch(&status.used, new_usable);
+        toku_sync_add_and_fetch(&status.freed, old_usable);
+        set_max(status.used, status.freed);
+    }
+    return q;
+}
+
+
 void *
 toku_memdup(const void *v, size_t len) {
    void *p = toku_malloc(len);
@@ -232,6 +260,23 @@ toku_xmalloc(size_t size) {
    return p;
 }

+void* toku_xmalloc_aligned(size_t alignment, size_t size)
+// Effect: Allocate SIZE bytes at an address that is a multiple of ALIGNMENT, using the
+//  t_xmalloc_aligned hook when one is installed and os_malloc_aligned() otherwise.
+//  A failed allocation trips resource_assert; no error code is returned.
+// Requires: alignment is a power of two.
+{
+    void *p;
+    if (t_xmalloc_aligned) {
+        p = t_xmalloc_aligned(alignment, size);
+    } else {
+        p = os_malloc_aligned(alignment,size);
+    }
+    resource_assert(p);
+    if (!toku_memory_do_stats) {
+        return p;
+    }
+    size_t usable = my_malloc_usable_size(p);
+    toku_sync_add_and_fetch(&status.malloc_count, 1);
+    toku_sync_add_and_fetch(&status.requested, size);
+    toku_sync_add_and_fetch(&status.used, usable);
+    set_max(status.used, status.freed);
+    return p;
+}
+
 void *
 toku_xcalloc(size_t nmemb, size_t size) {
    size_t newsize = nmemb * size;

--- a/portability/os_malloc.cc
+++ b/portability/os_malloc.cc
@@ -8,11 +8,174 @@

 #include <toku_portability.h>
 #include <stdlib.h>
+#include <jemalloc/include/jemalloc/jemalloc.h>
 #if defined(HAVE_MALLOC_H)
 # include <malloc.h>
 #elif defined(HAVE_SYS_MALLOC_H)
 # include <sys/malloc.h>
 #endif
+#include <dlfcn.h>
+
+#include <string.h>
+
+// #define this to use a version of os_malloc that helps to debug certain features.
+// This version uses the real malloc (so that valgrind should still work) but it forces things to be slightly
+// misaligned (in particular, avoiding 512-byte alignment if possible), to find situations where O_DIRECT will fail.
+// #define USE_DEBUGGING_MALLOCS
+
+#ifdef USE_DEBUGGING_MALLOCS
+#include <pthread.h>
+
+// Make things misaligned on 512-byte boundaries
+// malloced_now_count is the number of records in use in malloced_now; malloced_now_size is its capacity.
+static size_t malloced_now_count=0, malloced_now_size=0;
+// One record per allocation handed out by the debugging allocator.
+struct malloc_pair {
+    void *returned_pointer; // address given to the caller; the key find_malloced_pair() searches for
+    void *true_pointer;     // address obtained from the underlying allocator; os_free() passes this to free()
+    size_t requested_size = 0; // size the caller asked for
+};
+// Growable array of records; push_to_malloced_memory() appends to it while holding malloc_mutex.
+static struct malloc_pair *malloced_now;
+static pthread_mutex_t malloc_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+// Acquire malloc_mutex; a locking error trips the assert.
+static void malloc_lock(void)
+{
+    int rc = pthread_mutex_lock(&malloc_mutex);
+    assert(rc==0);
+}
+// Release malloc_mutex; an unlocking error trips the assert.
+static void malloc_unlock(void)
+{
+    int rc = pthread_mutex_unlock(&malloc_mutex);
+    assert(rc==0);
+}
+
+// Append a record for a new allocation to the malloced_now table, growing the table when it is full.
+//  returned_pointer: address handed to the caller.
+//  true_pointer:     address that must eventually be passed to free().
+//  requested_size:   size the caller asked for.
+// Takes and releases malloc_mutex itself.  Aborts if the table cannot be grown.
+static void push_to_malloced_memory(void *returned_pointer, void *true_pointer, size_t requested_size) {
+    malloc_lock();
+    if (malloced_now_count == malloced_now_size) {
+        size_t new_size = 2*malloced_now_size + 1;
+        // Keep the old table until realloc has succeeded: a failed realloc returns NULL, and storing that
+        // straight into malloced_now would send the writes below through a null pointer.
+        struct malloc_pair *grown = (struct malloc_pair *)realloc(malloced_now, new_size * sizeof(*malloced_now));
+        if (grown == NULL) {
+            abort(); // the debugging allocator cannot track allocations without its table
+        }
+        malloced_now      = grown;
+        malloced_now_size = new_size;
+    }
+    malloced_now[malloced_now_count].returned_pointer = returned_pointer;
+    malloced_now[malloced_now_count].true_pointer     = true_pointer;
+    malloced_now[malloced_now_count].requested_size   = requested_size;
+    malloced_now_count++;
+    malloc_unlock();
+}
+
+static struct malloc_pair *find_malloced_pair(const void *p)
+// Effect: Linearly search the malloced_now table for the record whose returned_pointer equals P.
+//  Return a pointer to that record, or NULL if there is none.
+// Requires: Lock must be held before calling.
+{
+    size_t idx = 0;
+    while (idx < malloced_now_count) {
+        struct malloc_pair *candidate = &malloced_now[idx];
+        if (candidate->returned_pointer == p) {
+            return candidate;
+        }
+        idx++;
+    }
+    return NULL;
+}
+
+// Debugging malloc: allocate SIZE bytes with the real malloc, but never hand out a 512-byte-aligned address.
+// 16 extra bytes are allocated so that a 512-aligned result can be shifted forward by 16 bytes.
+// Returns NULL if the underlying malloc fails.
+void *os_malloc(size_t size) {
+    void  *raw_ptr   = malloc(size+16); // allocate 16 extra bytes
+    if (raw_ptr == NULL) {
+        // NULL is 0 mod 512, so without this check a failed malloc would be "skewed" into the bogus
+        // non-null address 16 and recorded as a live allocation.
+        return NULL;
+    }
+    size_t raw_ptr_i = (size_t) raw_ptr;
+    void  *result    = raw_ptr;
+    if (raw_ptr_i%512==0) {
+        result = 16+(char*)raw_ptr;
+    }
+    push_to_malloced_memory(result, raw_ptr, size);
+    return result;
+}
+
+void *os_malloc_aligned(size_t alignment, size_t size)
+// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+//  Debugging variant: unless ALIGNMENT is itself a multiple of 512, the returned pointer is deliberately
+//  not 512-byte aligned.  Every returned pointer is recorded so that os_free(), os_realloc_aligned() and
+//  os_malloc_usable_size() can find it.
+//  Returns NULL with errno set if the allocation fails.
+// Requires: alignment is a power of two.
+{
+    // posix_memalign rejects alignments smaller than sizeof(void*); a pointer aligned to sizeof(void*)
+    // is also aligned to every smaller power of two.
+    if (alignment < sizeof(void*)) {
+        alignment = sizeof(void*);
+    }
+    // A caller asking for a multiple of 512 must get 512-byte alignment.  Otherwise over-allocate by one
+    // alignment unit so that a 512-aligned result can be shifted forward while staying ALIGNMENT-aligned.
+    bool must_be_512_aligned = (alignment%512==0);
+    void *raw_ptr;
+    int r = posix_memalign(&raw_ptr, alignment, must_be_512_aligned ? size : size+alignment);
+    if (r != 0) {
+        errno = r;
+        return nullptr;
+    }
+    void *result = raw_ptr;
+    if (!must_be_512_aligned && ((size_t) raw_ptr)%512==0) {
+        // Make sure it isn't 512-byte aligned
+        result = alignment+(char*)raw_ptr;
+    }
+    push_to_malloced_memory(result, raw_ptr, size);
+    return result;
+}
+
+// Return the smaller of A and B.
+static size_t min(size_t a, size_t b) {
+    return (a<b) ? a : b;
+}
+
+// Debugging realloc: pick an alignment from SIZE (1, 4, 8 or 16 bytes) and delegate to os_realloc_aligned().
+void *os_realloc(void *p, size_t size) {
+    size_t alignment = 16;
+    if (size<4) {
+        alignment = 1;
+    } else if (size<8) {
+        alignment = 4;
+    } else if (size<16) {
+        alignment = 8;
+    }
+    return os_realloc_aligned(alignment, p, size);
+}
+
+void * os_realloc_aligned(size_t alignment, void *p, size_t size)
+// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+//  Debugging variant: always allocates a fresh block, copies the smaller of the old requested size and SIZE
+//  bytes into it, and frees the old block.
+//  Returns NULL if the new block cannot be allocated; in that case P is left untouched.
+// Requires: alignment is a power of two.
+{
+    if (p==NULL) {
+        return os_malloc_aligned(alignment, size);
+    }
+    void *result = os_malloc_aligned(alignment, size);
+    if (result==NULL) {
+        // Like realloc, leave the original block alone on failure rather than copying into a null
+        // pointer and freeing the caller's data.
+        return NULL;
+    }
+    malloc_lock();
+    struct malloc_pair *mp = find_malloced_pair(p);
+    assert(mp);
+    // now copy all the good stuff from p to result
+    memcpy(result, p, min(size, mp->requested_size));
+    malloc_unlock();
+    os_free(p);
+    return result;
+}
+
+
+// Debugging free: look up the record for P, free the underlying allocation, and remove the record by
+// moving the last record of the table into its slot.  Like free(), a NULL P is a no-op.
+void os_free(void* p) {
+    if (p==NULL) {
+        return; // NULL is never recorded in the table, so the lookup below would trip the assert
+    }
+    malloc_lock();
+    struct malloc_pair *mp = find_malloced_pair(p);
+    assert(mp);
+    free(mp->true_pointer);
+    *mp = malloced_now[--malloced_now_count];
+    malloc_unlock();
+}
+
+// Debugging variant: report the size that was originally requested for P, as recorded in the table.
+// Returns 0 for a NULL P, matching the non-debugging implementation.
+size_t os_malloc_usable_size(const void *p) {
+    if (p==NULL) {
+        return 0;
+    }
+    malloc_lock();
+    struct malloc_pair *mp = find_malloced_pair(p);
+    assert(mp);
+    size_t size = mp->requested_size;
+    malloc_unlock();
+    return size;
+}
+
+#else

 void *
 os_malloc(size_t size)
@@ -20,14 +183,81 @@ os_malloc(size_t size)
    return malloc(size);
 }

+void *os_malloc_aligned(size_t alignment, size_t size)
+// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+//  Returns NULL with errno set to the posix_memalign error code if the allocation fails.
+// Requires: alignment is a power of two.
+{
+    // posix_memalign additionally requires a multiple of sizeof(void*), so the small powers of two would be
+    // rejected with EINVAL.  A pointer aligned to sizeof(void*) is also aligned to every smaller power of two.
+    if (alignment < sizeof(void*)) {
+        alignment = sizeof(void*);
+    }
+    void *p;
+    int r = posix_memalign(&p, alignment, size);
+    if (r != 0) {
+        errno = r;
+        p = nullptr;
+    }
+    return p;
+}
+
 void *
 os_realloc(void *p, size_t size)
 {
    return realloc(p, size);
 }

+void * os_realloc_aligned(size_t alignment, void *p, size_t size)
+// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+//  A NULL P is handled as a plain aligned allocation.
+// Requires: alignment is a power of two.
+{
+#if 1
+    if (p==NULL) {
+        return os_malloc_aligned(alignment, size);
+    } else {
+        // Let realloc resize the block first; fall back to allocate-and-copy only when its result is misaligned.
+        // A NULL result from realloc has remainder 0 in the test below, so it is returned to the caller as is.
+        void *newp = realloc(p, size);
+        if (0!=((long long)newp%alignment)) {
+            // it's not aligned, so align it ourselves.
+            // NOTE(review): newp2 is not checked for NULL before the memcpy -- confirm the intended
+            // out-of-memory behavior here (the block P has already been moved by realloc at this point).
+            void *newp2 = os_malloc_aligned(alignment, size);
+            memcpy(newp2, newp, size);
+            free(newp);
+            newp = newp2;
+        }
+        return newp;
+    }
+#else
+    // THIS STUFF SEEMS TO FAIL VALGRIND
+    // Disabled alternative: resize in one step with rallocm() and ALLOCM_ALIGN.
+    if (p==NULL) {
+        return os_malloc_aligned(alignment, size);
+    } else {
+        size_t ignore;
+        int r = rallocm(&p,        // returned pointer
+                        &ignore,   // actual size of returned object.
+                        size,      // the size we want
+                        0,         // extra bytes to "try" to allocate at the end
+                        ALLOCM_ALIGN(alignment));
+        if (r!=0) return NULL;
+        else return p;
+    }
+#endif
+}
+
+
 void
 os_free(void* p)
 {
    free(p);
 }
+
+typedef size_t (*malloc_usable_size_fun_t)(const void *);
+// Cached address of the allocator's size-query function; looked up on the first non-NULL query.
+static malloc_usable_size_fun_t malloc_usable_size_f = NULL;
+
+// Return the allocator's usable size for the block at P, or 0 when P is NULL.
+// The size function is located with dlsym() the first time it is needed: "malloc_usable_size" is tried
+// first, then "malloc_size".  The process aborts if neither symbol can be found.
+size_t os_malloc_usable_size(const void *p) {
+    if (p==NULL) {
+        return 0;
+    }
+    if (malloc_usable_size_f == NULL) {
+        void *sym = dlsym(RTLD_DEFAULT, "malloc_usable_size");
+        if (sym == NULL) {
+            sym = dlsym(RTLD_DEFAULT, "malloc_size"); // darwin
+        }
+        if (sym == NULL) {
+            abort(); // couldn't find a malloc size function
+        }
+        malloc_usable_size_f = (malloc_usable_size_fun_t) sym;
+    }
+    return malloc_usable_size_f(p);
+}
+#endif
--- a/portability/tests/test-cache-line-boundary-fails.cc
+++ b/portability/tests/test-cache-line-boundary-fails.cc
@@ -50,9 +50,8 @@ int test_main(int UU(argc), char *const argv[] UU()) {
    }

    {
-        struct unpackedsevenbytestruct *usevenbytestructs;
-        int r = posix_memalign((void **) &usevenbytestructs, cachelinesize, sizeof(unpackedsevenbytestruct) * 10);
-        if (r) {
+        struct unpackedsevenbytestruct *MALLOC_N_ALIGNED(cachelinesize, 10, usevenbytestructs);
+        if (usevenbytestructs == NULL) {
            // this test is supposed to crash, so exiting cleanly is a failure
            perror("posix_memalign");
            exit(EXIT_FAILURE);
@@ -65,8 +64,9 @@ int test_main(int UU(argc), char *const argv[] UU()) {
        toku_free(usevenbytestructs);
    }

-    int r = posix_memalign((void **) &psevenbytestructs, cachelinesize, sizeof(packedsevenbytestruct) * 10);
-    if (r) {
+    
+    MALLOC_N_ALIGNED(cachelinesize, 10, psevenbytestructs);
+    if (psevenbytestructs == NULL) {
        // this test is supposed to crash, so exiting cleanly is a failure
        perror("posix_memalign");
        exit(EXIT_FAILURE);

--- a/portability/tests/test-pwrite4g.cc
+++ b/portability/tests/test-pwrite4g.cc
@@ -7,6 +7,7 @@
 #include <test.h>
 #include <fcntl.h>
 #include <toku_assert.h>
+#include <memory.h>
 #include <string.h>
 #include <stdio.h>

@@ -28,8 +29,9 @@ int test_main(int argc, char *const argv[]) {
    unlink(fname);
    int fd = open(fname, O_RDWR | O_CREAT | O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO);
    assert(fd>=0);
-    char buf[] = "hello";
-    int64_t offset = (1LL<<32) + 100;
+    char *XMALLOC_N_ALIGNED(512, 512, buf);
+    strcpy(buf, "hello");
+    int64_t offset = (1LL<<32) + 512;
    toku_os_full_pwrite(fd, buf, sizeof buf, offset);
    char newbuf[sizeof buf];
    r = pread(fd, newbuf, sizeof newbuf, 100);
@@ -42,6 +44,7 @@ int test_main(int argc, char *const argv[]) {
    r = toku_os_get_file_size(fd, &fsize);
    assert(r == 0);
    assert(fsize > 100 + (signed)sizeof(buf));
+    toku_free(buf);
    r = close(fd);
    assert(r==0);
    return 0;

--- a/src/export.map
+++ b/src/export.map
@@ -4,6 +4,7 @@
   db_env_create;
   db_strerror;
   db_version;
+   db_env_set_direct_io;
   db_env_set_func_fsync;
   db_env_set_func_malloc;
   db_env_set_func_realloc;

--- a/src/ydb_env_func.cc
+++ b/src/ydb_env_func.cc
@@ -27,6 +27,10 @@ void * checkpoint_callback2_extra     = NULL;

 bool engine_status_enable = true; // if false, suppress engine status output on failed assert, for test programs only

+// Exported setter for the direct I/O switch: forwards DIRECT_IO_ON unchanged to toku_ft_set_direct_io().
+void db_env_set_direct_io (bool direct_io_on) {
+    toku_ft_set_direct_io(direct_io_on);
+}
+
 void db_env_set_func_fsync (int (*fsync_function)(int)) {
    toku_set_func_fsync(fsync_function);
 }

--- a/toku_include/config.h.in
+++ b/toku_include/config.h.in
@@ -44,6 +44,8 @@

 #cmakedefine HAVE_M_MMAP_THRESHOLD 1
 #cmakedefine HAVE_CLOCK_REALTIME 1
+#cmakedefine HAVE_O_DIRECT 1
+#cmakedefine HAVE_F_NOCACHE 1

 #cmakedefine HAVE_MALLOC_SIZE 1
 #cmakedefine HAVE_MALLOC_USABLE_SIZE 1

--- a/toku_include/memory.h
+++ b/toku_include/memory.h
@@ -18,17 +18,28 @@ void toku_memory_shutdown(void) __attribute__((destructor));

 /* Generally: errno is set to 0 or a value to indicate problems. */

-/* Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc() */
+// Everything should call toku_malloc() instead of malloc(), and toku_calloc() instead of calloc()
+// That way the tests can, e.g., replace the malloc function using toku_set_func_malloc().
 void *toku_calloc(size_t nmemb, size_t size)  __attribute__((__visibility__("default")));
 void *toku_xcalloc(size_t nmemb, size_t size)  __attribute__((__visibility__("default")));
 void *toku_malloc(size_t size)  __attribute__((__visibility__("default")));
+void *toku_malloc_aligned(size_t alignment, size_t size)  __attribute__((__visibility__("default")));

 // xmalloc aborts instead of return NULL if we run out of memory
-void *toku_xmalloc(size_t size);
+void *toku_xmalloc(size_t size)  __attribute__((__visibility__("default")));
 void *toku_xrealloc(void*, size_t size) __attribute__((__visibility__("default")));
+void *toku_xmalloc_aligned(size_t alignment, size_t size) __attribute__((__visibility__("default")));
+// Effect: Perform an os_malloc_aligned(alignment, size), whose returned pointer is a multiple of ALIGNMENT.
+//  Fail with a resource_assert if the allocation fails (don't return an error code).
+//  If an xmalloc_aligned replacement function has been set then call it instead.
+// Requires: alignment is a power of two.

 void toku_free(void*) __attribute__((__visibility__("default")));
 void *toku_realloc(void *, size_t size)  __attribute__((__visibility__("default")));
+void *toku_realloc_aligned(size_t alignment, void *p, size_t size) __attribute__((__visibility__("default")));
+// Effect: Perform an os_realloc_aligned(alignment, p, size), whose returned pointer is a multiple of ALIGNMENT.
+//  If a realloc_aligned replacement function has been set then call it instead.
+// Requires: alignment is a power of two.

 size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")));

@@ -50,6 +61,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
 * to make an array of 5 integers.
 */
 #define MALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_malloc((n)*sizeof(*v)))
+#define MALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_malloc_aligned((align), (n)*sizeof(*v)))
+

 //CALLOC_N is like calloc with auto-figuring out size of members
 #define CALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_calloc((n), sizeof(*v)))
@@ -57,6 +70,7 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
 #define CALLOC(v) CALLOC_N(1,v)

 #define REALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_realloc(v, (n)*sizeof(*v)))
+#define REALLOC_N_ALIGNED(align, n,v) CAST_FROM_VOIDP(v, toku_realloc_aligned((align), v, (n)*sizeof(*v)))

 // XMALLOC macros are like MALLOC except they abort if the operation fails
 #define XMALLOC(v) CAST_FROM_VOIDP(v, toku_xmalloc(sizeof(*v)))
@@ -66,6 +80,8 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
 #define XREALLOC(v,s) CAST_FROM_VOIDP(v, toku_xrealloc(v, s))
 #define XREALLOC_N(n,v) CAST_FROM_VOIDP(v, toku_xrealloc(v, (n)*sizeof(*v)))

+#define XMALLOC_N_ALIGNED(align, n, v) CAST_FROM_VOIDP(v, toku_xmalloc_aligned((align), (n)*sizeof(*v)))
+
 #define XMEMDUP(dst, src) CAST_FROM_VOIDP(dst, toku_xmemdup(src, sizeof(*src)))
 #define XMEMDUP_N(dst, src, len) CAST_FROM_VOIDP(dst, toku_xmemdup(src, len))

@@ -94,6 +110,8 @@ void toku_do_memory_check(void);
 typedef void *(*malloc_fun_t)(size_t);
 typedef void  (*free_fun_t)(void*);
 typedef void *(*realloc_fun_t)(void*,size_t);
+typedef void *(*malloc_aligned_fun_t)(size_t /*alignment*/, size_t /*size*/);
+typedef void *(*realloc_aligned_fun_t)(size_t /*alignment*/, void */*pointer*/, size_t /*size*/);

 void toku_set_func_malloc(malloc_fun_t f);
 void toku_set_func_xmalloc_only(malloc_fun_t f);

--- a/toku_include/toku_portability.h
+++ b/toku_include/toku_portability.h
@@ -247,8 +247,26 @@ extern void *realloc(void*, size_t)            __THROW __attribute__((__deprecat
 #endif

 void *os_malloc(size_t) __attribute__((__visibility__("default")));
+// Effect: See man malloc(3)
+
+void *os_malloc_aligned(size_t /*alignment*/, size_t /*size*/) __attribute__((__visibility__("default")));
+// Effect: Perform a malloc(size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+// Requires: alignment is a power of two.
+
+
 void *os_realloc(void*,size_t) __attribute__((__visibility__("default")));
+// Effect: See man realloc(3)
+
+void *os_realloc_aligned(size_t/*alignment*/, void*,size_t) __attribute__((__visibility__("default")));
+// Effect: Perform a realloc(p, size) with the additional property that the returned pointer is a multiple of ALIGNMENT.
+// Requires: alignment is a power of two.
+
 void os_free(void*) __attribute__((__visibility__("default")));
+// Effect: See man free(3)
+
+size_t os_malloc_usable_size(const void *p) __attribute__((__visibility__("default")));
+// Effect: Return an estimate of the usable size of the block that p points to.  If this function is not defined then
+//  memory.cc will look for the jemalloc, libc, or darwin versions of the function for computing memory footprint.

 // full_pwrite and full_write performs a pwrite, and checks errors.  It doesn't return unless all the data was written. */
 void toku_os_full_pwrite (int fd, const void *buf, size_t len, toku_off_t off) __attribute__((__visibility__("default")));
@@ -262,6 +280,7 @@ int toku_os_write (int fd, const void *buf, size_t len) __attribute__((__visibil
 FILE * toku_os_fdopen(int fildes, const char *mode);    
 FILE * toku_os_fopen(const char *filename, const char *mode);
 int toku_os_open(const char *path, int oflag, int mode);
+int toku_os_open_direct(const char *path, int oflag, int mode);
 int toku_os_close(int fd);
 int toku_os_fclose(FILE * stream);
 ssize_t toku_os_read(int fd, void *buf, size_t count);
@@ -293,4 +312,17 @@ void toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t));
 int toku_portability_init(void);
 void toku_portability_destroy(void);

+static inline uint64_t roundup_to_multiple(uint64_t alignment, uint64_t v)
+// Effect: Return X, where X is the smallest multiple of ALIGNMENT such that X>=V.
+// Requires: ALIGNMENT is a power of two (and therefore nonzero), and X is representable in 64 bits.
+{
+    assert(alignment!=0 && 0==(alignment&(alignment-1)));  // alignment must be a power of two (zero passes the bit test but is not one)
+    uint64_t result = (v+alignment-1)&~(alignment-1);
+    assert(result>=v);                     // The result is >=V (this also catches wrap-around of v+alignment-1).
+    assert(result%alignment==0);           // The result is a multiple of alignment.
+    // The result is the smallest such multiple of alignment.  Compare the difference rather than
+    // result<v+alignment: that sum wraps for V within ALIGNMENT of 2^64 and would reject valid inputs.
+    assert(result-v<alignment);
+    return result;
+}
+    
+
 #endif /* TOKU_PORTABILITY_H */