Addresses #1463

Abstracted out the block translation table and the block allocator into BLOCK_TABLE. All use is done through accessors surrounded by locks. git-svn-id: file:///svn/toku/tokudb@9360 c7de825b-a66e-492c-adef-691d508d4ae1

Addresses #1463
Abstracted out the block translation table and the block allocator into BLOCK_TABLE. All use is done through accessors surrounded by locks. git-svn-id: file:///svn/toku/tokudb@9360 c7de825b-a66e-492c-adef-691d508d4ae1
d33db3f8 · Yoni Fogel · 7cbba1ab · d33db3f8 · d33db3f8 · d33db3f8
Commit d33db3f8 authored Jan 31, 2009 by Yoni Fogel
13 changed files
--- a/newbrt/Makefile
+++ b/newbrt/Makefile
@@ -37,6 +37,7 @@ build default: bins libs $(TEST_NEWBRT)

 BRT_SOURCES = \
  block_allocator \
+  block_table \
  bread \
  brt-serialize \
  brt-verify \

--- a/newbrt/block_allocator.h
+++ b/newbrt/block_allocator.h
@@ -6,6 +6,12 @@

 #include "brttypes.h"

+#define BLOCK_ALLOCATOR_ALIGNMENT 4096
+// How much must be reserved at the beginning for the block?
+//  The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
+//  So 4096 should be enough.
+#define BLOCK_ALLOCATOR_HEADER_RESERVE 4096
+
 // A block allocator manages the allocation of variable-sized blocks.
 // The translation of block numbers to addresses is handled elsewhere.
 // The allocation of block numbers is handled elsewhere.

--- a/newbrt/block_table.c
+++ b/newbrt/block_table.c
+//TODO: What about h->block_translation_size_on_disk
+//TODO: What about h->block_translation_address_on_disk
+//TODO: What about h->block_allocator
+
+#include "toku_portability.h"
+#include "brttypes.h"
+#include "block_table.h"
+#include "memory.h"
+#include "toku_assert.h"
+#include "toku_pthread.h"
+#include "block_allocator.h"
+#include "rbuf.h"
+#include "wbuf.h"
+
+struct block_table {
+    // In-memory state of the block translation table (block number -> disk offset/size)
+    // together with the block allocator that hands out disk space.
+    // (Remnants of an older design, kept for reference:)
+    //int n_blocks, n_blocks_array_size;
+    //struct block_descriptor *blocks;
+    BLOCKNUM free_blocks; // head of the free list of block numbers.  -1 indicates that there are no free blocks
+    BLOCKNUM unused_blocks; // first block number that has never been handed out
+
+    u_int64_t translated_blocknum_limit;              // number of entries in block_translation (valid indices are 0..limit-1)
+    struct block_translation_pair *block_translation; // the translation array; may be NULL when the limit is 0
+
+    // Where and how big is the block translation vector stored on disk.
+    // The size of the on-disk buffer may no longer match translated_blocknum_limit, since blocks may have been allocated or freed.
+    // We need to remember this old information so we can free it properly.
+    u_int64_t block_translation_size_on_disk;    // size of the on-disk translation block: a 4-byte checksum plus sizeof(struct block_translation_pair) per entry
+    u_int64_t block_translation_address_on_disk; // 0 if there is no space allocated on disk for the translation
+    
+    // The in-memory data structure for block allocation
+    BLOCK_ALLOCATOR block_allocator;
+};
+
+// Sentinels stored in the translation table for block numbers on the free list.
+// A free entry has size==size_is_free and uses diskoff as the "next free block number" link.
+static const DISKOFF diskoff_is_null = (DISKOFF)-1; // in a freelist, this indicates end of list
+static const DISKOFF size_is_free = (DISKOFF)-1;    // marks a translation entry as belonging to the free list
+
+static void
+extend_block_translation(BLOCK_TABLE bt, BLOCKNUM blocknum)
+// Effect: Make sure the translation table has a slot for blocknum.
+//  If the table is too short it is grown to blocknum+1 entries and every
+//  newly created slot gets diskoff=0 and size=0.
+{
+    assert(0<=blocknum.b);
+    // Nothing to do when the table already covers this block number.
+    if ((u_int64_t)blocknum.b < bt->translated_blocknum_limit) return;
+    // A missing array is only legal while the table is empty.
+    if (bt->block_translation == 0) assert(bt->translated_blocknum_limit==0);
+    u_int64_t grown_limit = blocknum.b + 1;
+    u_int64_t first_new   = bt->translated_blocknum_limit;
+    XREALLOC_N(grown_limit, bt->block_translation);
+    u_int64_t idx = first_new;
+    while (idx < grown_limit) {
+        bt->block_translation[idx].diskoff = 0;
+        bt->block_translation[idx].size    = 0;
+        idx++;
+    }
+    bt->translated_blocknum_limit = grown_limit;
+}
+
+static inline void
+verify(BLOCK_TABLE bt, BLOCKNUM b) {
+    // Assert that b indexes an existing translation entry:
+    // 0 <= b.b < translated_blocknum_limit (the limit itself is excluded).
+    assert(b.b >= 0);
+    assert(bt->translated_blocknum_limit > (u_int64_t)b.b);
+}
+
+// A single file-scope mutex guards every BLOCK_TABLE; the lock helpers below take no table argument.
+static toku_pthread_mutex_t blocktable_mutex = TOKU_PTHREAD_MUTEX_INITIALIZER;
+// Set to 1 right after the mutex is taken and cleared right before it is released.
+// The *_unlocked entry points assert on it to check that the caller holds the lock.
+static int blocktable_is_locked=0;
+
+// Initialize the global blocktable mutex with default attributes; asserts on failure.
+void toku_blocktable_lock_init(void) {
+    int rc = toku_pthread_mutex_init(&blocktable_mutex, NULL);
+    assert(rc == 0);
+}
+
+// Destroy the global blocktable mutex; asserts on failure.
+void toku_blocktable_lock_destroy(void) {
+    int rc = toku_pthread_mutex_destroy(&blocktable_mutex);
+    assert(rc == 0);
+}
+
+static inline void
+lock_for_blocktable (void) {
+    // Take the global blocktable mutex, then record that it is held
+    // so the *_unlocked entry points can assert on it.
+    int rc = toku_pthread_mutex_lock(&blocktable_mutex);
+    assert(rc==0);
+    blocktable_is_locked = 1;
+}
+
+static inline void
+unlock_for_blocktable (void) {
+    // Clear the "held" flag while we still own the mutex, then release it.
+    blocktable_is_locked = 0;
+    int rc = toku_pthread_mutex_unlock(&blocktable_mutex);
+    assert(rc==0);
+}
+
+// Return the disk block starting at `offset` to the block allocator.
+// No locking is done here; the exported wrappers take the lock.
+static void
+block_free(BLOCK_TABLE bt, u_int64_t offset)
+{
+    block_allocator_free_block(bt->block_allocator, offset);
+}
+
+// Release the disk space currently mapped to block number b (if any)
+// and reset its translation entry to diskoff=0, size=0.
+static void
+block_free_blocknum(BLOCK_TABLE bt, BLOCKNUM b) {
+    verify(bt, b);
+    struct block_translation_pair *entry = &bt->block_translation[b.b];
+    // size <= 0 means no disk block is attached to this block number.
+    if (entry->size <= 0) return;
+    block_free(bt, entry->diskoff);
+    entry->diskoff = 0;
+    entry->size    = 0;
+}
+
+// Ask the block allocator for `size` bytes of disk space; the chosen offset is stored in *offset.
+// No locking is done here; the exported wrappers take the lock.
+static void
+block_alloc(BLOCK_TABLE bt, u_int64_t size, u_int64_t *offset)
+{
+    block_allocator_alloc_block(bt->block_allocator, size, offset);
+}
+
+// Allocate `size` bytes of disk space for block number b and record the
+// resulting (offset, size) pair in b's translation entry.  *offset receives the offset.
+static void
+block_alloc_and_set_translation(BLOCK_TABLE bt, BLOCKNUM b, u_int64_t size, u_int64_t *offset) {
+    verify(bt, b);
+    block_alloc(bt, size, offset);
+    struct block_translation_pair *entry = &bt->block_translation[b.b];
+    entry->diskoff = *offset;
+    entry->size    = size;
+}
+
+// Locked wrapper: allocate `size` bytes of disk space, returning the offset in *offset.
+// The translation table itself is not modified.
+void
+toku_block_alloc(BLOCK_TABLE bt, u_int64_t size, u_int64_t *offset)
+{
+    lock_for_blocktable();
+    block_alloc(bt, size, offset);
+    unlock_for_blocktable();
+}
+
+// Locked wrapper: give the disk block starting at `offset` back to the allocator.
+// The translation table itself is not modified.
+void
+toku_block_free(BLOCK_TABLE bt, u_int64_t offset)
+{
+    lock_for_blocktable();
+    block_free(bt, offset);
+    unlock_for_blocktable();
+}
+
+// Recompute block_translation_size_on_disk from the current table length:
+// one translation pair per entry plus a trailing 4-byte checksum.
+static void
+update_size_on_disk(BLOCK_TABLE bt) {
+    const u_int64_t checksum_bytes = 4;
+    u_int64_t entry_bytes = bt->translated_blocknum_limit*sizeof(bt->block_translation[0]);
+    bt->block_translation_size_on_disk = checksum_bytes + entry_bytes;
+}
+
+// Give block number b a fresh disk location of `size` bytes, under the lock:
+//  1. grow the translation table so that b has an entry,
+//  2. release whatever disk space b was using,
+//  3. allocate the new space and record it; *offset receives the new offset.
+void
+toku_block_realloc(BLOCK_TABLE bt, BLOCKNUM b, u_int64_t size, u_int64_t *offset)
+{
+    lock_for_blocktable();
+    extend_block_translation(bt, b);
+    block_free_blocknum(bt, b);
+    block_alloc_and_set_translation(bt, b, size, offset);
+    unlock_for_blocktable();
+}
+
+// Take the blocktable lock on behalf of a caller that is about to issue
+// a sequence of *_unlocked operations.
+void
+toku_block_lock_for_multiple_operations(void)
+{
+    lock_for_blocktable();
+}
+
+// Release the lock taken by toku_block_lock_for_multiple_operations().
+// Asserts that the lock is actually held.
+void
+toku_block_unlock_for_multiple_operations(void)
+{
+    assert(blocktable_is_locked);
+    unlock_for_blocktable();
+}
+
+
+// Move the on-disk home of the translation table: free the old disk block (if one
+// was allocated), recompute the table's serialized size, and allocate a new block
+// of that size.  The caller must already hold the blocktable lock.
+void
+toku_block_realloc_translation_unlocked(BLOCK_TABLE bt)
+{
+    assert(blocktable_is_locked);
+    u_int64_t old_address = bt->block_translation_address_on_disk;
+    if (old_address != 0) {
+        block_allocator_free_block(bt->block_allocator, old_address);
+    }
+    update_size_on_disk(bt);
+    block_allocator_alloc_block(bt->block_allocator,
+                                bt->block_translation_size_on_disk,
+                                &bt->block_translation_address_on_disk);
+}
+
+// Append the free-list head (free_blocks) to wbuf.  Caller must hold the blocktable lock.
+void
+toku_block_wbuf_free_blocks_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf)
+{
+    assert(blocktable_is_locked);
+    BLOCKNUM head = bt->free_blocks;
+    wbuf_BLOCKNUM(wbuf, head);
+}
+
+// Append the first-unused block number (unused_blocks) to wbuf.  Caller must hold the blocktable lock.
+void
+toku_block_wbuf_unused_blocks_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf)
+{
+    assert(blocktable_is_locked);
+    BLOCKNUM first_unused = bt->unused_blocks;
+    wbuf_BLOCKNUM(wbuf, first_unused);
+}
+
+// Append the translation table length (translated_blocknum_limit) to wbuf.
+// Caller must hold the blocktable lock.
+void
+toku_block_wbuf_translated_blocknum_limit_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf)
+{
+    assert(blocktable_is_locked);
+    u_int64_t limit = bt->translated_blocknum_limit;
+    wbuf_ulonglong(wbuf, limit);
+}
+
+// Append the disk address of the serialized translation table to wbuf.
+// Caller must hold the blocktable lock.
+void
+toku_block_wbuf_block_translation_address_on_disk_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf)
+{
+    assert(blocktable_is_locked);
+    u_int64_t address = bt->block_translation_address_on_disk;
+    wbuf_DISKOFF(wbuf, address);
+}
+
+// Serialize the translation table into a freshly malloc'ed write buffer.
+//  w        initialized here; w->buf is allocated with toku_malloc and ownership passes to the caller
+//  *size    receives the serialized size (entries plus 4-byte checksum)
+//  *address receives the disk address where the table is to be written
+// Layout: for each entry, diskoff then size (each written with wbuf_ulonglong), followed by the x1764 checksum.
+// Caller must hold the blocktable lock.
+void
+toku_block_wbuf_init_and_fill_unlocked(BLOCK_TABLE bt, struct wbuf *w,
+        u_int64_t *size, u_int64_t *address) {
+    assert(blocktable_is_locked);
+    update_size_on_disk(bt);
+    u_int64_t nbytes = bt->block_translation_size_on_disk;
+    wbuf_init(w, toku_malloc(nbytes), nbytes);
+    assert(w->size==nbytes);
+    u_int64_t idx = 0;
+    while (idx < bt->translated_blocknum_limit) {
+        const struct block_translation_pair *entry = &bt->block_translation[idx];
+        wbuf_ulonglong(w, entry->diskoff);
+        wbuf_ulonglong(w, entry->size);
+        idx++;
+    }
+    // The checksum covers everything written so far and is stored last.
+    u_int32_t sum = x1764_finish(&w->checksum);
+    wbuf_int(w, sum);
+    *size    = nbytes;
+    *address = bt->block_translation_address_on_disk;
+}
+
+// Return the disk offset recorded for block number b.
+// Asserts that b is within the translation table.  Takes the blocktable lock.
+DISKOFF
+toku_block_get_offset(BLOCK_TABLE bt, BLOCKNUM b)
+{
+    DISKOFF offset;
+    lock_for_blocktable();
+    verify(bt, b);
+    offset = bt->block_translation[b.b].diskoff;
+    unlock_for_blocktable();
+    return offset;
+}
+
+// Return the size recorded for block number b.
+// Asserts that b is within the translation table.  Takes the blocktable lock.
+DISKOFF
+toku_block_get_size(BLOCK_TABLE bt, BLOCKNUM b)
+{
+    DISKOFF size;
+    lock_for_blocktable();
+    verify(bt, b);
+    size = bt->block_translation[b.b].size;
+    unlock_for_blocktable();
+    return size;
+}
+
+// Hand out a block number.
+//  If the free list is empty, the next never-used block number is taken
+//  (and unused_blocks is advanced); otherwise the head of the free list is popped.
+//  *res   receives the block number (asserted to be > 0)
+//  *dirty is set to 1 because the table changed
+//  logger is unused
+// Always returns 0.  Takes the blocktable lock.
+int
+toku_allocate_diskblocknumber(BLOCK_TABLE bt, BLOCKNUM *res, int *dirty, TOKULOGGER UU(logger)) {
+    lock_for_blocktable();
+    BLOCKNUM result;
+    if (bt->free_blocks.b == diskoff_is_null) {
+        // no blocks in the free list
+        result = bt->unused_blocks;
+        bt->unused_blocks.b++;
+    } else {
+        result = bt->free_blocks;
+        // The free-list head must index an existing entry before we touch it.
+        verify(bt, result);
+        // Was `size = size_is_free` (an assignment), which made this check always pass.
+        assert(bt->block_translation[result.b].size == size_is_free);
+        bt->block_translation[result.b].size = 0;
+        bt->free_blocks.b = bt->block_translation[result.b].diskoff; // pop the freelist
+    }
+    assert(result.b>0);
+    *res = result;
+    *dirty = 1;
+    unlock_for_blocktable();
+    return 0;
+}
+////CONVERTED above already
+//TODO: Convert below
+
+
+int
+toku_free_diskblocknumber(BLOCK_TABLE bt, BLOCKNUM *b, int *dirty, TOKULOGGER UU(logger))
+// Effect: Free a diskblock number and push it onto the free list.
+//  Any disk space attached to the block is returned to the allocator.
+//  Watch out for the case where the disk block was never yet written to disk:
+//  the table is extended first so that the block number always has an entry.
+//  On return b->b is 0 and *dirty is 1.  Always returns 0.
+{
+    lock_for_blocktable();
+    extend_block_translation(bt, *b);
+    verify(bt, *b);
+    struct block_translation_pair *entry = &bt->block_translation[b->b];
+    // A size <= 0 means no disk block is allocated for this block number.
+    if (entry->size > 0) {
+        block_allocator_free_block(bt->block_allocator, entry->diskoff);
+    }
+    // Freeing a block number twice is a bug.
+    assert(entry->size != size_is_free);
+    // Link the entry in at the head of the free list.
+    entry->size    = size_is_free;
+    entry->diskoff = bt->free_blocks.b;
+    bt->free_blocks.b = b->b;
+    b->b = 0;
+    *dirty = 1;
+    unlock_for_blocktable();
+    return 0;
+}
+
+//Verify there are no free blocks (the free list is empty).
+// Takes the blocktable lock like the other accessors, so it must not be called
+// between toku_block_lock_for_multiple_operations() and the matching unlock.
+void
+toku_block_verify_no_free_blocks(BLOCK_TABLE bt) {
+    lock_for_blocktable();
+    assert(bt->free_blocks.b==diskoff_is_null);
+    unlock_for_blocktable();
+}
+
+//Verify a block has been allocated at least once,
+// i.e. that its number lies in [0, unused_blocks).  Takes the blocktable lock.
+void
+toku_verify_diskblocknumber_allocated(BLOCK_TABLE bt, BLOCKNUM b)
+{
+    lock_for_blocktable();
+    assert(b.b >= 0);
+    assert(bt->unused_blocks.b > b.b);
+    unlock_for_blocktable();
+}
+
+// Locked wrapper around block_allocator_allocated_limit() for this table's allocator.
+u_int64_t toku_block_allocator_allocated_limit(BLOCK_TABLE bt) {
+    u_int64_t limit;
+    lock_for_blocktable();
+    limit = block_allocator_allocated_limit(bt->block_allocator);
+    unlock_for_blocktable();
+    return limit;
+}
+
+// Print every translation entry to f on one line, as "index: diskoff size" triples,
+// prefixed with "Block translation:".  Takes the blocktable lock.
+void
+toku_block_dump_translation_table(FILE *f, BLOCK_TABLE bt)
+{
+    lock_for_blocktable();
+    fprintf(f, "Block translation:");
+    u_int64_t idx = 0;
+    while (idx < bt->translated_blocknum_limit) {
+        fprintf(f, " %"PRIu64": %"PRId64" %"PRId64"", idx, bt->block_translation[idx].diskoff, bt->block_translation[idx].size);
+        idx++;
+    }
+    fprintf(f, "\n");
+    unlock_for_blocktable();
+}
+
+// Print one translation entry to stdout as "index: diskoff size".
+// `offset` is used as an index into the translation table; indices at or
+// beyond the table limit print nothing.  Takes the blocktable lock.
+void
+toku_block_dump_translation(BLOCK_TABLE bt, u_int64_t offset)
+{
+    lock_for_blocktable();
+    if (offset < bt->translated_blocknum_limit) {
+        DISKOFF where    = bt->block_translation[offset].diskoff;
+        DISKOFF how_big  = bt->block_translation[offset].size;
+        printf("%" PRIu64 ": %" PRId64 " %" PRId64 "\n", offset, where, how_big);
+    }
+    unlock_for_blocktable();
+}
+
+// Overwrite the first-unused block number with newunused, under the blocktable lock.
+void
+toku_block_recovery_set_unused_blocks(BLOCK_TABLE bt, BLOCKNUM newunused)
+{
+    lock_for_blocktable();
+    bt->unused_blocks.b = newunused.b;
+    unlock_for_blocktable();
+}
+
+// Overwrite the free-list head with newfree, under the blocktable lock.
+void
+toku_block_recovery_set_free_blocks(BLOCK_TABLE bt, BLOCKNUM newfree)
+{
+    lock_for_blocktable();
+    bt->free_blocks.b = newfree.b;
+    unlock_for_blocktable();
+}
+
+// Copy the first n bytes of the in-memory translation array into p.
+//  n must not exceed the size of the array (translated_blocknum_limit entries);
+//  this is asserted so that an oversized request cannot read past the allocation.
+//  p must have room for n bytes.  Takes the blocktable lock.
+void
+toku_block_memcpy_translation_table(BLOCK_TABLE bt, size_t n, void *p) {
+    lock_for_blocktable();
+    assert(n <= bt->translated_blocknum_limit*sizeof(bt->block_translation[0]));
+    // Skip the call for an empty copy: the array may be NULL, which memcpy does not accept.
+    if (n > 0) {
+        memcpy(p, bt->block_translation, n);
+    }
+    unlock_for_blocktable();
+}
+
+// Return the number of entries in the translation table, read under the blocktable lock.
+u_int64_t
+toku_block_get_translated_blocknum_limit(BLOCK_TABLE bt)
+{
+    u_int64_t limit;
+    lock_for_blocktable();
+    limit = bt->translated_blocknum_limit;
+    unlock_for_blocktable();
+    return limit;
+}
+
+// Return the head of the free list, read under the blocktable lock.
+BLOCKNUM
+toku_block_get_free_blocks(BLOCK_TABLE bt)
+{
+    BLOCKNUM head;
+    lock_for_blocktable();
+    head = bt->free_blocks;
+    unlock_for_blocktable();
+    return head;
+}
+
+// Return the first never-used block number, read under the blocktable lock.
+BLOCKNUM
+toku_block_get_unused_blocks(BLOCK_TABLE bt)
+{
+    BLOCKNUM first_unused;
+    lock_for_blocktable();
+    first_unused = bt->unused_blocks;
+    unlock_for_blocktable();
+    return first_unused;
+}
+
+// Tear down a block table created by toku_blocktable_create():
+// frees the translation array, destroys the block allocator, and frees the
+// table structure itself.  *btp is set to NULL.  Takes the blocktable lock.
+void
+toku_blocktable_destroy(BLOCK_TABLE *btp) {
+    lock_for_blocktable();
+    BLOCK_TABLE bt = *btp;
+    *btp = NULL;
+    toku_free(bt->block_translation);
+    bt->block_translation = NULL;
+    destroy_block_allocator(&bt->block_allocator);
+    // The struct was allocated with XMALLOC in toku_blocktable_create; without this it leaks.
+    toku_free(bt);
+    unlock_for_blocktable();
+}
+
+// Debug hook: replace the whole translation array with `table` (of `limit` entries).
+// The previous array, if any, is freed and the table takes over the `table` pointer.
+// The block allocator is not touched here.  Takes the blocktable lock.
+void
+toku_blocktable_debug_set_translation(BLOCK_TABLE bt,
+        u_int64_t limit,
+        struct block_translation_pair *table) {
+    lock_for_blocktable();
+    struct block_translation_pair *previous = bt->block_translation;
+    if (previous) toku_free(previous);
+    bt->block_translation = table;
+    bt->translated_blocknum_limit = limit;
+    unlock_for_blocktable();
+}
+
+// Build a block table from header fields and, optionally, a serialized translation table.
+//  btp                                receives the new table
+//  free_blocks, unused_blocks         initial free-list head and first unused block number
+//  translated_blocknum_limit          number of translation entries
+//  block_translation_address_on_disk  where the translation lives on disk; 0 means there is none
+//  block_translation_size_on_disk     must equal 4 + limit*sizeof(pair), or 0 when address is also 0 (asserted)
+//  buffer                             serialized entries (diskoff,size pairs); must be NULL when address is 0
+// When a translation is present, the disk space of the translation itself and of every
+// entry with size > 0 is registered with the block allocator.  Takes the blocktable lock.
+void
+toku_blocktable_create(BLOCK_TABLE *btp,
+        BLOCKNUM free_blocks,
+        BLOCKNUM unused_blocks,
+        u_int64_t translated_blocknum_limit,
+        u_int64_t block_translation_address_on_disk,
+        u_int64_t block_translation_size_on_disk,
+        unsigned char *buffer) {
+    lock_for_blocktable();
+    BLOCK_TABLE table;
+    XMALLOC(table);
+    table->free_blocks   = free_blocks;
+    table->unused_blocks = unused_blocks;
+    table->translated_blocknum_limit = translated_blocknum_limit;
+    table->block_translation_address_on_disk = block_translation_address_on_disk;
+    // Derive the expected serialized size and cross-check it against the caller's value.
+    // A brand-new table (no address, no size) is recorded as size 0.
+    update_size_on_disk(table);
+    if (block_translation_address_on_disk==0 && block_translation_size_on_disk == 0) {
+        table->block_translation_size_on_disk = 0;
+    }
+    assert(block_translation_size_on_disk==table->block_translation_size_on_disk);
+    // Set up the block allocator, then the translation array.
+    create_block_allocator(&table->block_allocator, BLOCK_ALLOCATOR_HEADER_RESERVE, BLOCK_ALLOCATOR_ALIGNMENT);
+    if (block_translation_address_on_disk != 0) {
+        XMALLOC_N(translated_blocknum_limit, table->block_translation);
+        // Mark where the translation table is stored on disk.
+        block_allocator_alloc_block_at(table->block_allocator, table->block_translation_size_on_disk, table->block_translation_address_on_disk);
+        // Load translations from the buffer.
+        struct rbuf reader;
+        reader.buf = buffer;
+        reader.ndone = 0;
+        reader.size = table->block_translation_size_on_disk-4; // 4==checksum
+        assert(reader.size>0);
+        u_int64_t idx = 0;
+        while (idx < table->translated_blocknum_limit) {
+            struct block_translation_pair *entry = &table->block_translation[idx];
+            entry->diskoff = rbuf_diskoff(&reader);
+            entry->size    = rbuf_diskoff(&reader);
+            // Only entries with a positive size own disk space.
+            if (entry->size > 0)
+                block_allocator_alloc_block_at(table->block_allocator, entry->size, entry->diskoff);
+            idx++;
+        }
+    }
+    else {
+        table->block_translation = NULL;
+        assert(buffer==NULL);
+    }
+    *btp = table;
+    unlock_for_blocktable();
+}
+
+// Create an empty block table: empty free list (-1), first unused block number 2,
+// no translation entries and nothing stored on disk.
+void
+toku_blocktable_create_new(BLOCK_TABLE *btp)
+{
+    BLOCKNUM no_free_blocks = make_blocknum(-1);
+    BLOCKNUM first_unused   = make_blocknum(2);
+    toku_blocktable_create(btp, no_free_blocks, first_unused, 0, 0, 0, NULL);
+}
+
--- a/newbrt/block_table.h
+++ b/newbrt/block_table.h
+/* -*- mode: C; c-basic-offset: 4 -*- */
+#ifndef BLOCKTABLE_H
+#define BLOCKTABLE_H
+#ident "Copyright (c) 2007, 2008 Tokutek Inc.  All rights reserved."
+
+// Opaque handle; the structure is defined in block_table.c.
+typedef struct block_table *BLOCK_TABLE;
+
+// One entry of the block translation table (block number -> location on disk).
+//Needed by tests, brtdump
+struct block_translation_pair {
+    DISKOFF diskoff; // Disk offset of the block.  When in free list, set to the next free block.  In this case it's really a BLOCKNUM.
+    DISKOFF size;    // Size of the block; 0 means nothing is allocated.  Set to 0xFFFFFFFFFFFFFFFF (i.e. -1) for free
+};
+
+void toku_blocktable_lock_init(void);
+void toku_blocktable_lock_destroy(void);
+
+void toku_block_realloc(BLOCK_TABLE bt, BLOCKNUM b, u_int64_t size, u_int64_t *offset);
+void toku_block_alloc(BLOCK_TABLE bt, u_int64_t size, u_int64_t *offset);
+void toku_block_free(BLOCK_TABLE bt, u_int64_t offset);
+DISKOFF toku_block_get_offset(BLOCK_TABLE bt, BLOCKNUM b);
+DISKOFF toku_block_get_size(BLOCK_TABLE bt, BLOCKNUM b);
+int toku_allocate_diskblocknumber(BLOCK_TABLE bt, BLOCKNUM *res, int *dirty, TOKULOGGER logger);
+int toku_free_diskblocknumber(BLOCK_TABLE bt, BLOCKNUM *b, int *dirty, TOKULOGGER logger);
+void toku_verify_diskblocknumber_allocated(BLOCK_TABLE bt, BLOCKNUM b);
+void toku_block_verify_no_free_blocks(BLOCK_TABLE bt);
+u_int64_t toku_block_allocator_allocated_limit(BLOCK_TABLE bt);
+void toku_block_dump_translation_table(FILE *f, BLOCK_TABLE bt);
+void toku_block_dump_translation(BLOCK_TABLE bt, u_int64_t offset);
+
+void toku_blocktable_destroy(BLOCK_TABLE *btp);
+void toku_blocktable_debug_set_translation(BLOCK_TABLE bt,
+        u_int64_t limit,
+        struct block_translation_pair *table);
+void toku_blocktable_create(BLOCK_TABLE *btp,
+        BLOCKNUM free_blocks,
+        BLOCKNUM unused_blocks,
+        u_int64_t translated_blocknum_limit,
+        u_int64_t block_translation_address_on_disk,
+        u_int64_t block_translation_size_on_disk,
+        unsigned char *buffer);
+void toku_blocktable_create_new(BLOCK_TABLE *bt);
+
+void toku_block_recovery_set_unused_blocks(BLOCK_TABLE bt, BLOCKNUM newunused);
+void toku_block_recovery_set_free_blocks(BLOCK_TABLE bt, BLOCKNUM newfree);
+BLOCKNUM toku_block_get_unused_blocks(BLOCK_TABLE bt);
+BLOCKNUM toku_block_get_free_blocks(BLOCK_TABLE bt);
+u_int64_t toku_block_get_translated_blocknum_limit(BLOCK_TABLE bt);
+
+void toku_block_memcpy_translation_table(BLOCK_TABLE bt, size_t n, void *p);
+
+//Unlocked/multi ops
+// The *_unlocked functions below do not take the blocktable lock themselves; they assert
+// that it is already held.  Bracket a sequence of them with
+// toku_block_lock_for_multiple_operations() / toku_block_unlock_for_multiple_operations().
+void toku_block_lock_for_multiple_operations(void);
+void toku_block_unlock_for_multiple_operations(void);
+
+// Frees the old on-disk translation block (if any) and allocates a new one of the current size.
+void toku_block_realloc_translation_unlocked(BLOCK_TABLE bt);
+// Each of these appends one field of the block table to wbuf.
+void toku_block_wbuf_free_blocks_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf);
+void toku_block_wbuf_unused_blocks_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf);
+void toku_block_wbuf_translated_blocknum_limit_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf);
+void toku_block_wbuf_block_translation_address_on_disk_unlocked(BLOCK_TABLE bt, struct wbuf *wbuf);
+// Serializes the translation table into a newly allocated buffer in w; the caller frees w->buf.
+// *size and *address receive the serialized size and the disk address to write it to.
+void toku_block_wbuf_init_and_fill_unlocked(BLOCK_TABLE bt, struct wbuf *w,
+                                            u_int64_t *size, u_int64_t *address);
+
+
+
+#endif
+
--- a/newbrt/brt-internal.h
+++ b/newbrt/brt-internal.h
@@ -14,6 +14,7 @@
 typedef void *OMTVALUE;
 #include "omt.h"
 #include "leafentry.h"
+#include "block_table.h"

 #ifndef BRT_FANOUT
 #define BRT_FANOUT 16
@@ -113,11 +114,6 @@ struct remembered_hash {
    u_int32_t fullhash; // fullhash is the hashed value of fnum and root.
 };

-struct block_translation_pair {
-    DISKOFF diskoff; // When in free list, set to the next free block.  In this case it's really a BLOCKNUM.
-    DISKOFF size;    // set to 0xFFFFFFFFFFFFFFFF for free
-};
-
 // The brt_header is not managed by the cachetable.  Instead, it hangs off the cachefile as userdata.

 struct brt_header {
@@ -137,23 +133,7 @@ struct brt_header {

    u_int64_t root_put_counter; // the generation number of the brt

-    // This is the map from block numbers to offsets
-    //int n_blocks, n_blocks_array_size;
-    //struct block_descriptor *blocks;
-    BLOCKNUM free_blocks; // free list for blocks.  Use -1 to indicate that there are no free blocks
-    BLOCKNUM unused_blocks; // first unused block
-
-    u_int64_t translated_blocknum_limit;
-    struct block_translation_pair *block_translation;
-
-    // Where and how big is the block translation vector stored on disk.
-    // The size of the on_disk buffer may no longer match the max_blocknum_translated field, since blocks may have been allocated or freed.
-    // We need to remember this old information so we can free it properly.
-    u_int64_t block_translation_size_on_disk;    // the size of the block containing the translation (i.e. 8 times the number of entries)
-    u_int64_t block_translation_address_on_disk; // 0 if there is no memory allocated
-    
-    // The in-memory data structure  for block allocation
-    BLOCK_ALLOCATOR block_allocator;
+    BLOCK_TABLE blocktable;
 };

 struct brt {
@@ -292,12 +272,6 @@ void toku_brtheader_free (struct brt_header *h);
 int toku_brtheader_close (CACHEFILE cachefile, void *header_v, char **error_string);
 int toku_brtheader_checkpoint (CACHEFILE cachefile, void *header_v);

-#define BLOCK_ALLOCATOR_ALIGNMENT 4096
-// How much must be reserved at the beginning for the block?
-//  The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
-//  So 4096 should be enough.
-#define BLOCK_ALLOCATOR_HEADER_RESERVE 4096
-
 int toku_db_badformat(void);

 #endif
--- a/newbrt/brt-serialize.c
+++ b/newbrt/brt-serialize.c
@@ -353,24 +353,17 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
    int r;
    {
 	lock_for_pwrite();
+//TODO: #1463 START (might not be the entire range
 	// If the node has never been written, then write the whole buffer, including the zeros
 	assert(blocknum.b>=0);
 	//printf("%s:%d h=%p\n", __FILE__, __LINE__, h);
 	//printf("%s:%d translated_blocknum_limit=%lu blocknum.b=%lu\n", __FILE__, __LINE__, h->translated_blocknum_limit, blocknum.b);
 	//printf("%s:%d allocator=%p\n", __FILE__, __LINE__, h->block_allocator);
 	//printf("%s:%d bt=%p\n", __FILE__, __LINE__, h->block_translation);
-	extend_block_translation(blocknum, h);
-	if (h->block_translation[blocknum.b].size > 0) {
-	    block_allocator_free_block(h->block_allocator, h->block_translation[blocknum.b].diskoff);
-	    h->block_translation[blocknum.b].diskoff = 0;
-	    h->block_translation[blocknum.b].size    = 0;
-	}
 	h->dirty = 1; // Allocating a block dirties the header.
 	size_t n_to_write = uncompressed_magic_len + compression_header_len + compressed_len;
 	u_int64_t offset;
-	block_allocator_alloc_block(h->block_allocator, n_to_write, &offset);
-	h->block_translation[blocknum.b].diskoff = offset;
-	h->block_translation[blocknum.b].size = n_to_write;
+        toku_block_realloc(h->blocktable, blocknum, n_to_write, &offset);
 	ssize_t n_wrote;
 	r=toku_pwrite_extend(fd, compressed_buf, n_to_write, offset, &n_wrote);
 	if (r) {
@@ -378,6 +371,7 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
 	} else {
 	    r=0;
 	}
+//TODO: #1463 END
 	unlock_for_pwrite();
    }

@@ -391,8 +385,7 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
 int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *brtnode, struct brt_header *h) {
    if (0) printf("Deserializing Block %" PRId64 "\n", blocknum.b);
    if (h->panic) return h->panic;
-    assert(0 <= blocknum.b && (u_int64_t)blocknum.b < h->translated_blocknum_limit);
-    DISKOFF offset = h->block_translation[blocknum.b].diskoff;
+    DISKOFF offset = toku_block_get_offset(h->blocktable, blocknum);
    TAGMALLOC(BRTNODE, result);
    struct rbuf rc;
    int i;
@@ -714,16 +707,19 @@ int toku_serialize_brt_header_to_wbuf (struct wbuf *wbuf, struct brt_header *h)
    wbuf_int    (wbuf, size);
    wbuf_int    (wbuf, BRT_LAYOUT_VERSION);
    wbuf_int    (wbuf, h->nodesize);
-    wbuf_BLOCKNUM(wbuf, h->free_blocks);
-    wbuf_BLOCKNUM(wbuf, h->unused_blocks);
+    //TODO: Use 'prelocked/unlocked' versions to make this atomic
+//TODO: #1463 START
+
+    toku_block_realloc_translation_unlocked(h->blocktable);
+    toku_block_wbuf_free_blocks_unlocked(h->blocktable, wbuf);
+    toku_block_wbuf_unused_blocks_unlocked(h->blocktable, wbuf);
+//TODO: #1463 END
    wbuf_int    (wbuf, h->n_named_roots);
-    if (h->block_translation_address_on_disk != 0) {
-	block_allocator_free_block(h->block_allocator, h->block_translation_address_on_disk);
-    }
-    block_allocator_alloc_block(h->block_allocator, 4 + 16*h->translated_blocknum_limit, &h->block_translation_address_on_disk);
+//TODO: #1463 START
    //printf("%s:%d bta=%lu size=%lu\n", __FILE__, __LINE__, h->block_translation_address_on_disk, 4 + 16*h->translated_blocknum_limit);
-    wbuf_ulonglong(wbuf, h->translated_blocknum_limit);
-    wbuf_DISKOFF(wbuf, h->block_translation_address_on_disk);
+    toku_block_wbuf_translated_blocknum_limit_unlocked(h->blocktable, wbuf);
+    toku_block_wbuf_block_translation_address_on_disk_unlocked(h->blocktable, wbuf);
+//TODO: #1463 END
    if (h->n_named_roots>=0) {
 	int i;
 	for (i=0; i<h->n_named_roots; i++) {
@@ -746,18 +742,31 @@ int toku_serialize_brt_header_to (int fd, struct brt_header *h) {
    int rr = 0;
    if (h->panic) return h->panic;
    lock_for_pwrite();
+    toku_block_lock_for_multiple_operations();
+    struct wbuf w_main;
+    unsigned int size_main = toku_serialize_brt_header_size (h);
    {
-	struct wbuf w;
-	unsigned int size = toku_serialize_brt_header_size (h);
-	wbuf_init(&w, toku_malloc(size), size);
+	wbuf_init(&w_main, toku_malloc(size_main), size_main);
 	{
-	    int r=toku_serialize_brt_header_to_wbuf(&w, h);
+	    int r=toku_serialize_brt_header_to_wbuf(&w_main, h);
 	    assert(r==0);
 	}
-	assert(w.ndone==size);
+	assert(w_main.ndone==size_main);
+    }
+    struct wbuf w_translation;
+    u_int64_t size_translation;
+    u_int64_t address_translation;
+    {
+        toku_block_wbuf_init_and_fill_unlocked(h->blocktable, &w_translation,
+                                               &size_translation, &address_translation);
+        size_translation = w_translation.size;
+    }
+    toku_block_unlock_for_multiple_operations();
+    {
+        //Actual Write main header
 	ssize_t nwrote;
-	rr = toku_pwrite_extend(fd, w.buf, w.ndone, 0, &nwrote);
-	toku_free(w.buf);
+	rr = toku_pwrite_extend(fd, w_main.buf, w_main.ndone, 0, &nwrote);
+	toku_free(w_main.buf);
 	if (rr) {
 	    if (h->panic==0) {
 		char s[200];
@@ -767,31 +776,21 @@ int toku_serialize_brt_header_to (int fd, struct brt_header *h) {
 	    }
 	    goto finish;
 	}
-	assert((u_int64_t)nwrote==size);
+	assert((u_int64_t)nwrote==size_main);
    }
    {
-	struct wbuf w;
-	u_int64_t size = 4 + h->translated_blocknum_limit * 16; // 4 for the checksum
-	//printf("%s:%d writing translation table of size %ld at %ld\n", __FILE__, __LINE__, size, h->block_translation_address_on_disk);
-	wbuf_init(&w, toku_malloc(size), size);
-	u_int64_t i;
-	for (i=0; i<h->translated_blocknum_limit; i++) {
-	    //printf("%s:%d %ld,%ld\n", __FILE__, __LINE__, h->block_translation[i].diskoff, h->block_translation[i].size);
-	    wbuf_ulonglong(&w, h->block_translation[i].diskoff);
-	    wbuf_ulonglong(&w, h->block_translation[i].size);
-	}
-	u_int32_t checksum = x1764_finish(&w.checksum);
-	wbuf_int(&w, checksum);
+        //Actual Write translation table
 	ssize_t nwrote;
-	rr = toku_pwrite_extend(fd, w.buf, size, h->block_translation_address_on_disk, &nwrote);
-	toku_free(w.buf);
+	rr = toku_pwrite_extend(fd, w_translation.buf,
+                                size_translation, address_translation, &nwrote);
 	if (rr) {
 	    //fprintf(stderr, "%s:%d: Error writing data to file.  errno=%d (%s)\n", __FILE__, __LINE__, rr, strerror(rr));
 	    goto finish;
 	}
-	assert((u_int64_t)nwrote==size);
+	assert((u_int64_t)nwrote==size_translation);
    }
 finish:
+    toku_free(w_translation.buf);
    unlock_for_pwrite();
    return rr;
 }
@@ -820,49 +819,48 @@ deserialize_brtheader (u_int32_t size, int fd, DISKOFF off, struct brt_header **
    h->layout_version = rbuf_int(&rc);
    h->nodesize      = rbuf_int(&rc);
    assert(h->layout_version==BRT_LAYOUT_VERSION_9);
-    h->free_blocks   = rbuf_blocknum(&rc);
-    h->unused_blocks = rbuf_blocknum(&rc);
+    BLOCKNUM free_blocks = rbuf_blocknum(&rc);
+    BLOCKNUM unused_blocks = rbuf_blocknum(&rc);
    h->n_named_roots = rbuf_int(&rc);
-    h->translated_blocknum_limit = rbuf_diskoff(&rc);
-    h->block_translation_size_on_disk    = 4 + 16 * h->translated_blocknum_limit;
-    h->block_translation_address_on_disk = rbuf_diskoff(&rc);
-    // Set up the the block translation buffer.
-    create_block_allocator(&h->block_allocator, BLOCK_ALLOCATOR_HEADER_RESERVE, BLOCK_ALLOCATOR_ALIGNMENT);
+    u_int64_t translated_blocknum_limit = rbuf_diskoff(&rc);
+    u_int64_t block_translation_address_on_disk = rbuf_diskoff(&rc);
+    u_int64_t block_translation_size_on_disk = 4 +//4 for checksum
+                                               16*translated_blocknum_limit;
    // printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, h->translated_blocknum_limit, h->block_translation_address_on_disk);
-    if (h->block_translation_address_on_disk == 0) {
-	h->block_translation = 0;
-    } else {
+    if (block_translation_address_on_disk == 0) {
+        //There is no data on the disk.
+        //Create empty translation table.
+        toku_blocktable_create(&h->blocktable,
+                               free_blocks, unused_blocks,
+                               translated_blocknum_limit,
+                               block_translation_address_on_disk,
+                               block_translation_size_on_disk, NULL);
+    }
+    else {
+        //Load translation table if it exists on disk.
 	lock_for_pwrite();
-	block_allocator_alloc_block_at(h->block_allocator, h->block_translation_size_on_disk, h->block_translation_address_on_disk);
-	XMALLOC_N(h->translated_blocknum_limit, h->block_translation);
-	unsigned char *XMALLOC_N(h->block_translation_size_on_disk, tbuf);
+        //TODO: #1463 load!
+	unsigned char *XMALLOC_N(block_translation_size_on_disk, tbuf);
 	{
-	    ssize_t r = pread(fd, tbuf, h->block_translation_size_on_disk, h->block_translation_address_on_disk);
+	    ssize_t r = pread(fd, tbuf, block_translation_size_on_disk, block_translation_address_on_disk);
 	    // This cast is messed up in 32-bits if the block translation table is ever more than 4GB.  But in that case, the translation table itself won't fit in main memory.
-	    assert((u_int64_t)r==h->block_translation_size_on_disk);
+	    assert((u_int64_t)r==block_translation_size_on_disk);
 	}
 	{
 	    // check the checksum
-	    u_int32_t x1764 = x1764_memory(tbuf, h->block_translation_size_on_disk - 4);
-	    u_int64_t offset = h->block_translation_size_on_disk - 4;
-	    //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, h->block_translation_address_on_disk, offset, h->block_translation_size_on_disk);
+	    u_int32_t x1764 = x1764_memory(tbuf, block_translation_size_on_disk - 4);
+	    u_int64_t offset = block_translation_size_on_disk - 4;
+	    //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
 	    u_int32_t stored_x1764 = toku_ntohl(*(int*)(tbuf + offset));
 	    assert(x1764 == stored_x1764);
 	}
-	// now read all that data.
-	u_int64_t i;
-	struct rbuf rt;
-	rt.buf = tbuf;
-	rt.ndone = 0;
-	rt.size = h->block_translation_size_on_disk-4;
-	assert(rt.size>0);
-	for (i=0; i<h->translated_blocknum_limit; i++) {
-	    h->block_translation[i].diskoff = rbuf_diskoff(&rt);
-	    h->block_translation[i].size    = rbuf_diskoff(&rt);
-	    if (h->block_translation[i].size > 0)
-		block_allocator_alloc_block_at(h->block_allocator, h->block_translation[i].size, h->block_translation[i].diskoff);
-	    //printf("%s:%d %ld %ld\n", __FILE__, __LINE__, h->block_translation[i].diskoff, h->block_translation[i].size);
-	}
+	// Create table and read in data.
+        toku_blocktable_create(&h->blocktable,
+                               free_blocks, unused_blocks,
+                               translated_blocknum_limit,
+                               block_translation_address_on_disk,
+                               block_translation_size_on_disk,
+                               tbuf);
 	unlock_for_pwrite();
 	toku_free(tbuf);
    }
@@ -898,7 +896,7 @@ deserialize_brtheader (u_int32_t size, int fd, DISKOFF off, struct brt_header **
    toku_free(rc.buf);
    {
 	int r;
-	if ((r = deserialize_fifo_at(fd, block_allocator_allocated_limit(h->block_allocator), &h->fifo))) return r;
+	if ((r = deserialize_fifo_at(fd, toku_block_allocator_allocated_limit(h->blocktable), &h->fifo))) return r;
    }
    *brth = h;
    return 0;

--- a/newbrt/brt.c
+++ b/newbrt/brt.c
@@ -557,10 +557,8 @@ brtheader_init(struct brt_header *h) {

 static void
 brtheader_partial_destroy(struct brt_header *h) {
-    toku_free(h->block_translation);
-    h->block_translation = 0;
+    toku_blocktable_destroy(&h->blocktable);
    toku_fifo_free(&h->fifo);
-    destroy_block_allocator(&h->block_allocator);
 }

 static void
@@ -603,62 +601,6 @@ toku_brtheader_free (struct brt_header *h) {
    brtheader_free(h);
 }

-void
-extend_block_translation (BLOCKNUM blocknum, struct brt_header *h)
-// Effect: Record a block translation.  This means extending the translation table, and setting the diskoff and size to zero in any of the unused spots.
-{
-    if (h->translated_blocknum_limit <= (u_int64_t)blocknum.b) {
-        if (h->block_translation == 0) assert(h->translated_blocknum_limit==0);
-        u_int64_t new_limit = blocknum.b + 1;
-        u_int64_t old_limit = h->translated_blocknum_limit;
-        u_int64_t j;
-        XREALLOC_N(new_limit, h->block_translation);
-        for (j=old_limit; j<new_limit; j++) {
-            h->block_translation[j].diskoff = 0;
-            h->block_translation[j].size    = 0;
-        }
-        h->translated_blocknum_limit = new_limit;
-    }
-}
-
-const DISKOFF diskoff_is_null = (DISKOFF)-1; // in a freelist, this indicates end of list
-const DISKOFF size_is_free = (DISKOFF)-1;
-
-static int
-allocate_diskblocknumber (BLOCKNUM *res, BRT brt, TOKULOGGER logger __attribute__((__unused__))) {
-    BLOCKNUM result;
-    if (brt->h->free_blocks.b == diskoff_is_null) {
-        // no blocks in the free list
-        result = brt->h->unused_blocks;
-        brt->h->unused_blocks.b++;
-    } else {
-        result = brt->h->free_blocks;
-        assert(brt->h->block_translation[result.b].size = size_is_free);
-        brt->h->block_translation[result.b].size = 0;
-        brt->h->free_blocks.b = brt->h->block_translation[result.b].diskoff; // pop the freelist
-    }
-    assert(result.b>0);
-    *res = result;
-    brt->h->dirty = 1;
-    return 0;
-}
-
-static int
-free_diskblocknumber (BLOCKNUM *b, struct brt_header *h, TOKULOGGER logger __attribute__((__unused__)))
-// Effect: Free a diskblock
-//  Watch out for the case where the disk block was never yet written to disk and is beyond the translated_blocknum_limit.
-{
-    extend_block_translation(*b, h);
-    assert((u_int64_t)b->b < h->translated_blocknum_limit); // as a "limit" it should be <
-    assert(h->block_translation[b->b].size != size_is_free);
-    h->block_translation[b->b].size = size_is_free;
-    h->block_translation[b->b].diskoff = h->free_blocks.b;
-    h->free_blocks.b = b->b;
-    b->b = 0;
-    h->dirty = 1;
-    return 0;
-}
-
 static void
 initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height)
 // Effect: Fill in N as an empty brtnode.
@@ -712,7 +654,9 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
    int new_height = nodea->height+1;
    int new_nodesize = brt->h->nodesize;
    BLOCKNUM newroot_diskoff;
-    r = allocate_diskblocknumber(&newroot_diskoff, brt, logger);
+    r = toku_allocate_diskblocknumber(brt->h->blocktable,
+                                      &newroot_diskoff,
+                                      &brt->h->dirty, logger);
    assert(r==0);
    assert(newroot);
    newroot->ever_been_written = 0;
@@ -780,7 +724,7 @@ int toku_create_new_brtnode (BRT t, BRTNODE *result, int height, TOKULOGGER logg
    TAGMALLOC(BRTNODE, n);
    int r;
    BLOCKNUM name;
-    r = allocate_diskblocknumber (&name, t, logger);
+    r = toku_allocate_diskblocknumber(t->h->blocktable, &name, &t->h->dirty, logger);
    assert(r==0);
    assert(n);
    assert(t->h->nodesize>0);
@@ -2227,15 +2171,8 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_io, TOKUL
        if (did_merge) {
            BLOCKNUM bn = childb->thisnodename;
            rrb = toku_cachetable_unpin_and_remove(t->cf, bn);
-            // If the block_translation indicates that the size is <=0 then there is no block allocated.
-            // The block translation might not be big enough, and that also indicates no block allocated.
-            assert(0 <= bn.b); // the blocknumber better be good
-            if ((unsigned)bn.b < t->h->translated_blocknum_limit) {
-                if (t->h->block_translation[bn.b].size > 0) {
-                    block_allocator_free_block(t->h->block_allocator, t->h->block_translation[bn.b].diskoff);
-                }
-            }
-            rrb1 = free_diskblocknumber(&bn, t->h, logger);
+            rrb1 = toku_free_diskblocknumber(t->h->blocktable, &bn,
+                                             &t->h->dirty, logger);
        } else {
            rrb = toku_unpin_brtnode(t, childb);
        }
@@ -2246,7 +2183,7 @@ brt_merge_child (BRT t, BRTNODE node, int childnum_to_merge, BOOL *did_io, TOKUL
    }
    verify_local_fingerprint_nonleaf(node);
    return r;
-    }
+}

 static int
 brt_handle_maybe_reactive_child(BRT t, BRTNODE node, int childnum, enum reactivity re, BOOL *did_io, TOKULOGGER logger, BOOL *did_react) {
@@ -2315,7 +2252,8 @@ flush_this_child (BRT t, BRTNODE node, int childnum, TOKULOGGER logger, enum rea
 {
    assert(node->height>0);
    BLOCKNUM targetchild = BNC_BLOCKNUM(node, childnum);
-    assert(targetchild.b>=0 && targetchild.b<t->h->unused_blocks.b); // This assertion could fail in a concurrent setting since another process might have bumped unused memory. 
+    //TODO: #1463 This assert...
+    toku_verify_diskblocknumber_allocated(t->h->blocktable, targetchild);
    u_int32_t childfullhash = compute_child_fullhash(t->cf, node, childnum);
    BRTNODE child;
    {
@@ -2760,23 +2698,18 @@ static int brt_init_header(BRT t, TOKUTXN txn) {
    t->h->dirty=1;
    t->h->flags_array[0] = t->flags;
    t->h->nodesize=t->nodesize;
-    t->h->free_blocks = make_blocknum(-1);
-    t->h->unused_blocks=make_blocknum(2);
-    t->h->translated_blocknum_limit = 0;
-    t->h->block_translation = 0;
-    t->h->block_translation_size_on_disk = 0;
-    t->h->block_translation_address_on_disk = 0;
-    // printf("%s:%d translated_blocknum_limit=%ld, block_translation_address_on_disk=%ld\n", __FILE__, __LINE__, t->h->translated_blocknum_limit, t->h->block_translation_address_on_disk);
-    create_block_allocator(&t->h->block_allocator, BLOCK_ALLOCATOR_HEADER_RESERVE, BLOCK_ALLOCATOR_ALIGNMENT);
+    toku_blocktable_create_new(&t->h->blocktable);
    toku_fifo_create(&t->h->fifo);
    t->h->root_put_counter = global_root_put_counter++; 
            
    {
+        BLOCKNUM free_blocks = toku_block_get_free_blocks(t->h->blocktable);
+        BLOCKNUM unused_blocks = toku_block_get_unused_blocks(t->h->blocktable);
        LOGGEDBRTHEADER lh = {.size= toku_serialize_brt_header_size(t->h),
                              .flags = t->flags,
                              .nodesize = t->h->nodesize,
-                              .free_blocks = t->h->free_blocks,
-                              .unused_blocks = t->h->unused_blocks,
+                              .free_blocks = free_blocks,
+                              .unused_blocks = unused_blocks,
                              .n_named_roots = t->h->n_named_roots };
        if (t->h->n_named_roots>=0) {
            lh.u.many.names = t->h->names;
@@ -2788,7 +2721,7 @@ static int brt_init_header(BRT t, TOKUTXN txn) {
    }
    if ((r=setup_initial_brt_root_node(t, root, toku_txn_logger(txn)))!=0) { return r; }
    //printf("%s:%d putting %p (%d)\n", __FILE__, __LINE__, t->h, 0);
-    assert(t->h->free_blocks.b==-1);
+    toku_block_verify_no_free_blocks(t->h->blocktable);
    toku_cachefile_set_userdata(t->cf, t->h, toku_brtheader_close, toku_brtheader_checkpoint);

    return r;
@@ -2940,7 +2873,7 @@ int toku_brt_open(BRT t, const char *fname, const char *fname_in_env, const char
            t->h->n_named_roots++;
            if ((t->h->names[t->h->n_named_roots-1] = toku_strdup(dbname)) == 0)                                { assert(errno==ENOMEM); r=ENOMEM; goto died_after_read_and_pin; }
            //printf("%s:%d t=%p\n", __FILE__, __LINE__, t);
-            r = allocate_diskblocknumber(&t->h->roots[t->h->n_named_roots-1], t, toku_txn_logger(txn));
+            r = toku_allocate_diskblocknumber(t->h->blocktable, &t->h->roots[t->h->n_named_roots-1], &t->h->dirty, toku_txn_logger(txn));
            if (r!=0) goto died_after_read_and_pin;
            t->h->dirty = 1;
            compute_and_fill_remembered_hash(t, t->h->n_named_roots-1);
@@ -3074,7 +3007,9 @@ toku_brtheader_checkpoint (CACHEFILE cachefile, void *header_v)
 	    int r = toku_serialize_brt_header_to(toku_cachefile_fd(cachefile), h);
 	    if (r) return r;
 	}
-	u_int64_t write_to = block_allocator_allocated_limit(h->block_allocator); // Must compute this after writing the header.
+        //We would want retrieving 'write_to' and writing to that point to be
+        //atomic.  This is only done during shutdown of a BRT, so we allow it.
+	u_int64_t write_to = toku_block_allocator_allocated_limit(h->blocktable); // Must compute this after writing the header.
 	//printf("%s:%d fifo written to %lu\n", __FILE__, __LINE__, write_to);
 	{
 	    int r = toku_serialize_fifo_at(toku_cachefile_fd(cachefile), write_to, h->fifo);
@@ -4360,12 +4295,7 @@ int toku_dump_brt (FILE *f, BRT brt) {
    CACHEKEY *rootp;
    assert(brt->h);
    u_int32_t fullhash;
-    u_int64_t i;
-    fprintf(f, "Block translation:");
-    for (i=0; i<brt->h->translated_blocknum_limit; i++) {
-        fprintf(f, " %"PRIu64": %"PRId64" %"PRId64"", i, brt->h->block_translation[i].diskoff, brt->h->block_translation[i].size);
-    }
-    fprintf(f, "\n");
+    toku_block_dump_translation_table(f, brt->h->blocktable);
    rootp = toku_calculate_root_offset_pointer(brt, &fullhash);
    return toku_dump_brtnode(f, brt, *rootp, 0, 0, 0, 0, 0);
 }
@@ -4396,12 +4326,14 @@ static void toku_brt_lock_init(void) {
    toku_pwrite_lock_init();
    toku_logger_lock_init();
    toku_graceful_lock_init();
+    toku_blocktable_lock_init();
 }

 static void toku_brt_lock_destroy(void) {
    toku_pwrite_lock_destroy();
    toku_logger_lock_destroy();
    toku_graceful_lock_destroy();
+    toku_blocktable_lock_destroy();
 }

 void toku_brt_init(void) {

--- a/newbrt/brt.h
+++ b/newbrt/brt.h
@@ -111,8 +111,6 @@ enum brt_header_flags {

 int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less,  u_int64_t *equal,  u_int64_t *greater);

-void extend_block_translation (BLOCKNUM blocknum, struct brt_header *h);
-
 void toku_brt_init(void);
 void toku_brt_destroy(void);
 void toku_pwrite_lock_init(void);

--- a/newbrt/brtdump.c
+++ b/newbrt/brtdump.c
@@ -30,8 +30,10 @@ dump_header (int f, struct brt_header **header) {
    else printf(" layout_version=%d\n", h->layout_version);
    printf(" dirty=%d\n", h->dirty);
    printf(" nodesize=%u\n", h->nodesize);
-    printf(" free_blocks=%" PRId64 "\n", h->free_blocks.b);
-    printf(" unused_memory=%" PRId64 "\n", h->unused_blocks.b);
+    BLOCKNUM free_blocks = toku_block_get_free_blocks(h->blocktable);
+    BLOCKNUM unused_blocks = toku_block_get_unused_blocks(h->blocktable);
+    printf(" free_blocks=%" PRId64 "\n", free_blocks.b);
+    printf(" unused_memory=%" PRId64 "\n", unused_blocks.b);
    if (h->n_named_roots==-1) {
 	printf(" unnamed_root=%" PRId64 "\n", h->roots[0].b);
 	printf(" flags=%u\n", h->flags_array[0]);
@@ -165,10 +167,7 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {

 static void 
 dump_block_translation(struct brt_header *h, u_int64_t offset) {
-    if (offset < h->translated_blocknum_limit) {
-        struct block_translation_pair *bx = &h->block_translation[offset];
-        printf("%" PRIu64 ": %" PRId64 " %" PRId64 "\n", offset, bx->diskoff, bx->size);
-    }
+    toku_block_dump_translation(h->blocktable, offset);
 }

 static int 
@@ -187,28 +186,31 @@ dump_fragmentation(int f, struct brt_header *h) {
    u_int64_t leafblocks = 0;
    u_int64_t fragsizes = 0;
    u_int64_t i;
-    for (i = 0; i < h->translated_blocknum_limit; i++) {
+    u_int64_t limit = toku_block_get_translated_blocknum_limit(h->blocktable);
+    for (i = 0; i < limit; i++) {
        BRTNODE n;
 	BLOCKNUM blocknum = make_blocknum(i);
        int r = toku_deserialize_brtnode_from (f, blocknum, 0 /*pass zero for hash, it doesn't matter*/, &n, h);
 	if (r != 0) continue;
-        blocksizes += h->block_translation[i].size;
+
+        DISKOFF size = toku_block_get_size(h->blocktable, blocknum);
+        blocksizes += size;
 	if (n->height == 0) {
-	    leafsizes += h->block_translation[i].size;
+	    leafsizes += size;
 	    leafblocks += 1;
 	}
 	toku_brtnode_free(&n);
    }
-    size_t n = h->translated_blocknum_limit * sizeof (struct block_translation_pair);
+    size_t n = limit * sizeof (struct block_translation_pair);
    struct block_translation_pair *bx = toku_malloc(n);
-    memcpy(bx, h->block_translation, n);
-    qsort(bx, h->translated_blocknum_limit, sizeof (struct block_translation_pair), bxpcmp);
-    for (i = 0; i < h->translated_blocknum_limit - 1; i++) {
+    toku_block_memcpy_translation_table(h->blocktable, n, bx);
+    qsort(bx, limit, sizeof (struct block_translation_pair), bxpcmp);
+    for (i = 0; i < limit - 1; i++) {
        // printf("%lu %lu %lu\n", i, bx[i].diskoff, bx[i].size);
        fragsizes += bx[i+1].diskoff - (bx[i].diskoff + bx[i].size);
    }
    toku_free(bx);
-    printf("translated_blocknum_limit: %" PRIu64 "\n", h->translated_blocknum_limit);
+    printf("translated_blocknum_limit: %" PRIu64 "\n", limit);
    printf("leafblocks: %" PRIu64 "\n", leafblocks);
    printf("blocksizes: %" PRIu64 "\n", blocksizes);
    printf("leafsizes: %" PRIu64 "\n", leafsizes);
@@ -299,15 +301,24 @@ main (int argc, const char *argv[]) {
    } else {
 	BLOCKNUM blocknum;
 	printf("Block translation:");
-	for (blocknum.b=0; blocknum.b<h->unused_blocks.b; blocknum.b++) {
+
+        u_int64_t limit = toku_block_get_translated_blocknum_limit(h->blocktable);
+        BLOCKNUM unused_blocks = toku_block_get_unused_blocks(h->blocktable);
+        size_t bx_size = limit * sizeof (struct block_translation_pair);
+        struct block_translation_pair *bx = toku_malloc(bx_size);
+        toku_block_memcpy_translation_table(h->blocktable, bx_size, bx);
+
+
+	for (blocknum.b=0; blocknum.b< unused_blocks.b; blocknum.b++) {
 	    printf(" %" PRId64 ":", blocknum.b);
-	    if (h->block_translation[blocknum.b].size == -1) printf("free");
-	    else printf("%" PRId64 ":%" PRId64, h->block_translation[blocknum.b].diskoff, h->block_translation[blocknum.b].size);
+	    if (bx[blocknum.b].size == -1) printf("free");
+	    else printf("%" PRId64 ":%" PRId64, bx[blocknum.b].diskoff, bx[blocknum.b].size);
 	}
-	for (blocknum.b=1; blocknum.b<h->unused_blocks.b; blocknum.b++) {
-	    if (h->block_translation[blocknum.b].size != -1)
+	for (blocknum.b=1; blocknum.b<unused_blocks.b; blocknum.b++) {
+	    if (bx[blocknum.b].size != -1)
 		dump_node(f, blocknum, h);
        }
+        toku_free(bx);
    }
    toku_brtheader_free(h);
    toku_malloc_cleanup();

--- a/newbrt/recover.c
+++ b/newbrt/recover.c
@@ -137,8 +137,9 @@ static void toku_recover_fheader (LSN UU(lsn), TXNID UU(txnid),FILENUM filenum,L
    XMALLOC(h->flags_array);
    h->flags_array[0] = header.flags;
    h->nodesize = header.nodesize;
-    h->free_blocks   = header.free_blocks;
-    h->unused_blocks = header.unused_blocks;
+    assert(h->blocktable /* Not initialized.  Is this used? */);
+    toku_block_recovery_set_free_blocks(h->blocktable, header.free_blocks);
+    toku_block_recovery_set_unused_blocks(h->blocktable, header.unused_blocks);
    h->n_named_roots = header.n_named_roots;
    r=toku_fifo_create(&h->fifo);
    assert(r==0);
@@ -687,7 +688,7 @@ toku_recover_changeunusedmemory (LSN UU(lsn), FILENUM filenum, BLOCKNUM UU(oldun
    assert(r==0);
    assert(pair->brt);
    assert(pair->brt->h);
-    pair->brt->h->unused_blocks = newunused;
+    toku_block_recovery_set_unused_blocks(pair->brt->h->blocktable, newunused);
 }

 static int toku_recover_checkpoint (LSN UU(lsn)) {

--- a/newbrt/tests/Makefile
+++ b/newbrt/tests/Makefile
@@ -84,8 +84,6 @@ REGRESSION_TESTS_RAW = \
 	omt-cursor-test \
 	omt-test \
 	shortcut \
-	test1305 \
-	test1308a \
 	test-assert \
 	test-brt-delete-both \
 	test-brt-overflow \

--- a/newbrt/tests/brt-serialize-test.c
+++ b/newbrt/tests/brt-serialize-test.c
@@ -53,14 +53,13 @@ static void test_serialize(void) {
    memset(btps, 0, sizeof(btps));
    brt->h = brt_h;
    brt_h->panic = 0; brt_h->panic_string = 0;
-    brt_h->translated_blocknum_limit = 1;
-    brt_h->block_translation = btps;
-    brt_h->block_translation[20].diskoff = 4096;
-    brt_h->block_translation[20].size    = 100;
-    create_block_allocator(&brt_h->block_allocator, 4096, BLOCK_ALLOCATOR_ALIGNMENT);
+    toku_blocktable_create_new(&brt_h->blocktable);
+    toku_blocktable_debug_set_translation(brt_h->blocktable, 1, btps);
+    btps[20].diskoff = 4096;
+    btps[20].size    = 100;
    {
 	u_int64_t b;
-	block_allocator_alloc_block(brt_h->block_allocator, 100, &b);
+        toku_block_alloc(brt_h->blocktable, 100, &b);
 	assert(b==4096);
    }
    
@@ -120,9 +119,8 @@ static void test_serialize(void) {
    toku_free(sn.u.n.childinfos);
    toku_free(sn.u.n.childkeys);

-    block_allocator_free_block(brt_h->block_allocator, 4096);
-    destroy_block_allocator(&brt_h->block_allocator);
-    toku_free(brt_h->block_translation);
+    toku_block_free(brt_h->blocktable, 4096);
+    toku_blocktable_destroy(&brt_h->blocktable);
    toku_free(brt_h);
    toku_free(brt);
 }

--- a/newbrt/wbuf.h
+++ b/newbrt/wbuf.h
@@ -15,6 +15,7 @@
 /* This code requires that the buffer be big enough to hold whatever you put into it. */ 
 /* This abstraction doesn't do a good job of hiding its internals.
 * Why?  The performance of this code is important, and we want to inline stuff */
+//Why is size here an int instead of DISKOFF like in the initializer?
 struct wbuf {
    unsigned char *buf;
    unsigned int  size;