Addresses #855, #856. Reworked the expected-O(1) cursors to be as close as possible to the O(log n) cursors. Semantics are now as close as possible, and the cursors may be a little faster now. git-svn-id: file:///svn/tokudb@4339 c7de825b-a66e-492c-adef-691d508d4ae1

Addresses #855, #856. Reworked ExpectedO(1) cursors to be as close
as possible to the O(logn) cursors. Semantics are now as close as possible, and may be a little faster now. git-svn-id: file:///svn/tokudb@4339 c7de825b-a66e-492c-adef-691d508d4ae1
e1bd940a · Yoni Fogel · 4c711ed0 · 4c711ed0 · e1bd940a · 4c711ed0
Commit e1bd940a authored Jun 03, 2008 by Yoni Fogel
3 changed files
--- a/newbrt/omt-with-o1-cursors/omt-internal.h
+++ b/newbrt/omt-with-o1-cursors/omt-internal.h
-/**
-  \brief OMT implementation header
-*/
-
-#if !defined(OMTI_H)
-#define OMTI_H
-
-#ident "Copyright (c) 2008 Tokutek Inc.  All rights reserved."
-
-#include <stdint.h>
-
-/** Type for the node index */
-typedef u_int32_t node_idx;
-
-
-/** Define a NULL index in the node array */
-#define NODE_NULL UINT32_MAX
-
-/** OMT node */
-typedef struct omt_node *OMT_NODE;
-struct omt_node {
-    u_int32_t weight; /* Size of subtree rooted at this node 
-                         (including this one). */
-    node_idx  left;   /* Index of left  subtree. */
-    node_idx  right;  /* Index of right subtree. */
-    OMTVALUE  value;  /* The value stored in the node. */
-};
-
-/** Order Maintenance Tree */
-struct omt {
-    node_idx   root;
-
-    u_int32_t  node_capacity;
-    OMT_NODE   nodes;
-    node_idx   free_idx;
-
-    u_int32_t  tmparray_size;
-    node_idx*  tmparray;
-};
-
-//Initial max size of root-to-leaf path
-#define TOKU_OMTCURSOR_INITIAL_SIZE 4
-
-// Cursor for order maintenance tree
-struct omtcursor {
-    u_int32_t max_pathlen; //Max (root to leaf) path length;
-    u_int32_t pathlen;     //Length of current path
-    node_idx *path;
-    OMT       omt;         //Associated OMT
-};
-
-
-#endif  /* #ifndef OMTI_H */
-
--- a/newbrt/omt-with-o1-cursors/omt.c
+++ b/newbrt/omt-with-o1-cursors/omt.c
@@ -2,15 +2,49 @@

 #include <errno.h>
 #include <sys/types.h>
+#include <stdint.h>

 typedef void *OMTVALUE;
 #include "omt.h"
-#include "omt-internal.h"
 #include "../newbrt/memory.h"
 #include "../newbrt/toku_assert.h"
 #include "../include/db.h"
 #include "../newbrt/brttypes.h"

+typedef u_int32_t node_idx;
+static const node_idx NODE_NULL = UINT32_MAX;
+
+typedef struct omt_node *OMT_NODE; /* Nodes are reached as omt->nodes+idx, where idx is a node_idx. */
+struct omt_node { /* One tree node, stored as an element of the omt's nodes array. */
+    u_int32_t weight; /* Size of subtree rooted at this node (including this one). */
+    node_idx  left;   /* Index of left  subtree; NODE_NULL if there is none. */
+    node_idx  right;  /* Index of right subtree; NODE_NULL if there is none. */
+    OMTVALUE  value;  /* The value stored in the node. */
+};
+
+struct omt { // An order maintenance tree whose nodes live in one array and refer to each other by index.
+    node_idx   root;          // Index of the root node; toku_omt_clear sets it to NODE_NULL for an empty tree.
+
+    u_int32_t  node_capacity; // Number of elements in nodes (this is how toku_omt_memory_size counts them).
+    OMT_NODE   nodes;         // The node array; a node_idx is an offset into it.
+    node_idx   free_idx;      // Reset to 0 on create and clear; presumably the next unused slot in nodes -- TODO confirm.
+
+    u_int32_t  tmparray_size; // Number of elements in tmparray.
+    node_idx*  tmparray;      // Array of node indices; NOTE(review): its users are not visible in this diff.
+
+    OMTCURSOR  associated; // Head of the circular list of OMTCURSORs associated with this OMT; NULL if there are none.
+};
+
+struct omt_cursor { // A cursor: a saved path of node indices into the OMT it is associated with.
+    OMT omt;   // The omt this cursor is associated with.  NULL if not present.
+    u_int32_t max_pathlen; //Max (root to leaf) path length; path is allocated with this many elements and grown by omtcursor_stack_push.
+    u_int32_t pathlen;     //Length of current path
+    node_idx *path;        // Stack of node indices; the last entry, path[pathlen-1], is the cursor's current node.
+    OMTCURSOR next,prev; // circular linked list of all OMTCURSORs associated with omt.  Both are NULL while the cursor is unassociated.
+};
+
+//Initial max size of root-to-leaf path
+static const u_int32_t TOKU_OMTCURSOR_INITIAL_SIZE = 4;

 static int omt_create_internal(OMT *omtp, u_int32_t num_starting_nodes) {
    if (num_starting_nodes < 2) num_starting_nodes = 2;
@@ -31,6 +65,7 @@ static int omt_create_internal(OMT *omtp, u_int32_t num_starting_nodes) {
        return errno;
    }
    result->free_idx = 0;
+    result->associated = NULL;
    *omtp = result;
    return 0;
 }
@@ -39,8 +74,74 @@ int toku_omt_create (OMT *omtp) {
    return omt_create_internal(omtp, 2);
 }

+// Effect: Allocate a cursor that is not associated with any OMT and store it in *omtcp.
+//   The path array starts with room for TOKU_OMTCURSOR_INITIAL_SIZE entries and an empty path.
+// Requires: omtcp != NULL
+// Returns:
+//   0       success
+//   errno   as left by the allocation that failed (*omtcp is not modified)
+int toku_omt_cursor_create (OMTCURSOR *omtcp) {
+    OMTCURSOR MALLOC(c);
+    if (c==NULL) return errno;
+    c->omt = NULL;
+    c->next = c->prev = NULL;
+    c->max_pathlen    = TOKU_OMTCURSOR_INITIAL_SIZE;
+    c->pathlen        = 0;
+    MALLOC_N(c->max_pathlen, c->path);
+    if (c->path==NULL) {
+        // Capture errno before freeing: releasing c may overwrite it,
+        // and the caller must see the allocation failure, not a stale or zero value.
+        int r = errno;
+        toku_free(c);
+        return r;
+    }
+    *omtcp = c;
+    return 0;
+}
+
+// Effect: Detach c from the OMT it is associated with, removing it from that
+//   OMT's circular list of cursors.  A NULL or already-unassociated cursor is left alone.
+void toku_omt_cursor_invalidate (OMTCURSOR c) {
+    if (c==NULL) return;
+    OMT owner = c->omt;
+    if (owner==NULL) return;
+    if (c->next != c) {
+        // Other cursors remain on the ring: splice c out of it.
+        OMTCURSOR after  = c->next;
+        OMTCURSOR before = c->prev;
+        // If c was the list head, hand that role to its successor.
+        if (owner->associated == c) owner->associated = after;
+        after->prev  = before;
+        before->next = after;
+    } else {
+        // c was the only cursor on the ring, so the list becomes empty.
+        owner->associated = NULL;
+    }
+    c->prev = NULL;
+    c->next = NULL;
+    c->omt  = NULL;
+}
+
+// Effect: Invalidate the cursor *p, free its path array and the cursor itself, and set *p=NULL.
+//   If *p is already NULL there is nothing to release and the call does nothing.
+// Requires: p != NULL
+void toku_omt_cursor_destroy (OMTCURSOR *p) {
+    OMTCURSOR c = *p;
+    // toku_omt_cursor_invalidate tolerates NULL, but the frees below would dereference it.
+    if (c==NULL) return;
+    toku_omt_cursor_invalidate(c);
+    toku_free(c->path);
+    toku_free(c);
+    *p = NULL;
+}
+
+// Effect: Invalidate every cursor currently associated with omt.
+static void invalidate_cursors (OMT omt) {
+    // Each call unlinks the current head (and clears omt->associated once the
+    // last cursor is gone), so the loop ends when the list is empty.
+    while (omt->associated != NULL) {
+        toku_omt_cursor_invalidate(omt->associated);
+    }
+}
+
+// Effect: Prepare c for a fresh root-to-node search in omt.
+//   The saved path is emptied, and c is linked into omt's circular list of cursors
+//   (after being detached from whatever OMT it was associated with before).
+// Requires: omt != NULL, c != NULL
+static void associate (OMT omt, OMTCURSOR c)
+{
+    // The search that follows pushes a new path starting at the root.  Without
+    // this reset a reused cursor would keep its old path underneath the new one,
+    // and next/prev could pop back into those stale entries.
+    c->pathlen = 0;
+    if (c->omt==omt) return;
+    toku_omt_cursor_invalidate(c);
+    if (omt->associated==NULL) {
+        // First cursor for this OMT: a ring of one.
+        c->prev = c;
+        c->next = c;
+        omt->associated = c;
+    } else {
+        // Insert c just before the head, i.e. at the tail of the ring.
+        c->prev = omt->associated->prev;
+        c->next = omt->associated;
+        omt->associated->prev->next = c;
+        omt->associated->prev = c;
+    }
+    c->omt = omt;
+}
+
 void toku_omt_destroy(OMT *omtp) {
    OMT omt=*omtp;
+    invalidate_cursors(omt);
    toku_free(omt->nodes);
    toku_free(omt->tmparray);
    toku_free(omt);
@@ -242,7 +343,8 @@ static inline void insert_internal(OMT omt, node_idx *n_idxp, OMTVALUE value, u_

 int toku_omt_insert_at(OMT omt, OMTVALUE value, u_int32_t index) {
    int r;
-    if (index>nweight(omt, omt->root)) return ERANGE;
+    invalidate_cursors(omt);
+    if (index>nweight(omt, omt->root)) return EINVAL;
    if ((r=maybe_resize_and_rebuild(omt, 1+nweight(omt, omt->root), MAYBE_REBUILD))) return r;
    node_idx* rebalance_idx = NULL;
    insert_internal(omt, &omt->root, value, index, &rebalance_idx);
@@ -263,7 +365,7 @@ static inline void set_at_internal(OMT omt, node_idx n_idx, OMTVALUE v, u_int32_
 }

 int toku_omt_set_at (OMT omt, OMTVALUE value, u_int32_t index) {
-    if (index>=nweight(omt, omt->root)) return ERANGE;
+    if (index>=nweight(omt, omt->root)) return EINVAL;
    set_at_internal(omt, omt->root, value, index);
    return 0;
 }
@@ -310,7 +412,8 @@ static inline void delete_internal(OMT omt, node_idx *n_idxp, u_int32_t index, O
 int toku_omt_delete_at(OMT omt, u_int32_t index) {
    OMTVALUE v;
    int r;
-    if (index>=nweight(omt, omt->root)) return ERANGE;
+    invalidate_cursors(omt);
+    if (index>=nweight(omt, omt->root)) return EINVAL;
    if ((r=maybe_resize_and_rebuild(omt, -1+nweight(omt, omt->root), MAYBE_REBUILD))) return r;
    node_idx* rebalance_idx = NULL;
    delete_internal(omt, &omt->root, index, &v, &rebalance_idx);
@@ -318,7 +421,12 @@ int toku_omt_delete_at(OMT omt, u_int32_t index) {
    return 0;
 }

-static int omtcursor_stack_push(OMTCURSOR c, node_idx idx) {
+// Effect: Drop the last entry of the cursor's saved path.
+// Requires: the path is not empty (asserted).
+static inline void omtcursor_stack_pop(OMTCURSOR c) {
+    assert(c->pathlen);
+    c->pathlen -= 1;
+}
+
+static inline int omtcursor_stack_push(OMTCURSOR c, node_idx idx) {
    if (c->max_pathlen-1<=c->pathlen) {
        //Increase max_pathlen
        u_int32_t new_max = c->max_pathlen*2;
@@ -331,46 +439,31 @@ static int omtcursor_stack_push(OMTCURSOR c, node_idx idx) {
    return 0;
 }

-static node_idx omtcursor_stack_peek(OMTCURSOR c) {
+static inline node_idx omtcursor_stack_peek(OMTCURSOR c) {
    return c->path[c->pathlen-1];
 }

-static node_idx omtcursor_stack_pop(OMTCURSOR c) {
-    assert(c->pathlen);
-    node_idx value = omtcursor_stack_peek(c);;
-    c->pathlen--;
-    return value;
-}
-
-static void omtcursor_associate(OMTCURSOR c, OMT omt) {
-    c->omt     = omt;
-    c->pathlen = 0;
-}
-
 static inline int fetch_internal(OMT V, node_idx idx, u_int32_t i, OMTVALUE *v, OMTCURSOR c) {
+    OMT_NODE n = V->nodes+idx;
    int r;
-    // Add the current index to the cursor path
    if (c!=NULL && (r=omtcursor_stack_push(c, idx))) return r;
-
-    /* Find the node corresponding to index idx */
-    OMT_NODE n = V->nodes+idx;
-
-    /* Visit recursively the appropriate sub-tree */
    if (i < nweight(V, n->left)) {
        return fetch_internal(V, n->left,  i, v, c);
    } else if (i == nweight(V, n->left)) {
        *v = n->value;
+        return 0;
    } else {
        return fetch_internal(V, n->right, i-nweight(V, n->left)-1, v, c);
    }
-    return 0;
 }

 int toku_omt_fetch(OMT V, u_int32_t i, OMTVALUE *v, OMTCURSOR c) {
-    if (i>=nweight(V, V->root)) return ERANGE;
-    if (c!=NULL) omtcursor_associate(c, V);
+    if (i>=nweight(V, V->root)) return EINVAL;
+    if (c) associate(V,c);
    int r = fetch_internal(V, V->root, i, v, c);
-    if (c!=NULL && r!=0) toku_omt_cursor_invalidate(c);
+    if (c && r!=0) {
+        toku_omt_cursor_invalidate(c);
+    }
    return r;
 }

@@ -399,6 +492,8 @@ int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v,
    int r;
    u_int32_t idx;

+    invalidate_cursors(omt);
+
    r = toku_omt_find_zero(omt, h, v, NULL, &idx, NULL);
    if (r==0) {
        if (index) *index = idx;
@@ -412,13 +507,14 @@ int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v,
    return 0;
 }

-static inline int find_internal_zero(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
+static inline int find_internal_zero(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
+// requires: index!=NULL
+{
    int r;
    if (n_idx==NODE_NULL) {
-	*index=0;
+	*index = 0;
 	return DB_NOTFOUND;
    }
-    // Add the current index to the cursor path
    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
    OMT_NODE n = omt->nodes+n_idx;
    int hv = h(n->value, extra);
@@ -427,17 +523,12 @@ static inline int find_internal_zero(OMT omt, node_idx n_idx, int (*h)(OMTVALUE,
        *index += nweight(omt, n->left)+1;
        return r;
    } else if (hv>0) {
-        r = find_internal_zero(omt, n->left, h, extra, value, index, c);
-        if (c!=NULL && r==DB_NOTFOUND && *index==nweight(omt, n->left)) {
-            //Truncate the saved cursor path at n_idx.
-            while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
-        }
-        return r;
+        return find_internal_zero(omt, n->left, h, extra, value, index, c);
    } else {
-        r = find_internal_zero(omt, n->left, h, extra, value, index, c);
+        r =  find_internal_zero(omt, n->left, h, extra, value, index, c);
        if (r==DB_NOTFOUND) {
            *index = nweight(omt, n->left);
-            *value = n->value;
+            if (value) *value = n->value;
            if (c!=NULL) {
                //Truncate the saved cursor path at n_idx.
                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
@@ -449,35 +540,32 @@ static inline int find_internal_zero(OMT omt, node_idx n_idx, int (*h)(OMTVALUE,
 }

 int toku_omt_find_zero(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
-    if (c!=NULL) omtcursor_associate(c, V);
-    u_int32_t idx_tmp;
-    OMTVALUE  val_tmp; 
-    int r = find_internal_zero(V, V->root, h, extra, &val_tmp, &idx_tmp, c);
-    if (c!=NULL && ( (r!=0 && r!=DB_NOTFOUND) ||
-                      idx_tmp==nweight(V, V->root))) {
-        toku_omt_cursor_invalidate(c);
-    }
-    if (c==NULL || r==0 || r==DB_NOTFOUND) {
-        if (index!=NULL)         *index = idx_tmp;
-        if (value!=NULL && r==0) *value = val_tmp;
+    //Index can be modified before a cursor error, so we must use a temp.
+    u_int32_t tmp_index;
+    if (c) associate(V,c);
+    int r = find_internal_zero(V, V->root, h, extra, value, &tmp_index, c);
+    if (c && r!=0) {
+	toku_omt_cursor_invalidate(c);
    }
+    if ((r==0 || r==DB_NOTFOUND) && index!=NULL) *index = tmp_index;
    return r;
 }

 //  If direction <0 then find the largest  i such that h(V_i,extra)<0.
-static inline int find_internal_minus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
+static inline int find_internal_minus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
+// requires: index!=NULL
+{
    int r;
    if (n_idx==NODE_NULL) return DB_NOTFOUND;
-    // Add the current index to the cursor path
    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
    OMT_NODE n = omt->nodes+n_idx;
    int hv = h(n->value, extra);
    if (hv<0) {
        r = find_internal_minus(omt, n->right, h, extra, value, index, c);
-        if (r==0) (*index) += nweight(omt, n->left)+1;
+        if (r==0) *index += nweight(omt, n->left)+1;
        else if (r==DB_NOTFOUND) {
            *index = nweight(omt, n->left);
-            *value = n->value;
+            if (value!=NULL) *value = n->value;
            if (c!=NULL) {
                //Truncate the saved cursor path at n_idx.
                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
@@ -491,10 +579,11 @@ static inline int find_internal_minus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE
 }

 //  If direction >0 then find the smallest i such that h(V_i,extra)>0.
-static inline int find_internal_plus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
+static inline int find_internal_plus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c)
+// requires: index!=NULL
+{
    int r;
    if (n_idx==NODE_NULL) return DB_NOTFOUND;
-    // Add the current index to the cursor path
    if (c!=NULL && (r=omtcursor_stack_push(c, n_idx))) return r;
    OMT_NODE n = omt->nodes+n_idx;
    int hv = h(n->value, extra);
@@ -502,7 +591,7 @@ static inline int find_internal_plus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE,
        r = find_internal_plus(omt, n->left, h, extra, value, index, c);
        if (r==DB_NOTFOUND) {
            *index = nweight(omt, n->left);
-            *value = n->value;
+            if (value!=NULL) *value = n->value;
            if (c!=NULL) {
                //Truncate the saved cursor path at n_idx.
                while (omtcursor_stack_peek(c)!=n_idx) omtcursor_stack_pop(c);
@@ -512,39 +601,35 @@ static inline int find_internal_plus(OMT omt, node_idx n_idx, int (*h)(OMTVALUE,
        return r;
    } else {
        r = find_internal_plus(omt, n->right, h, extra, value, index, c);
-        if (r==0) (*index) += nweight(omt, n->left)+1;
+        if (r==0) *index += nweight(omt, n->left)+1;
        return r;
    }
 }

 int toku_omt_find(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, u_int32_t *index, OMTCURSOR c) {
+    u_int32_t tmp_index;
+    int r;
+    if (index==NULL) index=&tmp_index;
+    if (c) associate(V,c);
    if (direction==0) {
 	abort();
+    } else if (direction<0) {
+        r = find_internal_minus(V, V->root, h, extra, value, index, c);
+    } else {
+        r = find_internal_plus( V, V->root, h, extra, value, index, c);
    }
-    else {
-        int r;
-        u_int32_t idx_tmp;
-        OMTVALUE  val_tmp;
-        if (c!=NULL) omtcursor_associate(c, V);
-        if (direction<0) {
-            r = find_internal_minus(V, V->root, h, extra, &val_tmp, &idx_tmp, c);
-        } else {
-            r = find_internal_plus( V, V->root, h, extra, &val_tmp, &idx_tmp, c);
-        }
-        if (c!=NULL && r!=0) toku_omt_cursor_invalidate(c);
-        if (r==0) {
-            if (index!=NULL) *index = idx_tmp;
-            if (value!=NULL) *value = val_tmp;
-        }
-        return r;
+    if (c && r!=0) {
+	toku_omt_cursor_invalidate(c);
    }
+    return r;
 }

 int toku_omt_split_at(OMT omt, OMT *newomtp, u_int32_t index) {
    int r                = ENOSYS;
    OMT newomt           = NULL;
    OMTVALUE *tmp_values = NULL;
-    if (index>nweight(omt, omt->root)) { r = ERANGE; goto cleanup; }
+    invalidate_cursors(omt);
+    if (index>nweight(omt, omt->root)) { r = EINVAL; goto cleanup; }
    u_int32_t newsize = nweight(omt, omt->root)-index;
    if ((r = omt_create_internal(&newomt, newsize))) goto cleanup;
    MALLOC_N(nweight(omt, omt->root), tmp_values);
@@ -568,6 +653,8 @@ int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomtp) {
    int r                = ENOSYS;
    OMT newomt           = NULL;
    OMTVALUE *tmp_values = NULL;
+    invalidate_cursors(leftomt);
+    invalidate_cursors(rightomt);
    u_int32_t newsize = toku_omt_size(leftomt)+toku_omt_size(rightomt);
    if ((r = omt_create_internal(&newomt, newsize))) goto cleanup;
    MALLOC_N(newsize, tmp_values);
@@ -589,118 +676,94 @@ int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomtp) {
 }

 void toku_omt_clear(OMT omt) {
+    invalidate_cursors(omt);
    omt->free_idx = 0;
    omt->root     = NODE_NULL;
 }

-int toku_omt_cursor_create(OMTCURSOR *p) {
-    OMTCURSOR MALLOC(result);
-    if (result==NULL) return errno;
-    result->max_pathlen = TOKU_OMTCURSOR_INITIAL_SIZE;
-    result->pathlen     = 0;
-    MALLOC_N(result->max_pathlen, result->path); 
-    if (result->path==NULL) {
-        toku_free(result);
-        return errno;
-    }
-    result->omt = NULL;
-    *p = result;
-    return 0;
-}
-
-void toku_omt_cursor_destroy(OMTCURSOR *p) {
-    OMTCURSOR c=*p;
-    toku_free(c->path);
-    toku_free(c);
-    *p = NULL;
-}
-
-int toku_omt_cursor_is_valid(OMTCURSOR c) {
-    return c->pathlen>0 && c->omt!=NULL;
+unsigned long toku_omt_memory_size (OMT omt) {
+    return sizeof(*omt)+omt->node_capacity*sizeof(omt->nodes[0]) + omt->tmparray_size*sizeof(omt->tmparray[0]);
 }

-void toku_omt_cursor_invalidate(OMTCURSOR c) {
-    c->pathlen = 0;
-    c->omt=NULL;
+int toku_omt_cursor_is_valid (OMTCURSOR c) {
+    return c->omt!=NULL;
 }

-static void omtcursor_current_internal(OMTCURSOR c, OMTVALUE *v) {
+static inline void omtcursor_current_internal(OMTCURSOR c, OMTVALUE *v) {
    *v = c->omt->nodes[omtcursor_stack_peek(c)].value;
 }

-int toku_omt_cursor_current(OMTCURSOR c, OMTVALUE *v) {
-    if (!toku_omt_cursor_is_valid(c)) return EINVAL;
-    omtcursor_current_internal(c, v);
-    return 0;
-}
-
-static int omtcursor_next_internal(OMTCURSOR c) {
-    if (!toku_omt_cursor_is_valid(c)) return EINVAL;
+static inline int omtcursor_next_internal(OMTCURSOR c) {
    OMT_NODE current = c->omt->nodes+omtcursor_stack_peek(c);
    if (current->right!=NODE_NULL) {
        //Enter into subtree
-        if (omtcursor_stack_push(c, current->right)) goto invalidate;
+        if (omtcursor_stack_push(c, current->right)) return EINVAL;
        current = c->omt->nodes+current->right;
        while (current->left!=NODE_NULL) {
-            if (omtcursor_stack_push(c, current->left)) goto invalidate;
+            if (omtcursor_stack_push(c, current->left)) return EINVAL;
            current = c->omt->nodes+current->left;
        }
        return 0;
    }
    else {
        //Pop the stack till we remove a left child.
+        node_idx parent_idx = omtcursor_stack_peek(c);
+        node_idx child_idx;
        while (c->pathlen>=2) {
-            node_idx child_idx  = omtcursor_stack_pop(c);
-            node_idx parent_idx = omtcursor_stack_peek(c);
+            child_idx  = parent_idx;
+            omtcursor_stack_pop(c);
+            parent_idx = omtcursor_stack_peek(c);
            if (c->omt->nodes[parent_idx].left==child_idx) return 0;
        }
-        goto invalidate;
+        return EINVAL;
    }
-invalidate:
-    toku_omt_cursor_invalidate(c);
-    return EINVAL;
 }

-int toku_omt_cursor_next(OMTCURSOR c, OMTVALUE *v) {
-    if (omtcursor_next_internal(c)) return EINVAL;
-    omtcursor_current_internal(c, v);
-    return 0;
+// Effect: Advance c to the next value and store that value in *v.
+// Returns: 0 on success.  EINVAL if c is not associated with an OMT.  Otherwise the
+//   nonzero result of omtcursor_next_internal, in which case c is invalidated and *v is not written.
+int toku_omt_cursor_next (OMTCURSOR c, OMTVALUE *v) {
+    if (c->omt == NULL) return EINVAL;
+    int r = omtcursor_next_internal(c);
+    if (r == 0) {
+        omtcursor_current_internal(c, v);
+    } else {
+        toku_omt_cursor_invalidate(c);
+    }
+    return r;
 }

-static int omtcursor_prev_internal(OMTCURSOR c) {
-    if (!toku_omt_cursor_is_valid(c)) return EINVAL;
+static inline int omtcursor_prev_internal(OMTCURSOR c) {
    OMT_NODE current = c->omt->nodes+omtcursor_stack_peek(c);
    if (current->left!=NODE_NULL) {
        //Enter into subtree
-        if (omtcursor_stack_push(c, current->left)) goto invalidate;
+        if (omtcursor_stack_push(c, current->left)) return EINVAL;
        current = c->omt->nodes+current->left;
        while (current->right!=NODE_NULL) {
-            if (omtcursor_stack_push(c, current->right)) goto invalidate;
+            if (omtcursor_stack_push(c, current->right)) return EINVAL;
            current = c->omt->nodes+current->right;
        }
        return 0;
    }
    else {
        //Pop the stack till we remove a right child.
+        node_idx parent_idx = omtcursor_stack_peek(c);
+        node_idx child_idx;
        while (c->pathlen>=2) {
-            node_idx child_idx  = omtcursor_stack_pop(c);
-            node_idx parent_idx = omtcursor_stack_peek(c);
+            child_idx  = parent_idx;
+            omtcursor_stack_pop(c);
+            parent_idx = omtcursor_stack_peek(c);
            if (c->omt->nodes[parent_idx].right==child_idx) return 0;
        }
-        goto invalidate;
+        return EINVAL;
    }
-invalidate:
-    toku_omt_cursor_invalidate(c);
-    return EINVAL;
 }

-int toku_omt_cursor_prev(OMTCURSOR c, OMTVALUE *v) {
-    if (omtcursor_prev_internal(c)) return EINVAL;
-    omtcursor_current_internal(c, v);
-    return 0;
+// Effect: Move c back to the previous value and store that value in *v.
+// Returns: 0 on success.  EINVAL if c is not associated with an OMT.  Otherwise the
+//   nonzero result of omtcursor_prev_internal, in which case c is invalidated and *v is not written.
+int toku_omt_cursor_prev (OMTCURSOR c, OMTVALUE *v) {
+    if (c->omt == NULL) return EINVAL;
+    int r = omtcursor_prev_internal(c);
+    if (r == 0) {
+        omtcursor_current_internal(c, v);
+    } else {
+        toku_omt_cursor_invalidate(c);
+    }
+    return r;
 }

-size_t toku_omt_memory_size (OMT omt) {
-    return sizeof(*omt)+omt->node_capacity*sizeof(omt->nodes[0]) + omt->tmparray_size*sizeof(omt->tmparray[0]);
+// Effect: Store in *v the value at the cursor's current position.
+// Returns: 0 on success, EINVAL (leaving *v untouched) if c is not associated with an OMT.
+int toku_omt_cursor_current (OMTCURSOR c, OMTVALUE *v) {
+    if (c->omt != NULL) {
+        omtcursor_current_internal(c, v);
+        return 0;
+    }
+    return EINVAL;
 }

--- a/newbrt/omt-with-o1-cursors/omt.h
+++ b/newbrt/omt-with-o1-cursors/omt.h
-#if !defined(OMT_H)
-#define OMT_H
-
-#ident "Copyright (c) 2008 Tokutek Inc.  All rights reserved."
-
-// Order Maintenance Tree (OMT)
-//
-// Maintains a collection of totally ordered values, where each value has an integer weight.
-// The OMT is a mutable datatype.
-//
-// The Abstraction:
-//
-// An OMT is a vector of values, $V$, where $|V|$ is the length of the vector.
-// The vector is numbered from $0$ to $|V|-1$.
-// Each value has a weight.  The weight of the $i$th element is denoted $w(V_i)$.
-//
-// We can create a new OMT, which is the empty vector.
-//
-// We can insert a new element $x$ into slot $i$, changing $V$ into $V'$ where
-//  $|V'|=1+|V|$       and
-//
-//   V'_j = V_j       if $j<i$
-//          x         if $j=i$
-//          V_{j-1}   if $j>i$.
-//
-// We can specify $i$ using a kind of function instead of as an integer.
-// Let $b$ be a function mapping from values to nonzero integers, such that
-// the signum of $b$ is monotically increasing.
-// We can specify $i$ as the minimum integer such that $b(V_i)>0$.
-//
-// We look up a value using its index, or using a Heaviside function.
-// For lookups, we allow $b$ to be zero for some values, and again the signum of $b$ must be monotonically increasing.
-// When lookup up values, we can look up
-//  $V_i$ where $i$ is the minimum integer such that $b(V_i)=0$.   (With a special return code if no such value exists.)
-//      (Rationale:  Ordinarily we want $i$ to be unique.  But for various reasons we want to allow multiple zeros, and we want the smallest $i$ in that case.)
-//  $V_i$ where $i$ is the minimum integer such that $b(V_i)>0$.   (Or an indication that no such value exists.)
-//  $V_i$ where $i$ is the maximum integer such that $b(V_i)<0$.   (Or an indication that no such value exists.)
-//
-// When looking up a value using a Heaviside function, we get the value and its index.
-//
-// We can also split an OMT into two OMTs, splitting the weight of the values evenly.
-// Find a value $j$ such that the values to the left of $j$ have about the same total weight as the values to the right of $j$.
-// The resulting two OMTs contain the values to the left of $j$ and the values to the right of $j$ respectively.
-// All of the values from the original OMT go into one of the new OMTs.
-// If the weights of the values don't split exactly evenly, then the implementation has the freedom to choose whether
-//  the new left OMT or the new right OMT is larger.
-//
-// Performance:
-//  Insertion and deletion should run with $O(\log |V|)$ time and $O(\log |V|)$ calls to the Heaviside function.
-//  The memory required is O(|V|).
-//
-
-//**********************************************************************
-//* OMT Cursors
-//**********************************************************************
-
-// OMTs also support cursors.   An OMTCURSOR is a  mutable
-// An OMTCURSOR is a mutable object that, at any moment in time, is
-// either associated with a single OMT or is not associated with any
-// OMT.  Many different OMTCURSORs can be associated with a single OMT.
-
-// We say that an OMTCURSOR is *valid* if it is currently
-// associated with an OMT and has an abstract offset assigned to it.
-// An OMTCURSOR that is not valid is said to be invalid.
-
-// Abstractly, an OMTCURSOR simply contains an integer offset of a
-// particular OMTVALUE.   We call this abstract integer the *offset*.
-// Note, however, that the implementation may use a more
-// complex representation in order to obtain higher performance.
-// (Note: A first implementation might use the integer.)
-
-// Given a valid OMTCURSOR, one
-//  * obtain the OMTVALUE at which the integer points in O(1) time,
-//  * increment or decrement the abstract integer (usually quickly.)
-//    The requirements are that the cursor is initialized to a
-//    randomly chosen valid integer, then the integer can be
-//    incremented in O(1) expected time.
-
-// The OMTCURSOR may become invalidated under several conditions:
-//  * Incrementing or decrementing the abstract integer out of its
-//    valid range invalidates the OMTCURSOR.
-//  * If the OMT is modified, it may invalidate the cursor.
-//  * The user of the OMTCURSOR may explicitly invalidate the cursor.
-//  * The OMT is destroyed (in which case the OMTCURSOR is
-//    invalidated, but not destroyed.)
-
-// Implementation Hint:  One way to implement the OMTCURSOR is with an
-// integer.  The problem is that obtaining the value at which the integer
-// points takes O(\log n) time, which is not fast enough to meet the
-// specification.    However, this implementation is probably much
-// faster than our current implementation because it is O(\log n)
-// integer comparisons instead of O(\log n) key comparisons.  This
-// simple implementation may be the right thing for a first cut.
-//
-// To actually achieve the performance requirements, here's a better
-// implementation:   The OMTCURSOR contains a path from root to leaf.
-// Fetching the current value is O(1) time since the leaf is
-// immediately accessible.   Modifying the path to find the next or
-// previous item has O(1) expected time at a randomly chosen valid
-// point
-//
-// The path can be implemented as an array.  It probably makes sense
-// for the array to by dynamically resized as needed.  Since the
-// array's size is O(log n), it is not necessary to ever shrink the
-// array.  Also, from the perspective of testing, it's probably best
-// if the array is initialized to a short length (e.g., length 4) so
-// that the doubling code is actually exercised.
-
-// One way to implement invalidation is for each OMT to maintain a
-// doubly linked list of OMTCURSORs.  When destroying an OMT or
-// changing the OMT's shape, one can simply step through the list
-// invalidating all the OMTCURSORs.
-
-// The list of OMTCURSORs should use the list.h abstraction.  If it's
-// not clear how to use it, Rich can explain it.
-
-// The programming API:
-
-typedef struct omt *OMT;
-
-typedef struct omtcursor *OMTCURSOR;
-
-
-int toku_omt_create (OMT *omtp);
-// Effect: Create an empty OMT.  Stores it in *omtp.
-// Requires: omtp != NULL
-// Returns:
-//   0        success
-//   ENOMEM   out of memory (and doesn't modify *omtp)
-// Performance: constant time.
-
-int toku_omt_create_from_sorted_array(OMT *omtp, OMTVALUE *values, u_int32_t numvalues);
-// Effect: Create a OMT containing values.  The number of values is in numvalues.
-//  Stores the new OMT in *omtp.
-// Requires: omtp != NULL
-// Requires: values != NULL
-// Requires: values is sorted
-// Returns:
-//   0        success
-//   ENOMEM   out of memory (and doesn't modify *omtp)
-// Performance:  time=O(numvalues)
-// Rational:     Normally to insert N values takes O(N lg N) amortized time.
-//               If the N values are known in advance, are sorted, and
-//               the structure is empty, we can batch insert them much faster.
-
-void toku_omt_destroy(OMT *omtp);
-// Effect:  Destroy an OMT, freeing all its memory.
-//   Does not free the OMTVALUEs stored in the OMT.
-//   Those values may be freed before or after calling toku_omt_destroy.
-//   Also sets *omtp=NULL.
-// Requires: omtp != NULL
-// Requires: *omtp != NULL
-// Rationale:  The usage is to do something like
-//   toku_omt_destroy(&s->omt);
-// and now s->omt will have a NULL pointer instead of a dangling freed pointer.
-// Rationale: Returns no values since free() cannot fail.
-// Rationale: Does not free the OMTVALUEs to reduce complexity.
-// Performance:  time=O(toku_omt_size(*omtp))
-
-u_int32_t toku_omt_size(OMT V);
-// Effect: return |V|.
-// Requires: V != NULL
-// Performance:  time=O(1)
-
-int toku_omt_iterate_on_range(OMT omt, u_int32_t left, u_int32_t right, int (*f)(OMTVALUE, u_int32_t, void*), void*v);
-// Effect:  Iterate over the values of the omt, from left to right, calling f on each value.
-//  The second argument passed to f is the index of the value.
-//  The third argument passed to f is v.
-//  The indices run from 0 (inclusive) to toku_omt_size(omt) (exclusive).
-//  We will iterate only over [left,right)
-//
-// Requires: omt != NULL
-// left <= right
-// Requires: f != NULL
-// Returns:
-//  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_omt_iterate.
-//  If f always returns zero, then toku_omt_iterate returns 0.
-// Requires:  Don't modify omt while running.  (E.g., f may not insert or delete values form omt.)
-// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in omt.
-// Rational: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
-
-int toku_omt_iterate(OMT omt, int (*f)(OMTVALUE, u_int32_t, void*), void*v);
-// Effect:  Iterate over the values of the omt, from left to right, calling f on each value.
-//  The second argument passed to f is the index of the value.
-//  The third argument passed to f is v.
-//  The indices run from 0 (inclusive) to toku_omt_size(omt) (exclusive).
-// Requires: omt != NULL
-// Requires: f != NULL
-// Returns:
-//  If f ever returns nonzero, then the iteration stops, and the value returned by f is returned by toku_omt_iterate.
-//  If f always returns zero, then toku_omt_iterate returns 0.
-// Requires:  Don't modify omt while running.  (E.g., f may not insert or delete values form omt.)
-// Performance: time=O(i+\log N) where i is the number of times f is called, and N is the number of elements in omt.
-// Rationale: Although the functional iterator requires defining another function (as opposed to C++ style iterator), it is much easier to read.
-
-int toku_omt_insert_at(OMT omt, OMTVALUE value, u_int32_t index);
-// Effect: Increases indexes of all items at slot >= index by 1.
-//         Insert value into the position at index.
-//
-// Returns:
-//   0         success
-//   ERANGE    if index>toku_omt_size(omt)
-//   ENOMEM
-// On error, omt is unchanged.
-// Performance: time=O(\log N) amortized time.
-// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
-
-int toku_omt_set_at (OMT omt, OMTVALUE value, u_int32_t index);
-// Effect:  Replaces the item at index with value.
-// Returns:
-//   0       success
-//   ERANGE    if index>=toku_omt_size(omt)
-// On error, omt is unchanged.
-// Performance: time=O(\log N)
-// Rationale: The BRT needs to be able to replace a value with another copy of the same value (allocated in a different location)
-
-int toku_omt_insert(OMT omt, OMTVALUE value, int(*h)(OMTVALUE, void*v), void *v, u_int32_t *index);
-// Effect:  Insert value into the OMT.
-//   If there is some i such that $h(V_i, v)=0$ then returns DB_KEYEXIST.
-//   Otherwise, let i be the minimum value such that $h(V_i, v)>0$.
-//      If no such i exists, then let i be |V|
-//   Then this has the same effect as
-//    toku_omt_insert_at(omt, value, i);
-//   If index!=NULL then i is stored in *index
-// Requires:  The signum of h must be monotonically increasing.
-// Returns:
-//    0            success
-//    DB_KEYEXIST  the key is present (h was equal to zero for some value)
-//    ENOMEM      
-// On nonzero return, omt is unchanged.
-// On nonzero non-DB_KEYEXIST return, *index is unchanged.
-// Performance: time=O(\log N) amortized.
-// Rationale: Some future implementation may be O(\log N) worst-case time, but O(\log N) amortized is good enough for now.
-
-int toku_omt_delete_at(OMT omt, u_int32_t index);
-// Effect: Delete the item in slot index.
-//         Decreases indexes of all items at slot >= index by 1.
-// Returns
-//     0            success
-//     ERANGE       if index>=toku_omt_size(omt)
-// On error, omt is unchanged.
-// Rationale: To delete an item, first find its index using toku_omt_find, then delete it.
-// Performance: time=O(\log N) amortized.
-
-int toku_omt_fetch (OMT V, u_int32_t i, OMTVALUE *v, OMTCURSOR c);
-// Effect: Set *v=V_i
-//   If c != NULL then set c's abstract offset to i.
-// Requires: v   != NULL
-// Returns
-//    0             success
-//    ERANGE        if i>=toku_omt_size(V)
-//    ENOMEM        if c!=NULL and we run out of memory
-// On nonzero return, *v is unchanged, and c (if nonnull) is either
-//   invalidated or unchanged.
-// Performance: time=O(\log N)
-// Notes: It is possible that c was previously valid and was
-//   associated with a different OMT.   If c is changed by this
-//   function, the function must remove c's association with the old
-//   OMT, and associate it with the new OMT.
-
-int toku_omt_find_zero(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, OMTVALUE *value, u_int32_t *index, OMTCURSOR c);
-// Effect:  Find the smallest i such that h(V_i, extra)>=0
-//   If c != NULL and there is such an i then set c's abstract offset to i.
-//  If there is such an i and h(V_i,extra)==0 then set *index=i and return 0.
-//  If there is such an i and h(V_i,extra)>0  then set *index=i and return DB_NOTFOUND.
-//  If there is no such i then set *index=toku_omt_size(V), invalidate the cursor (if not NULL), and return DB_NOTFOUND.
-// Requires: index!=NULL
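-// Returns
-//    0             success (some i with h(V_i,extra)==0 was found)
-//    DB_NOTFOUND   no i with h(V_i,extra)==0 exists (*index is still set as described above)
-//    ENOMEM        if c!=NULL and we run out of memory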
-// Performance: time=O(\log N) (calls to h)
-// Notes: It is possible that c was previously valid and was
-//   associated with a different OMT.   If c is changed by this
-//   function, the function must remove c's association with the old
-//   OMT, and associate it with the new OMT.
-// Future directions: the current implementation can be improved, in some cases, by supporting tail recursion.
-//   This would require an additional parameter that represents the current value of the index where the function is recursing,
-//   so that it becomes similar to the way fetch works.
-
-int toku_omt_find(OMT V, int (*h)(OMTVALUE, void*extra), void*extra, int direction, OMTVALUE *value, u_int32_t *index, OMTCURSOR c);
-// Effect:
-//  If direction >0 then find the smallest i such that h(V_i,extra)>0.
-//  If direction <0 then find the largest  i such that h(V_i,extra)<0.
-//  (Direction may not be equal to zero.)
-//  If value!=NULL then store V_i in *value
-//  If index!=NULL then store i in *index.
-//  If c != NULL and there is such an i then set c's abstract offset to i.
-// Requires: The signum of h is monotonically increasing.
-// Performance: time=O(\log N) (calls to h)
-// Returns
-//    0             success
-//    DB_NOTFOUND   no such value is found.
-//    ENOMEM        if c!= NULL and we run out of memory
-// On nonzero return, *value and *index are unchanged, and c (if nonnull) is either
-//   invalidated or unchanged.
-// Notes: It is possible that c was previously valid and was
-//   associated with a different OMT.   If c is changed by this
-//   function, the function must remove c's association with the old
-//   OMT, and associate it with the new OMT.
-// Rationale:
-//   Here's how to use the find function to find various things
-//     Cases for find:
-//      find first value:         ( h(v)=+1, direction=+1 )
-//      find last value           ( h(v)=-1, direction=-1 )
-//      find first X              ( h(v)=(v< x) ? -1 : 1    direction=+1 )
-//      find last X               ( h(v)=(v<=x) ? -1 : 1    direction=-1 )
-//      find X or successor to X  ( same as find first X. )
-//
-// Rationale: To help understand heaviside functions and behavior of find:
-//  There are 7 kinds of heaviside functions.
-//  The signum of the h must be monotonically increasing.
-//  Given a function of the following form, A is the element
-//  returned for direction<0, B is the element returned
-//  for direction>0, C is the element returned for
-//  direction==0 (see find_zero) (with a return of 0), and D is the element
-//  returned for direction==0 (see find_zero) with a return of DB_NOTFOUND.
-//  If any of A, B, or C are not found, then asking for the
-//  associated direction will return DB_NOTFOUND.
-//  See find_zero for more information.
-//  
-//  Let the following represent the signum of the heaviside function.
-//
-//  -...-
-//      A
-//       D
-//
-//  +...+
-//  B
-//  D
-//
-//  0...0
-//  C
-//
-//  -...-0...0
-//      AC
-//
-//  0...0+...+
-//  C    B
-//
-//  -...-+...+
-//      AB
-//       D
-//
-//  -...-0...0+...+
-//      AC    B
-
-
-int toku_omt_split_at(OMT omt, OMT *newomt, u_int32_t index);
-// Effect: Create a new OMT, storing it in *newomt.
-//  The values to the right of index (starting at index) are moved to *newomt.
-// Requires: omt != NULL
-// Requires: newomt != NULL
-// Returns
-//    0             success,
-//    ERANGE        if index > toku_omt_size(omt)
-//    ENOMEM
-// On nonzero return, omt and *newomt are unmodified.
-// Performance: time=O(n)
-// Rationale:  We don't need a split-evenly operation.  We need to split items so that their total sizes
-//  are even, and other similar splitting criteria.  It's easy to split evenly by calling toku_omt_size(), and dividing by two.
- 
-int toku_omt_merge(OMT leftomt, OMT rightomt, OMT *newomt);
-// Effect: Appends leftomt and rightomt to produce a new omt.
-//  Sets *newomt to the new omt.
-//  On success, leftomt and rightomt are destroyed.
-// Returns 0 on success
-//   ENOMEM on out of memory.
-// On error, nothing is modified.
-// Performance: time=O(n) is acceptable, but one can imagine implementations that are O(\log n) worst-case.
-
-void toku_omt_clear(OMT omt);
-// Effect: Set the tree to be empty.
-//  Note: Will not resize the array, since void precludes allowing a malloc.
-// Performance: time=O(1)
-
-int toku_omt_cursor_create (OMTCURSOR *p);
-// Effect: Create an OMTCURSOR.  Stores it in *p.  The OMTCURSOR is
-// initially invalid.
-// Requires: p != NULL
-// Returns:
-//   0        success
-//   ENOMEM   out of memory (and doesn't modify *p)
-// Performance: constant time.
-
-void toku_omt_cursor_destroy (OMTCURSOR *p);
-// Effect:  Invalidates *p (if it is valid) and frees any memory
-// associated with *p.
-//  Also sets *p=NULL.
-// Requires: *p != NULL
-// Rationale:  The usage is to do something like
-//   toku_omt_cursor_destroy(&c);
-// and now c will have a NULL pointer instead of a dangling freed pointer.
-// Rationale: Returns no values since free() cannot fail.
-// Performance:  time=O(1) x #calls to free
-
-int toku_omt_cursor_is_valid (OMTCURSOR c);
-// Effect:  returns 0 iff c is invalid.
-// Performance:  time=O(1)
-
-int toku_omt_cursor_next (OMTCURSOR c, OMTVALUE *v);
-// Effect: Increment c's abstract offset, and store the corresponding value in v.
-// Requires: v != NULL
-// Returns
-//   0 success
-//   EINVAL if the offset goes out of range or c is invalid.
-// On nonzero return, *v is unchanged and c is invalidated.
-// Performance:  time=O(log N) worst case, expected time=O(1) for a randomly
-//  chosen initial position.
-
-int toku_omt_cursor_current (OMTCURSOR c, OMTVALUE *v);
-// Effect: Store in v the value pointed by c's abstract offset
-// Requires: v != NULL
-// Returns
-//  0 success
-//  EINVAL if c is invalid
-// On non-zero return, *v is unchanged
-// Performance: O(1) time
-
-int toku_omt_cursor_prev (OMTCURSOR c, OMTVALUE *v);
-// Effect: Decrement c's abstract offset, and store the corresponding value in v.
-// Requires: v != NULL
-// Returns
-//   0 success
-//   EINVAL if the offset goes out of range or c is invalid.
-// On nonzero return, *v is unchanged and c is invalidated.
-// Performance:  time=O(log N) worst case, expected time=O(1) for a randomly
-//  chosen initial position.
-
-void toku_omt_cursor_invalidate (OMTCURSOR c);
-// Effect: Invalidate c.  (This does not mean that c is destroyed or
-// that its memory is freed.)
-
-// Usage Hint:   The OMTCURSOR is designed to be used inside the
-// BRTcursor.   A BRTcursor includes a pointer to an OMTCURSOR, which
-// is created when the BRTcursor is created.
-//
-// The brt cursor implements its search by first finding a leaf node,
-// containing an OMT.  The BRT then passes its OMTCURSOR into the lookup
-// method (i.e., one of toku_omt_fetch, toku_omt_find_zero,
-// toku_omt_find).  The lookup method, if successful, sets the
-// OMTCURSOR to refer to that element.
-//
-// As long as the OMTCURSOR remains valid, a BRTCURSOR next or prev
-// operation can be implemented using next or prev on the OMTCURSOR.
-//
-// If the OMTCURSOR becomes invalidated, then the BRT must search
-// again from the root of the tree.   The only error that an OMTCURSOR
-// next operation  can raise is that it is invalid.
-//
-// If an element is inserted into the BRT, it may cause an OMTCURSOR
-// to become invalid.  This is especially true if the element will end
-// up in the OMT associated with the cursor.  A simple implementation
-// is to invalidate all OMTCURSORS any time anything is inserted
-// into the BRT.  Since the BRT already contains a list of BRT cursors
-// associated with it, it is straightforward to go through that list
-// and invalidate all the cursors.
-//
-// When the BRT closes a cursor, it destroys the OMTCURSOR.
-
-
-size_t toku_omt_memory_size (OMT omt);
-// Effect: Return the size (in bytes) of the omt, as it resides in main memory.  Don't include any of the OMTVALUES.
-
-#endif  /* #ifndef OMT_H */
-