locktree.c 59.1 KB
Newer Older
Yoni Fogel's avatar
Yoni Fogel committed
1 2 3 4 5
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007-8 Tokutek Inc.  All rights reserved."

#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
6 7 8 9 10 11
/**
   \file  locktree.c
   \brief Lock trees: implementation
*/
  

Yoni Fogel's avatar
Yoni Fogel committed
12 13 14 15
#include <locktree.h>
#include <ydb-internal.h>
#include <brt-internal.h>

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
/* TODO: Yoni should check that all asserts make sense instead of panic,
         and all early returns make sense instead of panic,
         and vice versa. */
/* TODO: During integration, create a db panic function to take care of this.
         The panic function will go in ydb.c.
         We may have to return the panic return code something.
         We know the DB will always return EINVAL afterwards, but
         what is the INITIAL panic return?
         ALSO maybe make ticket, maybe it should be doing DB_RUNRECOVERY after
         instead of EINVAL.
*/
/* TODO: During integration, make sure we first verify the NULL CONSISTENCY,
         (return EINVAL if necessary) before making lock tree calls. */


31
static inline int toku__lt_panic(toku_lock_tree *tree, int r) {
Yoni Fogel's avatar
Yoni Fogel committed
32
    return tree->panic(tree->db, r);
Yoni Fogel's avatar
Yoni Fogel committed
33 34
}
                
35
static inline int toku__lt_callback(toku_lock_tree *tree, DB_TXN* txn) {
Yoni Fogel's avatar
Yoni Fogel committed
36 37 38
    return tree->lock_callback ? tree->lock_callback(txn, tree) : 0;
}

Yoni Fogel's avatar
Yoni Fogel committed
39
const u_int32_t __toku_default_buflen = 2;
Yoni Fogel's avatar
Yoni Fogel committed
40

Yoni Fogel's avatar
Yoni Fogel committed
41 42 43 44 45 46
static const DBT __toku_lt_infinity;
static const DBT __toku_lt_neg_infinity;

const DBT* const toku_lt_infinity     = &__toku_lt_infinity;
const DBT* const toku_lt_neg_infinity = &__toku_lt_neg_infinity;

Yoni Fogel's avatar
Yoni Fogel committed
47 48 49 50 51 52 53
char* toku_lt_strerror(TOKU_LT_ERROR r) {
    if (r >= 0) return strerror(r);
    if (r == TOKU_LT_INCONSISTENT) {
        return "Locking data structures have become inconsistent.\n";
    }
    return "Unknown error in locking data structures.\n";
}
Yoni Fogel's avatar
Yoni Fogel committed
54
/* Compare two payloads assuming that at least one of them is infinite */ 
55
static inline int toku__infinite_compare(const DBT* a, const DBT* b) {
Yoni Fogel's avatar
Yoni Fogel committed
56 57 58 59 60
    if    (a == b)                      return  0;
    if    (a == toku_lt_infinity)       return  1;
    if    (b == toku_lt_infinity)       return -1;
    if    (a == toku_lt_neg_infinity)   return -1;
    assert(b == toku_lt_neg_infinity);  return  1;
Yoni Fogel's avatar
Yoni Fogel committed
61 62
}

63
static inline BOOL toku__lt_is_infinite(const DBT* p) {
Yoni Fogel's avatar
Yoni Fogel committed
64 65 66 67 68 69
    if (p == toku_lt_infinity || p == toku_lt_neg_infinity) {
        DBT* dbt = (DBT*)p;
        assert(!dbt->data && !dbt->size);
        return TRUE;
    }
    return FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
70 71
}

Yoni Fogel's avatar
Yoni Fogel committed
72 73
/* Verifies that NULL data and size are consistent.
   i.e. The size is 0 if and only if the data is NULL. */
74
static inline int toku__lt_verify_null_key(const DBT* key) {
Yoni Fogel's avatar
Yoni Fogel committed
75 76
    if (key && key->size && !key->data) return EINVAL;
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
77 78
}

79
static inline DBT* toku__recreate_DBT(DBT* dbt, void* payload, u_int32_t length) {
Yoni Fogel's avatar
Yoni Fogel committed
80 81 82 83 84 85
    memset(dbt, 0, sizeof(DBT));
    dbt->data = payload;
    dbt->size = length;
    return dbt;
}

Yoni Fogel's avatar
Yoni Fogel committed
86
static inline int toku__lt_txn_cmp(const DB_TXN* a, const DB_TXN* b) {
Yoni Fogel's avatar
Yoni Fogel committed
87 88 89
    return a < b ? -1 : (a != b);
}

Yoni Fogel's avatar
Yoni Fogel committed
90
int toku__lt_point_cmp(const toku_point* x, const toku_point* y) {
Yoni Fogel's avatar
Yoni Fogel committed
91 92 93 94
    int partial_result;
    DBT point_1;
    DBT point_2;

95
    assert(x && y);
Yoni Fogel's avatar
Yoni Fogel committed
96 97
    assert(x->lt);
    assert(x->lt == y->lt);
Yoni Fogel's avatar
Yoni Fogel committed
98

Yoni Fogel's avatar
Yoni Fogel committed
99 100
    if (toku__lt_is_infinite(x->key_payload) ||
        toku__lt_is_infinite(y->key_payload)) {
Yoni Fogel's avatar
Yoni Fogel committed
101 102 103 104 105 106
        /* If either payload is infinite, then:
           - if duplicates are allowed, the data must be the same 
             infinite value. 
           - if duplicates are not allowed, the data is irrelevant
             In either case, we do not have to compare data: the key will
             be the sole determinant of the comparison */
Yoni Fogel's avatar
Yoni Fogel committed
107
        return toku__infinite_compare(x->key_payload, y->key_payload);
Yoni Fogel's avatar
Yoni Fogel committed
108
    }
Yoni Fogel's avatar
Yoni Fogel committed
109
    partial_result = x->lt->compare_fun(x->lt->db,
Yoni Fogel's avatar
Yoni Fogel committed
110 111
                     toku__recreate_DBT(&point_1, x->key_payload, x->key_len),
                     toku__recreate_DBT(&point_2, y->key_payload, y->key_len));
Yoni Fogel's avatar
Yoni Fogel committed
112 113 114 115
    if (partial_result) return partial_result;
    
    if (!x->lt->duplicates) return 0;

Yoni Fogel's avatar
Yoni Fogel committed
116 117 118
    if (toku__lt_is_infinite(x->data_payload) ||
        toku__lt_is_infinite(y->data_payload)) {
        return toku__infinite_compare(x->data_payload, y->data_payload);
Yoni Fogel's avatar
Yoni Fogel committed
119
    }
Yoni Fogel's avatar
Yoni Fogel committed
120
    return x->lt->dup_compare(x->lt->db,
Yoni Fogel's avatar
Yoni Fogel committed
121 122
                   toku__recreate_DBT(&point_1, x->data_payload, x->data_len),
                   toku__recreate_DBT(&point_2, y->data_payload, y->data_len));
Yoni Fogel's avatar
Yoni Fogel committed
123 124
}

Yoni Fogel's avatar
Yoni Fogel committed
125 126
static inline BOOL toku__lt_percent_ranges_free(toku_lock_tree* tree, 
                                                u_int32_t percent) {
Yoni Fogel's avatar
Yoni Fogel committed
127 128
    assert(tree && tree->max_ranges && tree->num_ranges && (percent <= 100));
    u_int64_t max_ranges64 = *tree->max_ranges;
Yoni Fogel's avatar
Yoni Fogel committed
129
    return *tree->num_ranges <= max_ranges64 * (100 - percent) / 100;
Yoni Fogel's avatar
Yoni Fogel committed
130 131
}

Yoni Fogel's avatar
Yoni Fogel committed
132 133
/* Functions to update the range count and compare it with the
   maximum number of ranges */
134
static inline BOOL toku__lt_range_test_incr(toku_lock_tree* tree, u_int32_t replace) {
Yoni Fogel's avatar
Yoni Fogel committed
135 136 137
    assert(tree);
    assert(tree->num_ranges);
    assert(replace <= *tree->num_ranges);
Yoni Fogel's avatar
Yoni Fogel committed
138
    return *tree->num_ranges - replace < *tree->max_ranges;
Yoni Fogel's avatar
Yoni Fogel committed
139
}
Yoni Fogel's avatar
Yoni Fogel committed
140

141
static inline void toku__lt_range_incr(toku_lock_tree* tree, u_int32_t replace) {
Yoni Fogel's avatar
Yoni Fogel committed
142
    assert(toku__lt_range_test_incr(tree, replace));
Yoni Fogel's avatar
Yoni Fogel committed
143 144 145 146
    *tree->num_ranges -= replace;
    *tree->num_ranges += 1;
}

147
static inline void toku__lt_range_decr(toku_lock_tree* tree, u_int32_t ranges) {
Yoni Fogel's avatar
Yoni Fogel committed
148 149 150 151 152 153
    assert(tree);
    assert(tree->num_ranges);
    assert(*tree->num_ranges >= ranges);
    *tree->num_ranges -= ranges;
}

154
static inline void toku__p_free(toku_lock_tree* tree, toku_point* point) {
Yoni Fogel's avatar
Yoni Fogel committed
155
    assert(point);
Yoni Fogel's avatar
Yoni Fogel committed
156
    if (!toku__lt_is_infinite(point->key_payload)) {
Yoni Fogel's avatar
Yoni Fogel committed
157
        tree->free(point->key_payload);
Yoni Fogel's avatar
Yoni Fogel committed
158
    }
Yoni Fogel's avatar
Yoni Fogel committed
159
    if (!toku__lt_is_infinite(point->data_payload)) {
Yoni Fogel's avatar
Yoni Fogel committed
160
        tree->free(point->data_payload);
Yoni Fogel's avatar
Yoni Fogel committed
161
    }
Yoni Fogel's avatar
Yoni Fogel committed
162
    tree->free(point);
Yoni Fogel's avatar
Yoni Fogel committed
163 164
}

Yoni Fogel's avatar
Yoni Fogel committed
165 166 167
/*
   Allocate and copy the payload.
*/
168
static inline int toku__payload_copy(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
169
                               void** payload_out, u_int32_t* len_out,
Yoni Fogel's avatar
Yoni Fogel committed
170
                               void*  payload_in,  u_int32_t  len_in) {
Yoni Fogel's avatar
Yoni Fogel committed
171
    assert(payload_out && len_out);
Yoni Fogel's avatar
Yoni Fogel committed
172
    if (!len_in) {
Yoni Fogel's avatar
Yoni Fogel committed
173
        assert(!payload_in || toku__lt_is_infinite(payload_in));
Yoni Fogel's avatar
Yoni Fogel committed
174
        *payload_out = payload_in;
Yoni Fogel's avatar
Yoni Fogel committed
175
        *len_out     = len_in;
Yoni Fogel's avatar
Yoni Fogel committed
176 177
    }
    else {
Yoni Fogel's avatar
Yoni Fogel committed
178
        assert(payload_in);
Yoni Fogel's avatar
Yoni Fogel committed
179
        *payload_out = tree->malloc((size_t)len_in);
Yoni Fogel's avatar
Yoni Fogel committed
180
        if (!*payload_out) return errno;
Yoni Fogel's avatar
Yoni Fogel committed
181
        *len_out     = len_in;
Yoni Fogel's avatar
Yoni Fogel committed
182
        memcpy(*payload_out, payload_in, (size_t)len_in);
Yoni Fogel's avatar
Yoni Fogel committed
183
    }
Yoni Fogel's avatar
Yoni Fogel committed
184
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
185
}
Yoni Fogel's avatar
Yoni Fogel committed
186

187
static inline int toku__p_makecopy(toku_lock_tree* tree, toku_point** ppoint) {
Yoni Fogel's avatar
Yoni Fogel committed
188
    assert(ppoint);
189
    toku_point*     point      = *ppoint;
Yoni Fogel's avatar
Yoni Fogel committed
190
    toku_point*     temp_point = NULL;
Yoni Fogel's avatar
Yoni Fogel committed
191 192
    int r;

Yoni Fogel's avatar
Yoni Fogel committed
193
    temp_point = (toku_point*)tree->malloc(sizeof(toku_point));
Yoni Fogel's avatar
Yoni Fogel committed
194
    if (0) {
Yoni Fogel's avatar
Yoni Fogel committed
195
        died1: tree->free(temp_point); return r; }
Yoni Fogel's avatar
Yoni Fogel committed
196
    if (!temp_point) return errno;
Yoni Fogel's avatar
Yoni Fogel committed
197
    memcpy(temp_point, point, sizeof(toku_point));
Yoni Fogel's avatar
Yoni Fogel committed
198

Yoni Fogel's avatar
Yoni Fogel committed
199
    r = toku__payload_copy(tree,
Yoni Fogel's avatar
Yoni Fogel committed
200
                            &temp_point->key_payload, &temp_point->key_len,
Yoni Fogel's avatar
Yoni Fogel committed
201 202 203
                                  point->key_payload,       point->key_len);
    if (0) {
        died2:
Yoni Fogel's avatar
Yoni Fogel committed
204
        if (!toku__lt_is_infinite(temp_point->key_payload)) {
Yoni Fogel's avatar
Yoni Fogel committed
205
            tree->free(temp_point->key_payload); }
Yoni Fogel's avatar
Yoni Fogel committed
206
        goto died1; }
Yoni Fogel's avatar
Yoni Fogel committed
207
    if (r!=0) goto died1;
Yoni Fogel's avatar
Yoni Fogel committed
208
    toku__payload_copy(tree,
Yoni Fogel's avatar
Yoni Fogel committed
209
                        &temp_point->data_payload, &temp_point->data_len,
Yoni Fogel's avatar
Yoni Fogel committed
210 211
                              point->data_payload,       point->data_len);
    if (r!=0) goto died2;
Yoni Fogel's avatar
Yoni Fogel committed
212 213
    *ppoint = temp_point;
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
214 215
}

Yoni Fogel's avatar
Yoni Fogel committed
216 217
/* Provides access to a selfread tree for a particular transaction.
   Returns NULL if it does not exist yet. */
Yoni Fogel's avatar
Yoni Fogel committed
218
toku_range_tree* toku__lt_ifexist_selfread(toku_lock_tree* tree, DB_TXN* txn) {
Yoni Fogel's avatar
Yoni Fogel committed
219
    assert(tree && txn);
Yoni Fogel's avatar
Yoni Fogel committed
220 221
    toku_rt_forest* forest = toku_rth_find(tree->rth, txn);
    return forest ? forest->self_read : NULL;
Yoni Fogel's avatar
Yoni Fogel committed
222 223 224
}

/* Provides access to a selfwrite tree for a particular transaction.
Yoni Fogel's avatar
Yoni Fogel committed
225
   Returns NULL if it does not exist yet. */
Yoni Fogel's avatar
Yoni Fogel committed
226
toku_range_tree* toku__lt_ifexist_selfwrite(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
227 228 229 230
                                             DB_TXN* txn) {
    assert(tree && txn);
    toku_rt_forest* forest = toku_rth_find(tree->rth, txn);
    return forest ? forest->self_write : NULL;
Yoni Fogel's avatar
Yoni Fogel committed
231 232 233 234
}

/* Provides access to a selfread tree for a particular transaction.
   Creates it if it does not exist. */
235
static inline int toku__lt_selfread(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
236
                              toku_range_tree** pselfread) {
Yoni Fogel's avatar
Yoni Fogel committed
237
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
238
    assert(tree && txn && pselfread);
Yoni Fogel's avatar
Yoni Fogel committed
239

Yoni Fogel's avatar
Yoni Fogel committed
240 241
    toku_rt_forest* forest = toku_rth_find(tree->rth, txn);
    if (!forest) {
Yoni Fogel's avatar
Yoni Fogel committed
242
        /* Neither selfread nor selfwrite exist. */
Yoni Fogel's avatar
Yoni Fogel committed
243 244 245 246 247 248 249
        r = toku_rth_insert(tree->rth, txn);
        if (r!=0) return r;
        forest = toku_rth_find(tree->rth, txn);
    }
    assert(forest);
    if (!forest->self_read) {
        r = toku_rt_create(&forest->self_read,
Yoni Fogel's avatar
Yoni Fogel committed
250
                           toku__lt_point_cmp, toku__lt_txn_cmp,
Yoni Fogel's avatar
Yoni Fogel committed
251 252 253 254 255 256 257
                           FALSE,
                           tree->malloc, tree->free, tree->realloc);
        if (r!=0) return r;
        assert(forest->self_read);
    }
    *pselfread = forest->self_read;
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
258 259
}

Yoni Fogel's avatar
Yoni Fogel committed
260 261
/* Provides access to a selfwrite tree for a particular transaction.
   Creates it if it does not exist. */
262
static inline int toku__lt_selfwrite(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
263 264 265 266 267 268 269 270 271 272 273 274 275
                               toku_range_tree** pselfwrite) {
    int r;
    assert(tree && txn && pselfwrite);

    toku_rt_forest* forest = toku_rth_find(tree->rth, txn);
    if (!forest) {
        r = toku_rth_insert(tree->rth, txn);
        if (r!=0) return r;
        forest = toku_rth_find(tree->rth, txn);
    }
    assert(forest);
    if (!forest->self_write) {
        r = toku_rt_create(&forest->self_write,
Yoni Fogel's avatar
Yoni Fogel committed
276
                           toku__lt_point_cmp, toku__lt_txn_cmp,
Yoni Fogel's avatar
Yoni Fogel committed
277 278 279 280 281 282
                           FALSE,
                           tree->malloc, tree->free, tree->realloc);
        if (r!=0) return r;
        assert(forest->self_write);
    }
    *pselfwrite = forest->self_write;
Yoni Fogel's avatar
Yoni Fogel committed
283
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
284 285
}

Yoni Fogel's avatar
Yoni Fogel committed
286

Yoni Fogel's avatar
Yoni Fogel committed
287 288 289 290 291 292
static inline BOOL toku__dominated(toku_range* query, toku_range* by) {
    assert(query && by);
    return (toku__lt_point_cmp(query->left,  by->left) >= 0 &&
            toku__lt_point_cmp(query->right, by->right) <= 0);
}

Yoni Fogel's avatar
Yoni Fogel committed
293 294 295 296 297
/*
    This function only supports non-overlapping trees.
    Uses the standard definition of dominated from the design document.
    Determines whether 'query' is dominated by 'rt'.
*/
298
static inline int toku__lt_rt_dominates(toku_lock_tree* tree, toku_range* query,
Yoni Fogel's avatar
Yoni Fogel committed
299
                                  toku_range_tree* rt, BOOL* dominated) {
Yoni Fogel's avatar
Yoni Fogel committed
300
    assert(tree && query && dominated);
Yoni Fogel's avatar
Yoni Fogel committed
301 302 303 304 305
    if (!rt) {
        *dominated = FALSE;
        return 0;
    }
    
Yoni Fogel's avatar
Yoni Fogel committed
306 307 308 309 310 311 312
    BOOL            allow_overlaps;
    const u_int32_t query_size = 1;
    toku_range      buffer[query_size];
    u_int32_t       buflen     = query_size;
    toku_range*     buf        = &buffer[0];
    u_int32_t       numfound;
    int             r;
Yoni Fogel's avatar
Yoni Fogel committed
313

Yoni Fogel's avatar
Yoni Fogel committed
314 315 316 317 318
    /* Sanity check. (Function only supports non-overlap range trees.) */
    r = toku_rt_get_allow_overlaps(rt, &allow_overlaps);
    if (r!=0) return r;
    assert(!allow_overlaps);

Yoni Fogel's avatar
Yoni Fogel committed
319
    r = toku_rt_find(rt, query, query_size, &buf, &buflen, &numfound);
Yoni Fogel's avatar
Yoni Fogel committed
320 321 322 323 324
    if (r!=0) return r;
    if (numfound == 0) {
        *dominated = FALSE;
        return 0;
    }
Yoni Fogel's avatar
Yoni Fogel committed
325
    assert(numfound == 1);
Yoni Fogel's avatar
Yoni Fogel committed
326
    *dominated = toku__dominated(query, &buf[0]);
Yoni Fogel's avatar
Yoni Fogel committed
327 328 329
    return 0;
}

Yoni Fogel's avatar
Yoni Fogel committed
330 331 332 333 334 335 336 337 338 339 340 341
typedef enum
       {TOKU_NO_CONFLICT, TOKU_MAYBE_CONFLICT, TOKU_YES_CONFLICT} toku_conflict;
/*
    This function checks for conflicts in the borderwrite tree.
    If no range overlaps, there is no conflict.
    If >= 2 ranges overlap the query then, by definition of borderwrite,
    at least one overlapping regions must not be 'self'. Design document
    explains why this MUST cause a conflict.
    If exactly one range overlaps and its data == self, there is no conflict.
    If exactly one range overlaps and its data != self, there might be a
    conflict.  We need to check the 'peer'write table to verify.
*/
342
static inline int toku__lt_borderwrite_conflict(toku_lock_tree* tree, DB_TXN* self,
Yoni Fogel's avatar
Yoni Fogel committed
343 344 345 346 347
                                       toku_range* query,
                                       toku_conflict* conflict, DB_TXN** peer) {
    assert(tree && self && query && conflict && peer);
    toku_range_tree* rt = tree->borderwrite;
    assert(rt);
Yoni Fogel's avatar
Yoni Fogel committed
348

Yoni Fogel's avatar
Yoni Fogel committed
349
    const u_int32_t query_size = 2;
Yoni Fogel's avatar
Yoni Fogel committed
350
    toku_range   buffer[query_size];
Yoni Fogel's avatar
Yoni Fogel committed
351
    u_int32_t     buflen     = query_size;
Yoni Fogel's avatar
Yoni Fogel committed
352
    toku_range*  buf        = &buffer[0];
Yoni Fogel's avatar
Yoni Fogel committed
353
    u_int32_t     numfound;
Yoni Fogel's avatar
Yoni Fogel committed
354 355 356
    int          r;

    r = toku_rt_find(rt, query, query_size, &buf, &buflen, &numfound);
Yoni Fogel's avatar
Yoni Fogel committed
357
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
358
    assert(numfound <= query_size);
Yoni Fogel's avatar
Yoni Fogel committed
359
    *peer = NULL;
Yoni Fogel's avatar
Yoni Fogel committed
360 361
    if      (numfound == 2) *conflict = TOKU_YES_CONFLICT;
    else if (numfound == 0 || buf[0].data == self) *conflict = TOKU_NO_CONFLICT;
Yoni Fogel's avatar
Yoni Fogel committed
362
    else {
Yoni Fogel's avatar
Yoni Fogel committed
363 364
        *conflict = TOKU_MAYBE_CONFLICT;
        *peer = buf[0].data;
Yoni Fogel's avatar
Yoni Fogel committed
365 366 367 368 369
    }
    return 0;
}

/*
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
370 371 372
    Determines whether 'query' meets 'rt'.
    This function supports only non-overlapping trees with homogeneous 
    transactions, i.e., a selfwrite or selfread table only.
Yoni Fogel's avatar
Yoni Fogel committed
373 374 375
    Uses the standard definition of 'query' meets 'tree' at 'data' from the
    design document.
*/
376
static inline int toku__lt_meets(toku_lock_tree* tree, toku_range* query, 
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
377 378
                           toku_range_tree* rt, BOOL* met) {
    assert(tree && query && rt && met);
Yoni Fogel's avatar
Yoni Fogel committed
379
    const u_int32_t query_size = 1;
Yoni Fogel's avatar
Yoni Fogel committed
380
    toku_range   buffer[query_size];
Yoni Fogel's avatar
Yoni Fogel committed
381
    u_int32_t     buflen     = query_size;
Yoni Fogel's avatar
Yoni Fogel committed
382
    toku_range*  buf        = &buffer[0];
Yoni Fogel's avatar
Yoni Fogel committed
383
    u_int32_t     numfound;
Yoni Fogel's avatar
Yoni Fogel committed
384 385
    int          r;
    BOOL         allow_overlaps;
Yoni Fogel's avatar
Yoni Fogel committed
386 387 388 389 390 391

    /* Sanity check. (Function only supports non-overlap range trees.) */
    r = toku_rt_get_allow_overlaps(rt, &allow_overlaps);
    if (r!=0) return r;
    assert(!allow_overlaps);

Yoni Fogel's avatar
Yoni Fogel committed
392
    r = toku_rt_find(rt, query, query_size, &buf, &buflen, &numfound);
Yoni Fogel's avatar
Yoni Fogel committed
393
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
394
    assert(numfound <= query_size);
Yoni Fogel's avatar
Yoni Fogel committed
395 396 397 398
    *met = numfound != 0;
    return 0;
}

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
399 400
/* 
    Determines whether 'query' meets 'rt' at txn2 not equal to txn.
Yoni Fogel's avatar
Yoni Fogel committed
401 402
    This function supports all range trees, but queries must either be a single point,
    or the range tree is homogenous.
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
403 404 405
    Uses the standard definition of 'query' meets 'tree' at 'data' from the
    design document.
*/
406
static inline int toku__lt_meets_peer(toku_lock_tree* tree, toku_range* query, 
Yoni Fogel's avatar
Yoni Fogel committed
407 408
                                       toku_range_tree* rt, BOOL is_homogenous,
                                       DB_TXN* self, BOOL* met) {
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
409
    assert(tree && query && rt && self && met);
Yoni Fogel's avatar
Yoni Fogel committed
410
    assert(query->left == query->right || is_homogenous);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
411

Yoni Fogel's avatar
Yoni Fogel committed
412 413 414
    const u_int32_t query_size = is_homogenous ? 1 : 2;
    toku_range   buffer[2];
    u_int32_t    buflen     = query_size;
Yoni Fogel's avatar
Yoni Fogel committed
415
    toku_range*  buf        = &buffer[0];
Yoni Fogel's avatar
Yoni Fogel committed
416
    u_int32_t    numfound;
Yoni Fogel's avatar
Yoni Fogel committed
417
    int          r;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
418

Yoni Fogel's avatar
Yoni Fogel committed
419
    r = toku_rt_find(rt, query, query_size, &buf, &buflen, &numfound);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
420
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
421
    assert(numfound <= query_size);
Yoni Fogel's avatar
Yoni Fogel committed
422
    *met = numfound == 2 || (numfound == 1 && buf[0].data != self);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
423 424 425
    return 0;
}

Yoni Fogel's avatar
Yoni Fogel committed
426 427 428 429
/*
    Utility function to implement: (from design document)
    if K meets E at v'!=t and K meets W_v' then return failure.
*/
430
static inline int toku__lt_check_borderwrite_conflict(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
431 432 433 434 435 436 437
                                               DB_TXN* txn, toku_range* query) {
    assert(tree && txn && query);
    toku_conflict conflict;
    DB_TXN* peer;
    toku_range_tree* peer_selfwrite;
    int r;
    
Yoni Fogel's avatar
Yoni Fogel committed
438
    r = toku__lt_borderwrite_conflict(tree, txn, query, &conflict, &peer);
Yoni Fogel's avatar
Yoni Fogel committed
439 440 441
    if (r!=0) return r;
    if (conflict == TOKU_MAYBE_CONFLICT) {
        assert(peer);
Yoni Fogel's avatar
Yoni Fogel committed
442 443
        peer_selfwrite = toku__lt_ifexist_selfwrite(tree, peer);
        if (!peer_selfwrite) return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);
Yoni Fogel's avatar
Yoni Fogel committed
444 445

        BOOL met;
Yoni Fogel's avatar
Yoni Fogel committed
446
        r = toku__lt_meets(tree, query, peer_selfwrite, &met);
Yoni Fogel's avatar
Yoni Fogel committed
447
        if (r!=0)   return r;
Yoni Fogel's avatar
Yoni Fogel committed
448
        conflict = met ? TOKU_YES_CONFLICT : TOKU_NO_CONFLICT;
Yoni Fogel's avatar
Yoni Fogel committed
449
    }
Yoni Fogel's avatar
Yoni Fogel committed
450
    if    (conflict == TOKU_YES_CONFLICT) return DB_LOCK_NOTGRANTED;
Yoni Fogel's avatar
Yoni Fogel committed
451
    assert(conflict == TOKU_NO_CONFLICT);
Yoni Fogel's avatar
Yoni Fogel committed
452 453 454
    return 0;
}

455
static inline void toku__payload_from_dbt(void** payload, u_int32_t* len,
456
                                           const DBT* dbt) {
Yoni Fogel's avatar
Yoni Fogel committed
457
    assert(payload && len && dbt);
Yoni Fogel's avatar
Yoni Fogel committed
458
    if (toku__lt_is_infinite(dbt)) *payload = (void*)dbt;
Yoni Fogel's avatar
Yoni Fogel committed
459 460 461 462 463
    else if (!dbt->size) {
        *payload = NULL;
        *len     = 0;
    } else {
        assert(dbt->data);
Yoni Fogel's avatar
Yoni Fogel committed
464
        *payload = dbt->data;
Yoni Fogel's avatar
Yoni Fogel committed
465
        *len     = dbt->size;
Yoni Fogel's avatar
Yoni Fogel committed
466 467 468
    }
}

469
static inline void toku__init_point(toku_point* point, toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
470
                              const DBT* key, const DBT* data) {
Yoni Fogel's avatar
Yoni Fogel committed
471
    assert(point && tree && key);
Yoni Fogel's avatar
Yoni Fogel committed
472
    assert(!tree->duplicates == !data);
Yoni Fogel's avatar
Yoni Fogel committed
473 474
    memset(point, 0, sizeof(toku_point));
    point->lt = tree;
Yoni Fogel's avatar
Yoni Fogel committed
475

Yoni Fogel's avatar
Yoni Fogel committed
476
    toku__payload_from_dbt(&point->key_payload, &point->key_len, key);
Yoni Fogel's avatar
Yoni Fogel committed
477 478
    if (tree->duplicates) {
        assert(data);
Yoni Fogel's avatar
Yoni Fogel committed
479
        toku__payload_from_dbt(&point->data_payload, &point->data_len, data);
Yoni Fogel's avatar
Yoni Fogel committed
480 481 482 483 484 485 486 487
    }
    else {
        assert(data == NULL);
        point->data_payload = NULL;
        point->data_len     = 0;
    }
}

488
static inline void toku__init_query(toku_range* query,
Yoni Fogel's avatar
Yoni Fogel committed
489 490 491 492 493 494
                              toku_point* left, toku_point* right) {
    query->left  = left;
    query->right = right;
    query->data  = NULL;
}

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
495 496 497 498 499 500 501 502 503 504 505 506 507
/*
    Memory ownership: 
     - to_insert we own (it's static)
     - to_insert.left, .right are toku_point's, and we own them.
       If we have consolidated, we own them because we had allocated
       them earlier, but
       if we have not consolidated we need to gain ownership now: 
       we will gain ownership by copying all payloads and 
       allocating the points. 
     - to_insert.{left,right}.{key_payload, data_payload} are owned by lt,
       we made copies from the DB at consolidation time 
*/

508
static inline void toku__init_insert(toku_range* to_insert,
Yoni Fogel's avatar
Yoni Fogel committed
509 510 511 512 513 514 515
                               toku_point* left, toku_point* right,
                               DB_TXN* txn) {
    to_insert->left  = left;
    to_insert->right = right;
    to_insert->data  = txn;
}

Yoni Fogel's avatar
Yoni Fogel committed
516 517
/* Returns whether the point already exists
   as an endpoint of the given range. */
518
static inline BOOL toku__lt_p_independent(toku_point* point, toku_range* range) {
Yoni Fogel's avatar
Yoni Fogel committed
519 520 521 522
    assert(point && range);
    return point != range->left && point != range->right;
}

523
static inline int toku__lt_extend_extreme(toku_lock_tree* tree,toku_range* to_insert,
Yoni Fogel's avatar
Yoni Fogel committed
524
                                    BOOL* alloc_left, BOOL* alloc_right,
Yoni Fogel's avatar
Yoni Fogel committed
525
                                    u_int32_t numfound) {
Yoni Fogel's avatar
Yoni Fogel committed
526
    assert(to_insert && tree && alloc_left && alloc_right);
Yoni Fogel's avatar
Yoni Fogel committed
527
    u_int32_t i;
Yoni Fogel's avatar
Yoni Fogel committed
528
    assert(numfound <= tree->buflen);
Yoni Fogel's avatar
Yoni Fogel committed
529 530 531
    for (i = 0; i < numfound; i++) {
        int c;
        /* Find the extreme left end-point among overlapping ranges */
Yoni Fogel's avatar
Yoni Fogel committed
532
        if ((c = toku__lt_point_cmp(tree->buf[i].left, to_insert->left))
Yoni Fogel's avatar
Yoni Fogel committed
533
            <= 0) {
Yoni Fogel's avatar
Yoni Fogel committed
534
            if ((!*alloc_left && c == 0) ||
Yoni Fogel's avatar
Yoni Fogel committed
535 536
                !toku__lt_p_independent(tree->buf[i].left, to_insert)) {
                return toku__lt_panic(tree, TOKU_LT_INCONSISTENT); }
Yoni Fogel's avatar
Yoni Fogel committed
537 538 539 540
            *alloc_left      = FALSE;
            to_insert->left  = tree->buf[i].left;
        }
        /* Find the extreme right end-point */
Yoni Fogel's avatar
Yoni Fogel committed
541
        if ((c = toku__lt_point_cmp(tree->buf[i].right, to_insert->right))
Yoni Fogel's avatar
Yoni Fogel committed
542
            >= 0) {
Yoni Fogel's avatar
Yoni Fogel committed
543 544 545 546
            if ((!*alloc_right && c == 0) ||
                (tree->buf[i].right == to_insert->left &&
                 tree->buf[i].left  != to_insert->left) ||
                 tree->buf[i].right == to_insert->right) {
Yoni Fogel's avatar
Yoni Fogel committed
547
                return toku__lt_panic(tree, TOKU_LT_INCONSISTENT); }
Yoni Fogel's avatar
Yoni Fogel committed
548 549 550 551
            *alloc_right     = FALSE;
            to_insert->right = tree->buf[i].right;
        }
    }
Yoni Fogel's avatar
Yoni Fogel committed
552
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
553 554
}

555
static inline int toku__lt_alloc_extreme(toku_lock_tree* tree, toku_range* to_insert,
Yoni Fogel's avatar
Yoni Fogel committed
556 557 558 559 560
                                   BOOL alloc_left, BOOL* alloc_right) {
    assert(to_insert && alloc_right);
    BOOL copy_left = FALSE;
    int r;
    
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
561 562
    /* The pointer comparison may speed up the evaluation in some cases, 
       but it is not strictly needed */
Yoni Fogel's avatar
Yoni Fogel committed
563
    if (alloc_left && alloc_right &&
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
564
        (to_insert->left == to_insert->right ||
Yoni Fogel's avatar
Yoni Fogel committed
565
         toku__lt_point_cmp(to_insert->left, to_insert->right) == 0)) {
Yoni Fogel's avatar
Yoni Fogel committed
566 567 568
        *alloc_right = FALSE;
        copy_left    = TRUE;
    }
Yoni Fogel's avatar
Yoni Fogel committed
569

Yoni Fogel's avatar
Yoni Fogel committed
570
    if (alloc_left) {
Yoni Fogel's avatar
Yoni Fogel committed
571
        r = toku__p_makecopy(tree, &to_insert->left);
Yoni Fogel's avatar
Yoni Fogel committed
572
        if (0) { died1:
Yoni Fogel's avatar
Yoni Fogel committed
573
            if (alloc_left) toku__p_free(tree, to_insert->left); return r; }
Yoni Fogel's avatar
Yoni Fogel committed
574 575 576 577
        if (r!=0) return r;
    }
    if (*alloc_right) {
        assert(!copy_left);
Yoni Fogel's avatar
Yoni Fogel committed
578
        r = toku__p_makecopy(tree, &to_insert->right);
Yoni Fogel's avatar
Yoni Fogel committed
579 580 581 582 583 584
        if (r!=0) goto died1;
    }
    else if (copy_left) to_insert->right = to_insert->left;
    return 0;
}

585
static inline int toku__lt_delete_overlapping_ranges(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
586
                                               toku_range_tree* rt,
Yoni Fogel's avatar
Yoni Fogel committed
587
                                               u_int32_t numfound) {
Yoni Fogel's avatar
Yoni Fogel committed
588 589
    assert(tree && rt);
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
590
    u_int32_t i;
Yoni Fogel's avatar
Yoni Fogel committed
591
    assert(numfound <= tree->buflen);
Yoni Fogel's avatar
Yoni Fogel committed
592 593
    for (i = 0; i < numfound; i++) {
        r = toku_rt_delete(rt, &tree->buf[i]);
Yoni Fogel's avatar
Yoni Fogel committed
594
        if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
595
    }
Yoni Fogel's avatar
Yoni Fogel committed
596
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
597 598
}

599
static inline int toku__lt_free_points(toku_lock_tree* tree, toku_range* to_insert,
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
600
                                  u_int32_t numfound, toku_range_tree *rt) {
Yoni Fogel's avatar
Yoni Fogel committed
601
    assert(tree && to_insert);
Yoni Fogel's avatar
Yoni Fogel committed
602
    assert(numfound <= tree->buflen);
Yoni Fogel's avatar
Yoni Fogel committed
603

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
604
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
605
    u_int32_t i;
Yoni Fogel's avatar
Yoni Fogel committed
606
    for (i = 0; i < numfound; i++) {
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
607
        if (rt != NULL) {
Yoni Fogel's avatar
Yoni Fogel committed
608
            r = toku_rt_delete(rt, &tree->buf[i]);
Yoni Fogel's avatar
Yoni Fogel committed
609
            if (r!=0) return toku__lt_panic(tree, r);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
610
        }
Yoni Fogel's avatar
Yoni Fogel committed
611 612 613
        /*
           We will maintain the invariant: (separately for read and write
           environments)
Yoni Fogel's avatar
Yoni Fogel committed
614
           (toku__lt_point_cmp(a, b) == 0 && a.txn == b.txn) => a == b
Yoni Fogel's avatar
Yoni Fogel committed
615 616 617
        */
        /* Do not double-free. */
        if (tree->buf[i].right != tree->buf[i].left &&
Yoni Fogel's avatar
Yoni Fogel committed
618 619
            toku__lt_p_independent(tree->buf[i].right, to_insert)) {
            toku__p_free(tree, tree->buf[i].right);
Yoni Fogel's avatar
Yoni Fogel committed
620
        }
Yoni Fogel's avatar
Yoni Fogel committed
621 622
        if (toku__lt_p_independent(tree->buf[i].left,  to_insert)) {
            toku__p_free(tree, tree->buf[i].left);
Yoni Fogel's avatar
Yoni Fogel committed
623 624
        }
    }
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
625
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
626 627
}

Yoni Fogel's avatar
Yoni Fogel committed
628 629 630
/* TODO: query should be made from the to_insert instead of a parameter. */
/* TODO: toku_query should be an object.  toku_range would contain a query and a transaction. */
/* TODO: Toku error codes, i.e. get rid of the extra parameter for (ran out of locks) */
Yoni Fogel's avatar
Yoni Fogel committed
631
/* Consolidate the new range and all the overlapping ranges */
632
static inline int toku__consolidate(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
633 634
                                    toku_range* query, toku_range* to_insert,
                                    DB_TXN* txn, BOOL* out_of_locks) {
Yoni Fogel's avatar
Yoni Fogel committed
635 636 637 638
    int r;
    BOOL             alloc_left    = TRUE;
    BOOL             alloc_right   = TRUE;
    toku_range_tree* selfread;
Yoni Fogel's avatar
Yoni Fogel committed
639 640
    assert(tree && to_insert && txn && out_of_locks);
    *out_of_locks = FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
641
#if !defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
642 643
    toku_range_tree* mainread      = tree->mainread;
    assert(mainread);
Yoni Fogel's avatar
Yoni Fogel committed
644
#endif
Yoni Fogel's avatar
Yoni Fogel committed
645
    /* Find the self read tree */
Yoni Fogel's avatar
Yoni Fogel committed
646
    r = toku__lt_selfread(tree, txn, &selfread);
Yoni Fogel's avatar
Yoni Fogel committed
647 648 649
    if (r!=0) return r;
    assert(selfread);
    /* Find all overlapping ranges in the self-read */
Yoni Fogel's avatar
Yoni Fogel committed
650
    u_int32_t numfound;
Yoni Fogel's avatar
Yoni Fogel committed
651 652 653
    r = toku_rt_find(selfread, query, 0, &tree->buf, &tree->buflen,
                     &numfound);
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
654
    assert(numfound <= tree->buflen);
Yoni Fogel's avatar
Yoni Fogel committed
655
    /* Find the extreme left and right point of the consolidated interval */
Yoni Fogel's avatar
Yoni Fogel committed
656
    r = toku__lt_extend_extreme(tree, to_insert, &alloc_left, &alloc_right,
Yoni Fogel's avatar
Yoni Fogel committed
657
                                numfound);
Yoni Fogel's avatar
Yoni Fogel committed
658
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
659 660 661 662
    if (!toku__lt_range_test_incr(tree, numfound)) {
        *out_of_locks = TRUE;
        return 0;
    }
Yoni Fogel's avatar
Yoni Fogel committed
663
    /* Allocate the consolidated range */
Yoni Fogel's avatar
Yoni Fogel committed
664
    r = toku__lt_alloc_extreme(tree, to_insert, alloc_left, &alloc_right);
Yoni Fogel's avatar
Yoni Fogel committed
665
    if (0) { died1:
Yoni Fogel's avatar
Yoni Fogel committed
666 667
        if (alloc_left)  toku__p_free(tree, to_insert->left);
        if (alloc_right) toku__p_free(tree, to_insert->right); return r; }
Yoni Fogel's avatar
Yoni Fogel committed
668
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
669
    /* From this point on we have to panic if we cannot finish. */
Yoni Fogel's avatar
Yoni Fogel committed
670
    /* Delete overlapping ranges from selfread ... */
Yoni Fogel's avatar
Yoni Fogel committed
671 672
    r = toku__lt_delete_overlapping_ranges(tree, selfread, numfound);
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
673 674 675
    /* ... and mainread.
       Growth direction: if we had no overlaps, the next line
       should be commented out */
Yoni Fogel's avatar
Yoni Fogel committed
676
#if !defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
677 678
    r = toku__lt_delete_overlapping_ranges(tree, mainread, numfound);
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
679
#endif
Yoni Fogel's avatar
Yoni Fogel committed
680
    /* Free all the points from ranges in tree->buf[0]..tree->buf[numfound-1] */
Yoni Fogel's avatar
Yoni Fogel committed
681
    toku__lt_free_points(tree, to_insert, numfound, NULL);
Yoni Fogel's avatar
Yoni Fogel committed
682 683
    /* We don't necessarily need to panic after here unless numfound > 0
       Which indicates we deleted something. */
Yoni Fogel's avatar
Yoni Fogel committed
684
    /* Insert extreme range into selfread. */
Yoni Fogel's avatar
Yoni Fogel committed
685
    /* VL */
Yoni Fogel's avatar
Yoni Fogel committed
686
    r = toku_rt_insert(selfread, to_insert);
Yoni Fogel's avatar
Yoni Fogel committed
687
#if !defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
688
    int r2;
Yoni Fogel's avatar
Yoni Fogel committed
689
    if (0) { died2: r2 = toku_rt_delete(selfread, to_insert);
Yoni Fogel's avatar
Yoni Fogel committed
690
        if (r2!=0) return toku__lt_panic(tree, r2); goto died1; }
Yoni Fogel's avatar
Yoni Fogel committed
691
#endif
Yoni Fogel's avatar
Yoni Fogel committed
692
    if (r!=0) {
Yoni Fogel's avatar
Yoni Fogel committed
693
        /* If we deleted/merged anything, this is a panic situation. */
Yoni Fogel's avatar
Yoni Fogel committed
694
        if (numfound) return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);
Yoni Fogel's avatar
Yoni Fogel committed
695
        goto died1; }
Yoni Fogel's avatar
Yoni Fogel committed
696
#if !defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
697
    /* Insert extreme range into mainread. */
Yoni Fogel's avatar
Yoni Fogel committed
698
    assert(tree->mainread);
Yoni Fogel's avatar
Yoni Fogel committed
699 700
    r = toku_rt_insert(tree->mainread, to_insert);
    if (r!=0) {
Yoni Fogel's avatar
Yoni Fogel committed
701
        /* If we deleted/merged anything, this is a panic situation. */
Yoni Fogel's avatar
Yoni Fogel committed
702
        if (numfound) return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);
Yoni Fogel's avatar
Yoni Fogel committed
703
        goto died2; }
Yoni Fogel's avatar
Yoni Fogel committed
704
#endif
Yoni Fogel's avatar
Yoni Fogel committed
705
    toku__lt_range_incr(tree, numfound);
Yoni Fogel's avatar
Yoni Fogel committed
706 707 708
    return 0;
}

709
static inline void toku__lt_init_full_query(toku_lock_tree* tree, toku_range* query,
Yoni Fogel's avatar
Yoni Fogel committed
710
                                      toku_point* left, toku_point* right) {
Yoni Fogel's avatar
Yoni Fogel committed
711
    toku__init_point(left,  tree,       (DBT*)toku_lt_neg_infinity,
Yoni Fogel's avatar
Yoni Fogel committed
712
                      tree->duplicates ? (DBT*)toku_lt_neg_infinity : NULL);
Yoni Fogel's avatar
Yoni Fogel committed
713
    toku__init_point(right, tree,       (DBT*)toku_lt_infinity,
Yoni Fogel's avatar
Yoni Fogel committed
714
                      tree->duplicates ? (DBT*)toku_lt_infinity : NULL);
Yoni Fogel's avatar
Yoni Fogel committed
715
    toku__init_query(query, left, right);
Yoni Fogel's avatar
Yoni Fogel committed
716 717
}

Yoni Fogel's avatar
Yoni Fogel committed
718 719 720 721 722
/*
    TODO: Refactor.
    toku__lt_free_points should be replaced (or supplanted) with a 
    toku__lt_free_point (singular)
*/
723
static inline int toku__lt_free_contents(toku_lock_tree* tree, toku_range_tree* rt,
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
724
                                   toku_range_tree *rtdel) {
Yoni Fogel's avatar
Yoni Fogel committed
725
    assert(tree);
Yoni Fogel's avatar
Yoni Fogel committed
726
    if (!rt) return 0;
Yoni Fogel's avatar
Yoni Fogel committed
727
    
Yoni Fogel's avatar
Yoni Fogel committed
728
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
729
    int r2;
Yoni Fogel's avatar
Yoni Fogel committed
730
    BOOL found = FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
731

Yoni Fogel's avatar
Yoni Fogel committed
732 733 734
    toku_range query;
    toku_point left;
    toku_point right;
Yoni Fogel's avatar
Yoni Fogel committed
735
    toku__lt_init_full_query(tree, &query, &left, &right);
Yoni Fogel's avatar
Yoni Fogel committed
736 737

    toku_rt_start_scan(rt);
Yoni Fogel's avatar
Yoni Fogel committed
738 739 740 741
    while ((r = toku_rt_next(rt, &tree->buf[0], &found)) == 0 && found) {
        r = toku__lt_free_points(tree, &query, 1, rtdel);
        if (r!=0) return toku__lt_panic(tree, r);
    }
Yoni Fogel's avatar
Yoni Fogel committed
742
    r2 = toku_rt_close(rt);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
743 744
    assert(r2 == 0);
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
745 746
}

747
static inline BOOL toku__r_backwards(toku_range* range) {
Yoni Fogel's avatar
Yoni Fogel committed
748 749 750 751 752 753 754
    assert(range && range->left && range->right);
    toku_point* left  = (toku_point*)range->left;
    toku_point* right = (toku_point*)range->right;

    /* Optimization: if all the pointers are equal, clearly left == right. */
    return (left->key_payload  != right->key_payload ||
            left->data_payload != right->data_payload) &&
Yoni Fogel's avatar
Yoni Fogel committed
755
            toku__lt_point_cmp(left, right) > 0;
Yoni Fogel's avatar
Yoni Fogel committed
756 757 758
}


759
static inline int toku__lt_preprocess(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
760 761
                                const DBT* key_left,  const DBT** pdata_left,
                                const DBT* key_right, const DBT** pdata_right,
Yoni Fogel's avatar
Yoni Fogel committed
762
                                toku_point* left, toku_point* right,
Yoni Fogel's avatar
Yoni Fogel committed
763
                                toku_range* query, BOOL* out_of_locks) {
Yoni Fogel's avatar
Yoni Fogel committed
764
    assert(pdata_left && pdata_right);
Yoni Fogel's avatar
Yoni Fogel committed
765
    if (!tree || !txn || !key_left || !key_right || !out_of_locks) return EINVAL;
Yoni Fogel's avatar
Yoni Fogel committed
766 767 768
    if (!tree->duplicates) *pdata_right = *pdata_left = NULL;
    const DBT* data_left  = *pdata_left;
    const DBT* data_right = *pdata_right;
Yoni Fogel's avatar
Yoni Fogel committed
769 770
    if (tree->duplicates  && (!data_left || !data_right))   return EINVAL;
    if (tree->duplicates  && key_left != data_left &&
Yoni Fogel's avatar
Yoni Fogel committed
771
        toku__lt_is_infinite(key_left))                    return EINVAL;
Yoni Fogel's avatar
Yoni Fogel committed
772
    if (tree->duplicates  && key_right != data_right &&
Yoni Fogel's avatar
Yoni Fogel committed
773
        toku__lt_is_infinite(key_right))                   return EINVAL;
Yoni Fogel's avatar
Yoni Fogel committed
774

Yoni Fogel's avatar
Yoni Fogel committed
775
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
776 777
    /* Verify that NULL keys have payload and size that are mutually 
       consistent*/
Yoni Fogel's avatar
Yoni Fogel committed
778 779 780 781 782 783 784 785
    if ((r = toku__lt_verify_null_key(key_left))   != 0) return r;
    if ((r = toku__lt_verify_null_key(data_left))  != 0) return r;
    if ((r = toku__lt_verify_null_key(key_right))  != 0) return r;
    if ((r = toku__lt_verify_null_key(data_right)) != 0) return r;

    toku__init_point(left,  tree, key_left,  data_left);
    toku__init_point(right, tree, key_right, data_right);
    toku__init_query(query, left, right);
Yoni Fogel's avatar
Yoni Fogel committed
786
    /* Verify left <= right, otherwise return EDOM. */
Yoni Fogel's avatar
Yoni Fogel committed
787
    if (toku__r_backwards(query))                          return EDOM;
Yoni Fogel's avatar
Yoni Fogel committed
788
    tree->dups_final = TRUE;
Yoni Fogel's avatar
Yoni Fogel committed
789

Yoni Fogel's avatar
Yoni Fogel committed
790
    r = toku__lt_callback(tree, txn);
Yoni Fogel's avatar
Yoni Fogel committed
791
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
792 793 794
    return 0;
}

795
static inline int toku__lt_get_border(toku_lock_tree* tree, BOOL in_borderwrite,
Yoni Fogel's avatar
Yoni Fogel committed
796 797 798 799 800 801
                                toku_range* pred, toku_range* succ,
                                BOOL* found_p,    BOOL* found_s,
                                toku_range* to_insert) {
    assert(tree && pred && succ && found_p && found_s);                                    
    int r;
    toku_range_tree* rt;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
802
    rt = in_borderwrite ? tree->borderwrite : 
Yoni Fogel's avatar
Yoni Fogel committed
803 804
                          toku__lt_ifexist_selfwrite(tree, tree->buf[0].data);
    if (!rt)  return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);
Yoni Fogel's avatar
Yoni Fogel committed
805
    r = toku_rt_predecessor(rt, to_insert->left,  pred, found_p);
Yoni Fogel's avatar
Yoni Fogel committed
806
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
807 808 809 810 811
    r = toku_rt_successor  (rt, to_insert->right, succ, found_s);
    if (r!=0) return r;
    return 0;
}

812
static inline int toku__lt_expand_border(toku_lock_tree* tree, toku_range* to_insert,
Yoni Fogel's avatar
Yoni Fogel committed
813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
                                   toku_range* pred, toku_range* succ,
                                   BOOL  found_p,    BOOL  found_s) {
    assert(tree && to_insert && pred && succ);
    int r;
    if      (found_p && pred->data == to_insert->data) {
        r = toku_rt_delete(tree->borderwrite, pred);
        if (r!=0) return r;
        to_insert->left = pred->left;
    }
    else if (found_s && succ->data == to_insert->data) {
        r = toku_rt_delete(tree->borderwrite, succ);
        if (r!=0) return r;
        to_insert->right = succ->right;
    }
    return 0;
}

830
static inline int toku__lt_split_border(toku_lock_tree* tree, toku_range* to_insert,
Yoni Fogel's avatar
Yoni Fogel committed
831 832 833 834 835
                                   toku_range* pred, toku_range* succ,
                                   BOOL  found_p,    BOOL  found_s) {
    assert(tree && to_insert && pred && succ);
    int r;
    assert(tree->buf[0].data != to_insert->data);
Yoni Fogel's avatar
Yoni Fogel committed
836
    if (!found_s || !found_p) return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);
Yoni Fogel's avatar
Yoni Fogel committed
837 838

    r = toku_rt_delete(tree->borderwrite, &tree->buf[0]);
Yoni Fogel's avatar
Yoni Fogel committed
839
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
840 841 842

    pred->left  = tree->buf[0].left;
    succ->right = tree->buf[0].right;
Yoni Fogel's avatar
Yoni Fogel committed
843 844
    if (toku__r_backwards(pred) || toku__r_backwards(succ)) {
        return toku__lt_panic(tree, TOKU_LT_INCONSISTENT);}
Yoni Fogel's avatar
Yoni Fogel committed
845 846

    r = toku_rt_insert(tree->borderwrite, pred);
Yoni Fogel's avatar
Yoni Fogel committed
847
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
848
    r = toku_rt_insert(tree->borderwrite, succ);
Yoni Fogel's avatar
Yoni Fogel committed
849
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881
    return 0;
}

/*
    Algorithm:
    Find everything (0 or 1 ranges) it overlaps in borderwrite.
    If 0:
        Retrieve predecessor and successor.
        if both found
            assert(predecessor.data != successor.data)
        if predecessor found, and pred.data == my.data
            'merge' (extend to) predecessor.left
                To do this, delete predecessor,
                insert combined me and predecessor.
                then done/return
        do same check for successor.
        if not same, then just insert the actual item into borderwrite.
     if found == 1:
        If data == my data, done/return
        (overlap someone else, retrieve the peer)
        Get the selfwrite for the peer.
        Get successor of my point in peer_selfwrite
        get pred of my point in peer_selfwrite.
        Old range = O.left, O.right
        delete old range,
        insert      O.left, pred.right
        insert      succ.left, O.right
        NO MEMORY GETS FREED!!!!!!!!!!, it all is tied to selfwrites.
        insert point,point into borderwrite
     done with borderwrite.
     insert point,point into selfwrite.
*/
882
static inline int toku__lt_borderwrite_insert(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
883 884 885 886 887
                                        toku_range* query,
                                        toku_range* to_insert) {
    assert(tree && query && to_insert);
    int r;
    toku_range_tree* borderwrite = tree->borderwrite;   assert(borderwrite);
Yoni Fogel's avatar
Yoni Fogel committed
888
    const u_int32_t query_size = 1;
Yoni Fogel's avatar
Yoni Fogel committed
889

Yoni Fogel's avatar
Yoni Fogel committed
890
    u_int32_t numfound;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
891 892
    r = toku_rt_find(borderwrite, query, query_size, &tree->buf, &tree->buflen,
                     &numfound);
Yoni Fogel's avatar
Yoni Fogel committed
893
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
894 895 896 897 898 899 900 901 902 903 904
    assert(numfound <= query_size);

    /* No updated needed in borderwrite: we return right away. */
    if (numfound == 1 && tree->buf[0].data == to_insert->data) return 0;

    /* Find predecessor and successors */
    toku_range pred;
    toku_range succ;
    BOOL found_p;
    BOOL found_s;

Yoni Fogel's avatar
Yoni Fogel committed
905
    r = toku__lt_get_border(tree, numfound == 0, &pred, &succ, 
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
906
                             &found_p, &found_s, to_insert);
Yoni Fogel's avatar
Yoni Fogel committed
907
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
908 909
    
    if (numfound == 0) {
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
910
        if (found_p && found_s && pred.data == succ.data) {
Yoni Fogel's avatar
Yoni Fogel committed
911 912
            return toku__lt_panic(tree, TOKU_LT_INCONSISTENT); }
        r = toku__lt_expand_border(tree, to_insert, &pred,   &succ,
Yoni Fogel's avatar
Yoni Fogel committed
913
                                                      found_p, found_s);
Yoni Fogel's avatar
Yoni Fogel committed
914
        if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
915 916
    }
    else {  
Yoni Fogel's avatar
Yoni Fogel committed
917
        r = toku__lt_split_border( tree, to_insert, &pred, &succ, 
Yoni Fogel's avatar
Yoni Fogel committed
918
                                                      found_p, found_s);
Yoni Fogel's avatar
Yoni Fogel committed
919
        if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
920 921
    }
    r = toku_rt_insert(borderwrite, to_insert);
Yoni Fogel's avatar
Yoni Fogel committed
922
    if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
923
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
924 925 926
}

int toku_lt_create(toku_lock_tree** ptree, DB* db, BOOL duplicates,
Yoni Fogel's avatar
Yoni Fogel committed
927
                   int   (*panic)(DB*, int), u_int32_t* max_ranges,
Yoni Fogel's avatar
Yoni Fogel committed
928
                   u_int32_t* num_ranges,
Yoni Fogel's avatar
Yoni Fogel committed
929 930 931 932 933
                   int   (*compare_fun)(DB*,const DBT*,const DBT*),
                   int   (*dup_compare)(DB*,const DBT*,const DBT*),
                   void* (*user_malloc) (size_t),
                   void  (*user_free)   (void*),
                   void* (*user_realloc)(void*, size_t)) {
Yoni Fogel's avatar
Yoni Fogel committed
934
    if (!ptree || !db || !compare_fun || !dup_compare || !panic ||
Yoni Fogel's avatar
Yoni Fogel committed
935
        !max_ranges || !num_ranges || !user_malloc || !user_free ||
Yoni Fogel's avatar
Yoni Fogel committed
936
        !user_realloc || !*max_ranges) { return EINVAL; }
Yoni Fogel's avatar
Yoni Fogel committed
937 938
    int r;

Yoni Fogel's avatar
Yoni Fogel committed
939 940 941
    toku_lock_tree* tmp_tree = (toku_lock_tree*)user_malloc(sizeof(*tmp_tree));
    if (0) { died1: user_free(tmp_tree); return r; }
    if (!tmp_tree) return errno;
Yoni Fogel's avatar
Yoni Fogel committed
942
    memset(tmp_tree, 0, sizeof(toku_lock_tree));
Yoni Fogel's avatar
Yoni Fogel committed
943 944 945
    tmp_tree->db               = db;
    tmp_tree->duplicates       = duplicates;
    tmp_tree->panic            = panic;
Yoni Fogel's avatar
Yoni Fogel committed
946 947
    tmp_tree->max_ranges       = max_ranges;
    tmp_tree->num_ranges       = num_ranges;
Yoni Fogel's avatar
Yoni Fogel committed
948 949 950 951 952
    tmp_tree->compare_fun      = compare_fun;
    tmp_tree->dup_compare      = dup_compare;
    tmp_tree->malloc           = user_malloc;
    tmp_tree->free             = user_free;
    tmp_tree->realloc          = user_realloc;
Yoni Fogel's avatar
Yoni Fogel committed
953 954 955
#if defined(TOKU_RT_NOOVERLAPS)
    if (0) { died2: goto died1; }
#else
Yoni Fogel's avatar
Yoni Fogel committed
956
    r = toku_rt_create(&tmp_tree->mainread,
Yoni Fogel's avatar
Yoni Fogel committed
957
                       toku__lt_point_cmp, toku__lt_txn_cmp, TRUE,
Yoni Fogel's avatar
Yoni Fogel committed
958
                       user_malloc, user_free, user_realloc);
Yoni Fogel's avatar
Yoni Fogel committed
959
    if (0) { died2: toku_rt_close(tmp_tree->mainread); goto died1; }
Yoni Fogel's avatar
Yoni Fogel committed
960
    if (r!=0) goto died1;
Yoni Fogel's avatar
Yoni Fogel committed
961
#endif
Yoni Fogel's avatar
Yoni Fogel committed
962
    r = toku_rt_create(&tmp_tree->borderwrite,
Yoni Fogel's avatar
Yoni Fogel committed
963
                       toku__lt_point_cmp, toku__lt_txn_cmp, FALSE,
Yoni Fogel's avatar
Yoni Fogel committed
964
                       user_malloc, user_free, user_realloc);
Yoni Fogel's avatar
Yoni Fogel committed
965
    if (0) { died3: toku_rt_close(tmp_tree->borderwrite); goto died2; }
Yoni Fogel's avatar
Yoni Fogel committed
966
    if (r!=0) goto died2;
Yoni Fogel's avatar
Yoni Fogel committed
967 968 969
    r = toku_rth_create(&tmp_tree->rth, user_malloc, user_free, user_realloc);
    if (0) { died4: toku_rth_close(tmp_tree->rth); goto died3; }
    if (r!=0) goto died3;
Yoni Fogel's avatar
Yoni Fogel committed
970 971 972
    tmp_tree->buflen = __toku_default_buflen;
    tmp_tree->buf    = (toku_range*)
                        user_malloc(tmp_tree->buflen * sizeof(toku_range));
Yoni Fogel's avatar
Yoni Fogel committed
973
    if (!tmp_tree->buf) { r = errno; goto died4; }
Yoni Fogel's avatar
Yoni Fogel committed
974 975 976
    /* We have not failed lock escalation, so we allow escalation if we run
       out of locks. */
    tmp_tree->lock_escalation_allowed = TRUE;
Yoni Fogel's avatar
Yoni Fogel committed
977
    *ptree = tmp_tree;
Yoni Fogel's avatar
Yoni Fogel committed
978
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
979 980 981 982
}

int toku_lt_close(toku_lock_tree* tree) {
    if (!tree)                                              return EINVAL;
Yoni Fogel's avatar
Yoni Fogel committed
983
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
984
    int r2 = 0;
Yoni Fogel's avatar
Yoni Fogel committed
985
#if !defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
986
    r = toku_rt_close(tree->mainread);
Yoni Fogel's avatar
Yoni Fogel committed
987
    if        (r!=0) r2 = r;
Yoni Fogel's avatar
Yoni Fogel committed
988
#endif
Yoni Fogel's avatar
Yoni Fogel committed
989
    r = toku_rt_close(tree->borderwrite);
Yoni Fogel's avatar
Yoni Fogel committed
990
    if (!r2 && r!=0) r2 = r;
Yoni Fogel's avatar
Yoni Fogel committed
991 992 993 994 995

    toku_rth_start_scan(tree->rth);
    toku_rt_forest* forest;
    
    while ((forest = toku_rth_next(tree->rth)) != NULL) {
Yoni Fogel's avatar
Yoni Fogel committed
996
        r = toku__lt_free_contents(tree, forest->self_read,  NULL);
Yoni Fogel's avatar
Yoni Fogel committed
997
        if (!r2 && r!=0) r2 = r;
Yoni Fogel's avatar
Yoni Fogel committed
998
        r = toku__lt_free_contents(tree, forest->self_write, NULL);
Yoni Fogel's avatar
Yoni Fogel committed
999 1000 1001 1002
        if (!r2 && r!=0) r2 = r;
    }
    toku_rth_close(tree->rth);

Yoni Fogel's avatar
Yoni Fogel committed
1003 1004
    tree->free(tree->buf);
    tree->free(tree);
Yoni Fogel's avatar
Yoni Fogel committed
1005
    return r2;
Yoni Fogel's avatar
Yoni Fogel committed
1006 1007 1008
}

int toku_lt_acquire_read_lock(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
1009
                              const DBT* key, const DBT* data) {
Yoni Fogel's avatar
Yoni Fogel committed
1010 1011 1012
    return toku_lt_acquire_range_read_lock(tree, txn, key, data, key, data);
}

Yoni Fogel's avatar
Yoni Fogel committed
1013 1014

static int toku__lt_try_acquire_range_read_lock(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
1015
                                  const DBT* key_left,  const DBT* data_left,
Yoni Fogel's avatar
Yoni Fogel committed
1016 1017
                                  const DBT* key_right, const DBT* data_right,
                                  BOOL* out_of_locks) {
Yoni Fogel's avatar
Yoni Fogel committed
1018 1019 1020 1021 1022 1023
    int r;
    toku_point left;
    toku_point right;
    toku_range query;
    BOOL dominated;
    
Yoni Fogel's avatar
Yoni Fogel committed
1024 1025 1026 1027 1028 1029 1030
    if (!out_of_locks) { return EINVAL; }
    r = toku__lt_preprocess(tree, txn, 
                            key_left,  &data_left,
                            key_right, &data_right,
                            &left, &right,
                            &query, out_of_locks);
    if (r!=0) { goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1031

Yoni Fogel's avatar
Yoni Fogel committed
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043
    /*
        For transaction 'txn' to acquire a read-lock on range 'K'=['left','right']:
            if 'K' is dominated by selfwrite('txn') then return success.
            else if 'K' is dominated by selfread('txn') then return success.
            else if 'K' meets borderwrite at 'peer' ('peer'!='txn') &&
                    'K' meets selfwrite('peer') then return failure.
            else
                add 'K' to selfread('txn') and selfwrite('txn').
                This requires merging.. see below.
    */

    /* if 'K' is dominated by selfwrite('txn') then return success. */
Yoni Fogel's avatar
Yoni Fogel committed
1044 1045
    r = toku__lt_rt_dominates(tree, &query, 
                            toku__lt_ifexist_selfwrite(tree, txn), &dominated);
Yoni Fogel's avatar
Yoni Fogel committed
1046
    if (r || dominated) { goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1047 1048

    /* else if 'K' is dominated by selfread('txn') then return success. */
Yoni Fogel's avatar
Yoni Fogel committed
1049 1050
    r = toku__lt_rt_dominates(tree, &query, 
                            toku__lt_ifexist_selfread(tree, txn), &dominated);
Yoni Fogel's avatar
Yoni Fogel committed
1051
    if (r || dominated) { goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1052 1053 1054 1055
    /*
        else if 'K' meets borderwrite at 'peer' ('peer'!='txn') &&
                'K' meets selfwrite('peer') then return failure.
    */
Yoni Fogel's avatar
Yoni Fogel committed
1056
    r = toku__lt_check_borderwrite_conflict(tree, txn, &query);
Yoni Fogel's avatar
Yoni Fogel committed
1057
    if (r!=0) { goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1058 1059
    /* Now need to merge, copy the memory and insert. */
    toku_range  to_insert;
Yoni Fogel's avatar
Yoni Fogel committed
1060
    toku__init_insert(&to_insert, &left, &right, txn);
Yoni Fogel's avatar
Yoni Fogel committed
1061
    /* Consolidate the new range and all the overlapping ranges */
Yoni Fogel's avatar
Yoni Fogel committed
1062 1063 1064 1065 1066 1067 1068 1069
    r = toku__consolidate(tree, &query, &to_insert, txn, out_of_locks);
    if (r!=0) { goto cleanup; }
    
    r = 0;
cleanup:
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080
/* Checks for if a write range conflicts with reads.
   Supports ranges. */
static inline int toku__lt_write_range_conflicts_reads(toku_lock_tree* tree,
                                               DB_TXN* txn, toku_range* query) {
    int r    = 0;
    BOOL met = FALSE;
    toku_rth_start_scan(tree->rth);
    toku_rt_forest* forest;
    
    while ((forest = toku_rth_next(tree->rth)) != NULL) {
        if (forest->self_read != NULL && forest->hash_key != txn) {
Yoni Fogel's avatar
Yoni Fogel committed
1081
            r = toku__lt_meets_peer(tree, query, forest->self_read, TRUE, txn,///
Yoni Fogel's avatar
Yoni Fogel committed
1082 1083 1084 1085 1086 1087 1088 1089 1090 1091
                            &met);
            if (r!=0) { goto cleanup; }
            if (met)  { r = DB_LOCK_NOTGRANTED; goto cleanup; }
        }
    }
    r = 0;
cleanup:
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
1092 1093 1094 1095 1096 1097 1098
/*
    Tests whether a range from BorderWrite is trivially escalatable.
    i.e. No read locks from other transactions overlap the range.
*/
static inline int toku__border_escalation_trivial(toku_lock_tree* tree, 
                                                  toku_range* border_range, 
                                                  BOOL* trivial) {
Yoni Fogel's avatar
Yoni Fogel committed
1099
    assert(tree && border_range && trivial);
Yoni Fogel's avatar
Yoni Fogel committed
1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
    int r = ENOSYS;

    toku_range query = *border_range;
    query.data       = NULL;

    r = toku__lt_write_range_conflicts_reads(tree, border_range->data, &query);
    if (r == DB_LOCK_NOTGRANTED || r == DB_LOCK_DEADLOCK) { *trivial = FALSE; }
    else if (r!=0) { goto cleanup; }
    else { *trivial = TRUE; }

    r = 0;
cleanup:
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
1113
}
Yoni Fogel's avatar
Yoni Fogel committed
1114

Yoni Fogel's avatar
Yoni Fogel committed
1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
/*  */
static inline int toku__escalate_writes_from_border_range(toku_lock_tree* tree, 
                                                          toku_range* border_range) {
    int r = ENOSYS;
    if (!tree || !border_range) { r = EINVAL; goto cleanup; }
    DB_TXN* txn = border_range->data;
    toku_range_tree* self_write = toku__lt_ifexist_selfwrite(tree, txn);
    assert(self_write);
    toku_range query = *border_range;
    u_int32_t numfound = 0;
    query.data = NULL;

    /*
     * Delete all overlapping ranges
     */
    r = toku_rt_find(self_write, &query, 0, &tree->buf, &tree->buflen, &numfound);
    if (r != 0) { goto cleanup; }
    u_int32_t i;
    for (i = 0; i < numfound; i++) {
        r = toku_rt_delete(self_write, &tree->buf[i]);
        if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
        /*
         * Clean up memory that is not referenced by border_range.
         */
        if (tree->buf[i].left != tree->buf[i].right &&
            toku__lt_p_independent(tree->buf[i].left, border_range)) {
            /* Do not double free if left and right are same point. */
            toku__p_free(tree, tree->buf[i].left);
        }
        if (toku__lt_p_independent(tree->buf[i].right, border_range)) {
            toku__p_free(tree, tree->buf[i].right);
        }
    }
    
    /*
     * Insert border_range into self_write table
     */
    r = toku_rt_insert(self_write, border_range);
    if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }

    toku__lt_range_incr(tree, numfound);
    r = 0;
cleanup:
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
1159
}
Yoni Fogel's avatar
Yoni Fogel committed
1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204

static inline int toku__escalate_reads_from_border_range(toku_lock_tree* tree, 
                                                         toku_range* border_range) {
    int r = ENOSYS;
    if (!tree || !border_range) { r = EINVAL; goto cleanup; }
    DB_TXN* txn = border_range->data;
    toku_range_tree* self_read = toku__lt_ifexist_selfread(tree, txn);
    if (self_read == NULL) { r = 0; goto cleanup; }
    toku_range query = *border_range;
    u_int32_t numfound = 0;
    query.data = NULL;

    /*
     * Delete all overlapping ranges
     */
    r = toku_rt_find(self_read, &query, 0, &tree->buf, &tree->buflen, &numfound);
    if (r != 0) { goto cleanup; }
    u_int32_t i;
    u_int32_t removed = 0;
    for (i = 0; i < numfound; i++) {
        if (!toku__dominated(&tree->buf[0], border_range)) { continue; }
        r = toku_rt_delete(self_read, &tree->buf[i]);
        if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
#if !defined(TOKU_RT_NOOVERLAPS)
        r = toku_rt_delete(tree->mainread, &tree->buf[i]);
        if (r != 0) { r = toku__lt_panic(tree, r); goto cleanup; }
#endif /* TOKU_RT_NOOVERLAPS */
        removed++;
        /*
         * Clean up memory that is not referenced by border_range.
         */
        if (tree->buf[i].left != tree->buf[i].right &&
            toku__lt_p_independent(tree->buf[i].left, border_range)) {
            /* Do not double free if left and right are same point. */
            toku__p_free(tree, tree->buf[i].left);
        }
        if (toku__lt_p_independent(tree->buf[i].right, border_range)) {
            toku__p_free(tree, tree->buf[i].right);
        }
    }
    
    toku__lt_range_decr(tree, removed);
    r = 0;
cleanup:
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
1205 1206
}

Yoni Fogel's avatar
Yoni Fogel committed
1207

Yoni Fogel's avatar
Yoni Fogel committed
1208
/*
Yoni Fogel's avatar
Yoni Fogel committed
1209 1210 1211 1212
 * For each range in BorderWrite:
 *     Check to see if range conflicts any read lock held by other transactions
 *     Replaces all writes that overlap with range
 *     Deletes all reads dominated by range
Yoni Fogel's avatar
Yoni Fogel committed
1213 1214 1215
 */
static int toku__do_escalation(toku_lock_tree* tree, BOOL* locks_available) {
    int r = ENOSYS;
Yoni Fogel's avatar
Yoni Fogel committed
1216 1217 1218
    if (!tree || !locks_available)      { r = EINVAL; goto cleanup; }
    if (!tree->lock_escalation_allowed) { r = EDOM;   goto cleanup; }
    toku_range_tree* border       = tree->borderwrite;
Yoni Fogel's avatar
Yoni Fogel committed
1219
    assert(border);
Yoni Fogel's avatar
Yoni Fogel committed
1220 1221 1222 1223 1224 1225 1226 1227 1228
    toku_range       border_range;
    BOOL             found        = FALSE;
    BOOL             trivial      = FALSE;

    toku_rt_start_scan(border);
    while ((r = toku_rt_next(border, &border_range, &found)) == 0 && found) {
        r = toku__border_escalation_trivial(tree, &border_range, &trivial);
        if (r!=0)     { goto cleanup; }
        if (!trivial) { continue; }
Yoni Fogel's avatar
Yoni Fogel committed
1229 1230 1231 1232
        /*
         * At this point, we determine that escalation is simple,
         * Attempt escalation
         */
Yoni Fogel's avatar
Yoni Fogel committed
1233 1234
        r = toku__escalate_writes_from_border_range(tree, &border_range);
        if (r!=0)     { r = toku__lt_panic(tree, r); goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1235 1236
        r = toku__escalate_reads_from_border_range(tree, &border_range);
        if (r!=0)     { r = toku__lt_panic(tree, r); goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1237
    }
Yoni Fogel's avatar
Yoni Fogel committed
1238
    r = 0;
Yoni Fogel's avatar
Yoni Fogel committed
1239 1240
    *locks_available = toku__lt_range_test_incr(tree, 0);
    /* Escalation is allowed if 1/10th of the locks (or more) are free. */
Yoni Fogel's avatar
Yoni Fogel committed
1241 1242
    tree->lock_escalation_allowed = toku__lt_percent_ranges_free(tree,
                                             TOKU_DISABLE_ESCALATION_THRESHOLD);
Yoni Fogel's avatar
Yoni Fogel committed
1243
cleanup:
Yoni Fogel's avatar
Yoni Fogel committed
1244 1245 1246 1247 1248 1249
    if (r!=0) {
        if (tree && locks_available) {
            *locks_available              = FALSE;
            tree->lock_escalation_allowed = FALSE;
        }
    }
Yoni Fogel's avatar
Yoni Fogel committed
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281
    return r;
}

int toku_lt_acquire_range_read_lock(toku_lock_tree* tree, DB_TXN* txn,
                                  const DBT* key_left,  const DBT* data_left,
                                  const DBT* key_right, const DBT* data_right) {
    BOOL out_of_locks = FALSE;
    int r = ENOSYS;
    r = toku__lt_try_acquire_range_read_lock(tree, txn, 
                                             key_left,  data_left,
                                             key_right, data_right,
                                            &out_of_locks);
    if (r != 0) { goto cleanup; }

    if (out_of_locks && tree->lock_escalation_allowed) {
        BOOL locks_available = FALSE;
        r = toku__do_escalation(tree, &locks_available);
        if (r != 0) { goto cleanup; }
        
        if (!locks_available) { r = ENOMEM; goto cleanup; }
        
        r = toku__lt_try_acquire_range_read_lock(tree, txn, 
                                                 key_left,  data_left,
                                                 key_right, data_right,
                                                 &out_of_locks);
        if (r != 0) { goto cleanup; }
    }
    if (out_of_locks) { r = ENOMEM; goto cleanup; }

    r = 0;
cleanup:
    return r;
Yoni Fogel's avatar
Yoni Fogel committed
1282 1283
}

Yoni Fogel's avatar
Yoni Fogel committed
1284 1285 1286 1287 1288
/* Checks for if a write point conflicts with reads.
   If mainread exists, it uses a single query, else it uses T queries
   (one in each selfread).
   Does not support write ranges.
*/
Yoni Fogel's avatar
Yoni Fogel committed
1289
static int toku__lt_write_point_conflicts_reads(toku_lock_tree* tree,
Yoni Fogel's avatar
Yoni Fogel committed
1290 1291 1292
                                               DB_TXN* txn, toku_range* query) {
    int r    = 0;
#if defined(TOKU_RT_NOOVERLAPS)
Yoni Fogel's avatar
Yoni Fogel committed
1293 1294
    r = toku__lt_write_range_conflicts_reads(tree, txn, query);
    if (r!=0) { goto cleanup; }    
Yoni Fogel's avatar
Yoni Fogel committed
1295
#else
Yoni Fogel's avatar
Yoni Fogel committed
1296
    BOOL met = FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
1297
    toku_range_tree* mainread = tree->mainread; assert(mainread);
Yoni Fogel's avatar
Yoni Fogel committed
1298
    r = toku__lt_meets_peer(tree, query, mainread, FALSE, txn, &met);
Yoni Fogel's avatar
Yoni Fogel committed
1299
    if (r!=0) { goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1300
    if (met)  { r = DB_LOCK_NOTGRANTED; goto cleanup; }
Yoni Fogel's avatar
Yoni Fogel committed
1301 1302 1303 1304 1305 1306
#endif
    r = 0;
cleanup:
    return r;
}

Yoni Fogel's avatar
Yoni Fogel committed
1307 1308 1309
static int toku__lt_try_acquire_write_lock(toku_lock_tree* tree, DB_TXN* txn,
                               const DBT* key, const DBT* data, 
                               BOOL* out_of_locks) {
Yoni Fogel's avatar
Yoni Fogel committed
1310
    int r;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1311
    toku_point endpoint;
Yoni Fogel's avatar
Yoni Fogel committed
1312 1313 1314
    toku_range query;
    BOOL dominated;
    
Yoni Fogel's avatar
Yoni Fogel committed
1315 1316 1317 1318 1319
    r = toku__lt_preprocess(tree, txn, 
                            key,      &data,
                            key,      &data,
                            &endpoint, &endpoint,
                            &query, out_of_locks);
Yoni Fogel's avatar
Yoni Fogel committed
1320
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
1321 1322
    
    *out_of_locks = FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
1323
    /* if 'K' is dominated by selfwrite('txn') then return success. */
Yoni Fogel's avatar
Yoni Fogel committed
1324 1325
    r = toku__lt_rt_dominates(tree, &query, 
                            toku__lt_ifexist_selfwrite(tree, txn), &dominated);
Yoni Fogel's avatar
Yoni Fogel committed
1326
    if (r || dominated) return r;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1327
    /* else if K meets mainread at 'txn2' then return failure */
Yoni Fogel's avatar
Yoni Fogel committed
1328
    r = toku__lt_write_point_conflicts_reads(tree, txn, &query);
Yoni Fogel's avatar
Yoni Fogel committed
1329
    if (r!=0) return r;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1330 1331 1332 1333
    /*
        else if 'K' meets borderwrite at 'peer' ('peer'!='txn') &&
                'K' meets selfwrite('peer') then return failure.
    */
Yoni Fogel's avatar
Yoni Fogel committed
1334
    r = toku__lt_check_borderwrite_conflict(tree, txn, &query);
Yoni Fogel's avatar
Yoni Fogel committed
1335
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
1336
    /*  Now need to copy the memory and insert.
Yoni Fogel's avatar
Yoni Fogel committed
1337 1338 1339 1340
        No merging required in selfwrite.
        This is a point, and if merging was possible it would have been
        dominated by selfwrite.
    */
Yoni Fogel's avatar
Yoni Fogel committed
1341
    toku_range to_insert;
Yoni Fogel's avatar
Yoni Fogel committed
1342
    toku__init_insert(&to_insert, &endpoint, &endpoint, txn);
Yoni Fogel's avatar
Yoni Fogel committed
1343 1344 1345
    if (!toku__lt_range_test_incr(tree, 0)) { 
        *out_of_locks = TRUE; return 0; 
    }
Yoni Fogel's avatar
Yoni Fogel committed
1346

Yoni Fogel's avatar
Yoni Fogel committed
1347
    BOOL dummy = TRUE;
Yoni Fogel's avatar
Yoni Fogel committed
1348 1349
    r = toku__lt_alloc_extreme(tree, &to_insert, TRUE, &dummy);
    if (0) { died1:  toku__p_free(tree, to_insert.left);   return r; }
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1350
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
1351
    toku_range_tree* selfwrite;
Yoni Fogel's avatar
Yoni Fogel committed
1352
    r = toku__lt_selfwrite(tree, txn, &selfwrite);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1353
    if (r!=0) goto died1;
Yoni Fogel's avatar
Yoni Fogel committed
1354 1355 1356 1357
    assert(selfwrite);
    r = toku_rt_insert(selfwrite, &to_insert);
    if (r!=0) goto died1;
    /* Need to update borderwrite. */
Yoni Fogel's avatar
Yoni Fogel committed
1358 1359 1360
    r = toku__lt_borderwrite_insert(tree, &query, &to_insert);
    if (r!=0) return toku__lt_panic(tree, r);
    toku__lt_range_incr(tree, 0);
Yoni Fogel's avatar
Yoni Fogel committed
1361
    return 0;
Yoni Fogel's avatar
Yoni Fogel committed
1362 1363
}

Yoni Fogel's avatar
Yoni Fogel committed
1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393
int toku_lt_acquire_write_lock(toku_lock_tree* tree, DB_TXN* txn,
                               const DBT* key, const DBT* data) {
    BOOL out_of_locks = FALSE;
    int r = ENOSYS;

    r = toku__lt_try_acquire_write_lock (tree, txn,
                                      key, data,
                                      &out_of_locks);
    if (r != 0) { goto cleanup; }

    if (out_of_locks && tree->lock_escalation_allowed) {
        BOOL locks_available = FALSE;
        r = toku__do_escalation(tree, &locks_available);
        if (r != 0) { goto cleanup; }
        
        if (!locks_available) { r = ENOMEM; goto cleanup; }
        
        r = toku__lt_try_acquire_write_lock (tree, txn,
                                          key, data,
                                          &out_of_locks);
        if (r != 0) { goto cleanup; }
    }
    if (out_of_locks) { r = ENOMEM; goto cleanup; }

    r = 0;
cleanup:
    return r;
}

static int toku__lt_try_acquire_range_write_lock(toku_lock_tree* tree, DB_TXN* txn,
Yoni Fogel's avatar
Yoni Fogel committed
1394
                                  const DBT* key_left,  const DBT* data_left,
Yoni Fogel's avatar
Yoni Fogel committed
1395 1396
                                  const DBT* key_right, const DBT* data_right,
                                  BOOL* out_of_locks) {
Yoni Fogel's avatar
Yoni Fogel committed
1397
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
1398 1399
    toku_point left;
    toku_point right;
Yoni Fogel's avatar
Yoni Fogel committed
1400
    toku_range query;
Yoni Fogel's avatar
Yoni Fogel committed
1401 1402

    if (key_left == key_right &&
1403
        (data_left == data_right || (tree && !tree->duplicates))) {
Yoni Fogel's avatar
Yoni Fogel committed
1404 1405 1406
        return toku__lt_try_acquire_write_lock(tree, txn, 
                                               key_left, data_left, 
                                               out_of_locks);
Yoni Fogel's avatar
Yoni Fogel committed
1407 1408
    }

Yoni Fogel's avatar
Yoni Fogel committed
1409 1410 1411 1412 1413
    r = toku__lt_preprocess(tree, txn, 
                            key_left,  &data_left,
                            key_right, &data_right,
                           &left,      &right,
                           &query,      out_of_locks);
Yoni Fogel's avatar
Yoni Fogel committed
1414
    if (r!=0) return r;
Yoni Fogel's avatar
Yoni Fogel committed
1415
    *out_of_locks = FALSE;
Yoni Fogel's avatar
Yoni Fogel committed
1416 1417

    return ENOSYS;
Yoni Fogel's avatar
Yoni Fogel committed
1418 1419 1420 1421
    //We are not ready for this.
    //Not needed for Feb 1 release.
}

Yoni Fogel's avatar
Yoni Fogel committed
1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452
int toku_lt_acquire_range_write_lock(toku_lock_tree* tree, DB_TXN* txn,
                                  const DBT* key_left,  const DBT* data_left,
                                  const DBT* key_right, const DBT* data_right) {
    BOOL out_of_locks = FALSE;
    int r = ENOSYS;

    r = toku__lt_try_acquire_range_write_lock (tree, txn,
                                            key_left,  data_left,
                                            key_right, data_right,
                                            &out_of_locks);
    if (r != 0) { goto cleanup; }

    if (out_of_locks && tree->lock_escalation_allowed) {
        BOOL locks_available = FALSE;
        r = toku__do_escalation(tree, &locks_available);
        if (r != 0) { goto cleanup; }
        
        if (!locks_available) { r = ENOMEM; goto cleanup; }
        
        r = toku__lt_try_acquire_range_write_lock (tree, txn,
                                                key_left,  data_left,
                                                key_right, data_right,
                                                &out_of_locks);
        if (r != 0) { goto cleanup; }
    }
    if (out_of_locks) { r = ENOMEM; goto cleanup; }

    r = 0;
cleanup:
    return r;
}
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1453

1454
static inline int toku__sweep_border(toku_lock_tree* tree, toku_range* range) {
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487
    assert(tree && range);
    toku_range_tree* borderwrite = tree->borderwrite;
    assert(borderwrite);

    /* Find overlapping range in borderwrite */
    int r;
    const u_int32_t query_size = 1;
    toku_range      buffer[query_size];
    u_int32_t       buflen     = query_size;
    toku_range*     buf        = &buffer[0];
    u_int32_t       numfound;

    toku_range query = *range;
    query.data = NULL;
    r = toku_rt_find(borderwrite, &query, query_size, &buf, &buflen, &numfound);
    if (r!=0) return r;
    assert(numfound <= query_size);
    
    /*  If none exists or data is not ours (we have already deleted the real
        overlapping range), continue to the end of the loop (i.e., return) */
    if (!numfound || buf[0].data != range->data) return 0;
    assert(numfound == 1);

    /* Delete s from borderwrite */
    r = toku_rt_delete(borderwrite, &buf[0]);
    if (r!=0) return r;

    /* Find pred(s.left), and succ(s.right) */
    toku_range pred;
    toku_range succ;
    BOOL found_p;
    BOOL found_s;

Yoni Fogel's avatar
Yoni Fogel committed
1488
    r = toku__lt_get_border(tree, TRUE, &pred, &succ, &found_p, &found_s,
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1489 1490
                             &buf[0]);
    if (r!=0) return r;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1491
    if (found_p && found_s && pred.data == succ.data &&
Yoni Fogel's avatar
Yoni Fogel committed
1492
        pred.data == buf[0].data) { 
Yoni Fogel's avatar
Yoni Fogel committed
1493
        return toku__lt_panic(tree, TOKU_LT_INCONSISTENT); }
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521

    /* If both found and pred.data=succ.data, merge pred and succ (expand?)
       free_points */
    if (!found_p || !found_s || pred.data != succ.data) return 0;

    r = toku_rt_delete(borderwrite, &pred);
    if (r!=0) return r;
    r = toku_rt_delete(borderwrite, &succ);
    if (r!=0) return r;

    pred.right = succ.right;
    r = toku_rt_insert(borderwrite, &pred);
    if (r!=0) return r;

    return 0;
}

/*
  Algorithm:
    For each range r in selfwrite
      Find overlapping range s in borderwrite 
      If none exists or data is not ours (we have already deleted the real
        overlapping range), continue
      Delete s from borderwrite
      Find pred(s.left), and succ(s.right)
      If both found and pred.data=succ.data, merge pred and succ (expand?)
    free_points
*/
1522
static inline int toku__lt_border_delete(toku_lock_tree* tree, toku_range_tree* rt) {
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1523 1524 1525 1526 1527 1528 1529 1530
    int r;
    assert(tree);
    if (!rt) return 0;

    /* Find the ranges in rt */
    toku_range query;
    toku_point left;
    toku_point right;
Yoni Fogel's avatar
Yoni Fogel committed
1531
    toku__lt_init_full_query(tree, &query, &left, &right);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1532 1533 1534 1535 1536 1537 1538 1539

    u_int32_t numfound;
    r = toku_rt_find(rt, &query, 0, &tree->buf, &tree->buflen, &numfound);
    if (r!=0) return r;
    assert(numfound <= tree->buflen);
    
    u_int32_t i;
    for (i = 0; i < numfound; i++) {
Yoni Fogel's avatar
Yoni Fogel committed
1540
        r = toku__sweep_border(tree, &tree->buf[i]);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1541 1542 1543 1544 1545 1546 1547
        if (r!=0) return r;
    }

    return 0;
}

int toku_lt_unlock(toku_lock_tree* tree, DB_TXN* txn) {
Yoni Fogel's avatar
Yoni Fogel committed
1548
    if (!tree || !txn) return EINVAL;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1549
    int r;
Yoni Fogel's avatar
Yoni Fogel committed
1550 1551
    toku_range_tree *selfwrite = toku__lt_ifexist_selfwrite(tree, txn);
    toku_range_tree *selfread  = toku__lt_ifexist_selfread (tree, txn);
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1552

Yoni Fogel's avatar
Yoni Fogel committed
1553
    u_int32_t ranges = 0;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1554 1555

    if (selfread) {
Yoni Fogel's avatar
Yoni Fogel committed
1556 1557 1558 1559
        u_int32_t size;
        r = toku_rt_get_size(selfread, &size);
        assert(r==0);
        ranges += size;
Yoni Fogel's avatar
Yoni Fogel committed
1560 1561
        r = toku__lt_free_contents(tree, selfread, tree->mainread);
        if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
1562
    }
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1563

Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1564
    if (selfwrite) {
Yoni Fogel's avatar
Yoni Fogel committed
1565 1566 1567 1568
        u_int32_t size;
        r = toku_rt_get_size(selfwrite, &size);
        assert(r==0);
        ranges += size;
Yoni Fogel's avatar
Yoni Fogel committed
1569 1570 1571 1572
        r = toku__lt_border_delete(tree, selfwrite);
        if (r!=0) return toku__lt_panic(tree, r);
        r = toku__lt_free_contents(tree, selfwrite, NULL);
        if (r!=0) return toku__lt_panic(tree, r);
Yoni Fogel's avatar
Yoni Fogel committed
1573
    }
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1574

Yoni Fogel's avatar
Yoni Fogel committed
1575
    if (selfread || selfwrite) toku_rth_delete(tree->rth, txn);
Yoni Fogel's avatar
Yoni Fogel committed
1576
    
Yoni Fogel's avatar
Yoni Fogel committed
1577
    toku__lt_range_decr(tree, ranges);
Yoni Fogel's avatar
Yoni Fogel committed
1578

Yoni Fogel's avatar
Yoni Fogel committed
1579 1580 1581 1582
    if (toku__lt_percent_ranges_free(tree, TOKU_ENABLE_ESCALATION_THRESHOLD)) {
        tree->lock_escalation_allowed = TRUE;
    }

Yoni Fogel's avatar
Yoni Fogel committed
1583 1584
    return 0;
}
Yoni Fogel's avatar
Yoni Fogel committed
1585

Yoni Fogel's avatar
Yoni Fogel committed
1586 1587 1588 1589
int toku_lt_set_dups(toku_lock_tree* tree, BOOL duplicates) {
    if (!tree)              return EINVAL;
    if (tree->dups_final)   return EDOM;
    tree->duplicates = duplicates;
Vincenzo Liberatore's avatar
Vincenzo Liberatore committed
1590 1591
    return 0;
}
Yoni Fogel's avatar
Yoni Fogel committed
1592

Yoni Fogel's avatar
Yoni Fogel committed
1593 1594
int toku_lt_set_txn_add_lt_callback(toku_lock_tree* tree,
                                    int (*callback)(DB_TXN*, toku_lock_tree*)) {
Yoni Fogel's avatar
Yoni Fogel committed
1595
    if (!tree || !callback) return EINVAL;
Yoni Fogel's avatar
Yoni Fogel committed
1596
    if (tree->dups_final)   return EDOM;
Yoni Fogel's avatar
Yoni Fogel committed
1597 1598 1599
    tree->lock_callback = callback;
    return 0;
}