Commit 015fa8c6, authored by Zardosht Kasheff, committed by Yoni Fogel

[t:3609], fix race condition by having shortcut cursors grab read locks on...

[t:3609], fix race condition by having shortcut cursors grab read locks on PAIRs, range queries will be slower

git-svn-id: file:///svn/toku/tokudb@32713 c7de825b-a66e-492c-adef-691d508d4ae1
parent d64dbafb
...@@ -472,6 +472,8 @@ struct brt_cursor_leaf_info_to_be { ...@@ -472,6 +472,8 @@ struct brt_cursor_leaf_info_to_be {
// Values to be used to pin a leaf for shortcut searches // Values to be used to pin a leaf for shortcut searches
struct brt_cursor_leaf_info { struct brt_cursor_leaf_info {
BLOCKNUM blocknumber;
u_int32_t fullhash;
struct brt_cursor_leaf_info_to_be to_be; struct brt_cursor_leaf_info_to_be to_be;
}; };
......
...@@ -5163,6 +5163,8 @@ brt_search_basement_node( ...@@ -5163,6 +5163,8 @@ brt_search_basement_node(
BRT_GET_CALLBACK_FUNCTION getf, BRT_GET_CALLBACK_FUNCTION getf,
void *getf_v, void *getf_v,
BOOL *doprefetch, BOOL *doprefetch,
BLOCKNUM thisnodename,
u_int32_t fullhash,
BRT_CURSOR brtcursor BRT_CURSOR brtcursor
) )
{ {
...@@ -5240,6 +5242,8 @@ got_a_good_value: ...@@ -5240,6 +5242,8 @@ got_a_good_value:
// is done in brt_cursor_update. // is done in brt_cursor_update.
brtcursor->leaf_info.to_be.omt = bn->buffer; brtcursor->leaf_info.to_be.omt = bn->buffer;
brtcursor->leaf_info.to_be.index = idx; brtcursor->leaf_info.to_be.index = idx;
brtcursor->leaf_info.fullhash = fullhash;
brtcursor->leaf_info.blocknumber = thisnodename;
brt_cursor_update(brtcursor); brt_cursor_update(brtcursor);
//The search was successful. Prefetching can continue. //The search was successful. Prefetching can continue.
*doprefetch = TRUE; *doprefetch = TRUE;
...@@ -5480,6 +5484,8 @@ brt_search_node( ...@@ -5480,6 +5484,8 @@ brt_search_node(
getf, getf,
getf_v, getf_v,
doprefetch, doprefetch,
node->thisnodename,
node->fullhash,
brtcursor brtcursor
); );
} }
...@@ -5769,6 +5775,25 @@ brt_cursor_shortcut (BRT_CURSOR cursor, int direction, u_int32_t limit, BRT_GET_ ...@@ -5769,6 +5775,25 @@ brt_cursor_shortcut (BRT_CURSOR cursor, int direction, u_int32_t limit, BRT_GET_
return r; return r;
} }
// Effect: Try to pin the leaf that the cursor last recorded in
//  brtcursor->leaf_info (blocknumber and fullhash), using
//  toku_cachetable_maybe_get_and_pin_clean on the cursor's cachefile.
// Returns: 0 if the leaf was pinned, in which case *leafp is set to it.
//  Otherwise returns the nonzero result of the cachetable call and leaves
//  *leafp unmodified.
static int
brt_cursor_maybe_get_and_pin_leaf(BRT_CURSOR brtcursor, BRTNODE* leafp) {
    void *pinned_value;
    int r;

    r = toku_cachetable_maybe_get_and_pin_clean(
            brtcursor->brt->cf,
            brtcursor->leaf_info.blocknumber,
            brtcursor->leaf_info.fullhash,
            &pinned_value);
    if (r != 0) {
        // Could not pin; the caller must not touch *leafp.
        return r;
    }
    *leafp = pinned_value;
    return 0;
}
// Effect: Unpin `leaf` by passing it, together with the cursor's brt, to
//  toku_unpin_brtnode. The shortcut paths call this after a successful
//  brt_cursor_maybe_get_and_pin_leaf, once the shortcut step is done.
static void
brt_cursor_unpin_leaf(BRT_CURSOR brtcursor, BRTNODE leaf) {
toku_unpin_brtnode(brtcursor->brt, leaf);
}
static int static int
brt_cursor_next_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v) brt_cursor_next_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, void *getf_v)
// Effect: If possible, increment the cursor and return the key-value pair // Effect: If possible, increment the cursor and return the key-value pair
...@@ -5777,8 +5802,13 @@ brt_cursor_next_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, voi ...@@ -5777,8 +5802,13 @@ brt_cursor_next_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, voi
{ {
int r; int r;
if (toku_omt_cursor_is_valid(cursor->omtcursor)) { if (toku_omt_cursor_is_valid(cursor->omtcursor)) {
BRTNODE leaf;
r = brt_cursor_maybe_get_and_pin_leaf(cursor, &leaf);
if (r == 0) {
u_int32_t limit = toku_omt_size(toku_omt_cursor_get_omt(cursor->omtcursor)) - 1; u_int32_t limit = toku_omt_size(toku_omt_cursor_get_omt(cursor->omtcursor)) - 1;
r = brt_cursor_shortcut(cursor, 1, limit, getf, getf_v); r = brt_cursor_shortcut(cursor, 1, limit, getf, getf_v);
brt_cursor_unpin_leaf(cursor, leaf);
}
} }
else r = EINVAL; else r = EINVAL;
return r; return r;
...@@ -5838,7 +5868,12 @@ brt_cursor_prev_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, voi ...@@ -5838,7 +5868,12 @@ brt_cursor_prev_shortcut (BRT_CURSOR cursor, BRT_GET_CALLBACK_FUNCTION getf, voi
{ {
int r; int r;
if (toku_omt_cursor_is_valid(cursor->omtcursor)) { if (toku_omt_cursor_is_valid(cursor->omtcursor)) {
BRTNODE leaf;
r = brt_cursor_maybe_get_and_pin_leaf(cursor, &leaf);
if (r == 0) {
r = brt_cursor_shortcut(cursor, -1, 0, getf, getf_v); r = brt_cursor_shortcut(cursor, -1, 0, getf, getf_v);
brt_cursor_unpin_leaf(cursor, leaf);
}
} }
else r = EINVAL; else r = EINVAL;
return r; return r;
......
...@@ -1708,6 +1708,36 @@ int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE cachefile, CACHEKEY key, ...@@ -1708,6 +1708,36 @@ int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE cachefile, CACHEKEY key,
return r; return r;
} }
//Used by shortcut query path.
//Same as toku_cachetable_maybe_get_and_pin except that we don't care if the node is clean or dirty (return the node regardless).
//All other conditions remain the same.
// Returns: 0 if the pair was found and read-locked without stalling (its value is stored in *value), -1 otherwise (*value is left unmodified).
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value) {
    CACHETABLE ct = cachefile->cachetable;
    int chain_length = 0;
    int r = -1;
    PAIR p;

    cachetable_lock(ct);
    cachetable_maybe_get_and_pins++;

    // Walk the hash chain for this bucket until the matching pair is found
    // (p is left NULL when there is no match).
    p = ct->table[fullhash&(ct->table_size-1)];
    while (p) {
        chain_length++;
        if (p->key.b==key.b && p->cachefile==cachefile) {
            break;
        }
        p = p->hash_chain;
    }

    // Only attempt the lock when the pair is idle and has no checkpoint pending:
    //  - If not idle, will require a stall and/or might be GONE once not idle.
    //  - If checkpoint pending, we would need to first write it, which would make it clean
    //    (if the pin would be used for writes. If would be used for read-only we could
    //    return it, but that would increase complexity).
    if (p && p->state == CTPAIR_IDLE && !p->checkpoint_pending) {
        //Grab read lock only if no stall required
        if (rwlock_try_prefer_read_lock(&p->rwlock, ct->mutex) == 0) {
            cachetable_maybe_get_and_pin_hits++;
            *value = p->value;
            r = 0;
        }
    }

    note_hash_count(chain_length);
    cachetable_unlock(ct);
    return r;
}
static int static int
toku_cachetable_unpin_internal(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, enum cachetable_dirty dirty, long size, BOOL have_ct_lock) toku_cachetable_unpin_internal(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, enum cachetable_dirty dirty, long size, BOOL have_ct_lock)
// size==0 means that the size didn't change. // size==0 means that the size didn't change.
......
...@@ -232,6 +232,9 @@ int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE /*cachefile*/, CACHEKEY ...@@ -232,6 +232,9 @@ int toku_cachetable_get_and_pin_if_in_memory (CACHEFILE /*cachefile*/, CACHEKEY
// Returns: 0 iff the item is in memory (otherwise return a error) // Returns: 0 iff the item is in memory (otherwise return a error)
// Modifies: *value (if returning 0, then the pointer to the value is stored in *value. // Modifies: *value (if returning 0, then the pointer to the value is stored in *value.
int toku_cachetable_maybe_get_and_pin_clean (CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/, void**);
// Effect: Like maybe get and pin, but may pin a clean pair (the pair is returned whether it is clean or dirty).
// Returns: 0 iff the pair was pinned (otherwise returns an error).
// Modifies: *value (if returning 0, then the pointer to the value is stored in *value).
// cachetable pair clean or dirty WRT external memory // cachetable pair clean or dirty WRT external memory
enum cachetable_dirty { enum cachetable_dirty {
CACHETABLE_CLEAN=0, // the cached object is clean WRT the cachefile CACHETABLE_CLEAN=0, // the cached object is clean WRT the cachefile
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment