close[t:4574] merging 4574 to main. fixes the hcad deadlock found by Tim's...

close[t:4574] merging 4574 to main. fixes the hcad deadlock found by Tim's stress test, which adds and drops indexes concurrently with queries and insertions. transactions no longer keep rollback nodes pinned after an operation, but instead always unpin them. this merge also introduces a lot of improvements to our rollback code, in terms of clarity and consistency. to that end, variable names and function names were improved, and more documentation of the rollback logic was added in rollback.h and log-internal.h. roll.h is removed because it is a dead file. git-svn-id: file:///svn/toku/tokudb@41576 c7de825b-a66e-492c-adef-691d508d4ae1

close[t:4574] merging 4574 to main. fixes the hcad deadlock found by Tim's...
close[t:4574] merging 4574 to main. fixes the hcad deadlock found by Tim's stress test, which adds and drops indexes concurrently with queries and insertions. transactions no longer keep rollback nodes pinned after an operation, but instead always unpin them. this merge also introduces a lot of improvements to our rollback code, in terms of clarity and consistency. to that end, variable names and function names were improved, and more documentation of the rollback logic was added in rollback.h and log-internal.h. roll.h is removed because it is a dead file. git-svn-id: file:///svn/toku/tokudb@41576 c7de825b-a66e-492c-adef-691d508d4ae1
ca9e8dc6 · John Esmet · Yoni Fogel · ac7abc7f · ca9e8dc6 · ca9e8dc6
Commit ca9e8dc6 authored Apr 04, 2012 by John Esmet Committed by Yoni Fogel Apr 17, 2013
11 changed files
--- a/newbrt/brt-serialize.c
+++ b/newbrt/brt-serialize.c
@@ -2046,7 +2046,7 @@ toku_serialize_descriptor_contents_to_fd(int fd, const DESCRIPTOR desc, DISKOFF
 static void
 deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) {
    if (layout_version == BRT_LAYOUT_VERSION_13) {
-	// in older versions of TokuDB the Descriptor had a 4 byte version, which we must skip over
+	// in previous versions of TokuDB the Descriptor had a 4 byte version, which we must skip over
 	u_int32_t dummy_version __attribute__((__unused__)) = rbuf_int(rb);
    }
    u_int32_t size;
@@ -2501,8 +2501,8 @@ serialize_rollback_log_size(ROLLBACK_LOG_NODE log) {
    size_t size = node_header_overhead //8 "tokuroll", 4 version, 4 version_original, 4 build_id
                 +8 //TXNID
                 +8 //sequence
-                 +8 //thislogname
-                 +8 //older (blocknum)
+                 +8 //blocknum
+                 +8 //previous (blocknum)
                 +8 //resident_bytecount
                 +8 //memarena_size_needed_to_load
                 +log->rollentry_resident_bytecount;
@@ -2521,8 +2521,8 @@ serialize_rollback_log_node_to_buf(ROLLBACK_LOG_NODE log, char *buf, size_t calc
        wbuf_nocrc_uint(&wb, BUILD_ID);
        wbuf_nocrc_TXNID(&wb, log->txnid);
        wbuf_nocrc_ulonglong(&wb, log->sequence);
-        wbuf_nocrc_BLOCKNUM(&wb, log->thislogname);
-        wbuf_nocrc_BLOCKNUM(&wb, log->older);
+        wbuf_nocrc_BLOCKNUM(&wb, log->blocknum);
+        wbuf_nocrc_BLOCKNUM(&wb, log->previous);
        wbuf_nocrc_ulonglong(&wb, log->rollentry_resident_bytecount);
        //Write down memarena size needed to restore
        wbuf_nocrc_ulonglong(&wb, memarena_total_size_in_use(log->rollentry_arena));
@@ -2677,18 +2677,18 @@ deserialize_rollback_log_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, ROLLB
    //TODO: This is hard.. everything is shared in a single dictionary.
    rbuf_TXNID(rb, &result->txnid);
    result->sequence = rbuf_ulonglong(rb);
-    result->thislogname = rbuf_blocknum(rb);
-    if (result->thislogname.b != blocknum.b) {
+    result->blocknum = rbuf_blocknum(rb);
+    if (result->blocknum.b != blocknum.b) {
        r = toku_db_badformat();
        goto died0;
    }
-    result->thishash    = toku_cachetable_hash(h->cf, result->thislogname);
-    if (result->thishash != fullhash) {
+    result->hash    = toku_cachetable_hash(h->cf, result->blocknum);
+    if (result->hash != fullhash) {
        r = toku_db_badformat();
        goto died0;
    }
-    result->older       = rbuf_blocknum(rb);
-    result->older_hash  = toku_cachetable_hash(h->cf, result->older);
+    result->previous       = rbuf_blocknum(rb);
+    result->previous_hash  = toku_cachetable_hash(h->cf, result->previous);
    result->rollentry_resident_bytecount = rbuf_ulonglong(rb);

    size_t arena_initial_size = rbuf_ulonglong(rb);

--- a/newbrt/brt.c
+++ b/newbrt/brt.c
@@ -120,7 +120,6 @@ basement nodes, bulk fetch,  and partial fetch:
 // Access to nested transaction logic
 #include "ule.h"
 #include "xids.h"
-#include "roll.h"
 #include "sub_block.h"
 #include "sort.h"
 #include <brt-cachetable-wrappers.h>

--- a/newbrt/cachetable.c
+++ b/newbrt/cachetable.c
@@ -3571,17 +3571,6 @@ log_open_txn (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) {
    assert(0);
 }

-static int
-unpin_rollback_log_for_checkpoint (OMTVALUE txnv, u_int32_t UU(index), void *UU(extra)) {
-    int r = 0;
-    TOKUTXN    txn    = txnv;
-    if (txn->pinned_inprogress_rollback_log) {
-        r = toku_rollback_log_unpin(txn, txn->pinned_inprogress_rollback_log);
-        assert(r==0);
-    }
-    return r;
-}
-
 // TODO: #1510 locking of cachetable is suspect
 //             verify correct algorithm overall

@@ -3596,12 +3585,6 @@ toku_cachetable_begin_checkpoint (CACHETABLE ct, TOKULOGGER logger) {
    {
        brt_begin_checkpoint();
        unsigned i;
-	if (logger) { // Unpin all 'inprogress rollback log nodes' pinned by transactions
-            int r = toku_omt_iterate(logger->live_txns,
-                                     unpin_rollback_log_for_checkpoint,
-                                     NULL);
-            assert(r==0);
-        }
 	cachetable_lock(ct);
 	//Initialize accountability counters
 	ct->checkpoint_num_files = 0;

--- a/newbrt/log-internal.h
+++ b/newbrt/log-internal.h
@@ -145,17 +145,37 @@ struct tokutxn {
    BOOL       force_fsync_on_commit;  //This transaction NEEDS an fsync once (if) it commits.  (commit means root txn)
    TXN_PROGRESS_POLL_FUNCTION progress_poll_fun;
    void *                     progress_poll_fun_extra;
+
+    // these are number of rollback nodes and rollback entries for this txn.
+    //
+    // the current rollback node below has sequence number num_rollback_nodes - 1
+    // (because they are numbered 0...num-1). often, the current rollback is
+    // already set to this block num, which means it exists and is available to
+    // log some entries. if the current rollback is NONE and the number of
+    // rollback nodes for this transaction is non-zero, then we will use
+    // the number of rollback nodes to know which sequence number to assign
+    // to a new one we create
    uint64_t   num_rollback_nodes;
    uint64_t   num_rollentries;
    uint64_t   num_rollentries_processed;
+
+    // spilled rollback nodes are rollback nodes that were gorged by this
+    // transaction, retired, and saved in a list.
+
+    // the spilled rollback head is the block number of the first rollback node
+    // that makes up the rollback log chain
    BLOCKNUM   spilled_rollback_head;
    uint32_t   spilled_rollback_head_hash;
+    // the spilled rollback tail is the block number of the last rollback node that
+    // makes up the rollback log chain. 
    BLOCKNUM   spilled_rollback_tail;
    uint32_t   spilled_rollback_tail_hash;
-    BLOCKNUM   current_rollback;
+    // the current rollback node block number we may use. if this is ROLLBACK_NONE,
+    // then we need to create one and set it here before using it.
+    BLOCKNUM   current_rollback; 
    uint32_t   current_rollback_hash;
+
    BOOL       recovered_from_checkpoint;
-    ROLLBACK_LOG_NODE pinned_inprogress_rollback_log;
    struct toku_list checkpoint_before_commit;
    TXN_IGNORE_S ignore_errors; // 2954
    TOKUTXN_STATE state;

--- a/newbrt/logformat.c
+++ b/newbrt/logformat.c
@@ -580,10 +580,8 @@ generate_rollbacks (void) {

 		    fprintf(hf, ");\n");
 		    fprintf(cf, ") {\n");
-                    fprintf(cf, "  int r;\n");
                    fprintf(cf, "  ROLLBACK_LOG_NODE log;\n");
-                    fprintf(cf, "  r = toku_get_and_pin_rollback_log_for_new_entry(txn, &log);\n");
-                    fprintf(cf, "  assert(r==0);\n");
+                    fprintf(cf, "  toku_get_and_pin_rollback_log_for_new_entry(txn, &log);\n");
 		    // 'memdup' all BYTESTRINGS here
 		    DO_FIELDS(ft, lt, {
                        if ( strcmp(ft->type, "BYTESTRING") == 0 ) {
@@ -620,7 +618,10 @@ generate_rollbacks (void) {
 		    fprintf(cf, "  txn->rollentry_raw_count          += rollback_fsize;\n");
                    fprintf(cf, "  txn->num_rollentries++;\n");
                    fprintf(cf, "  log->dirty = TRUE;\n");
-		    fprintf(cf, "  return toku_maybe_spill_rollbacks(txn, log);\n}\n");
+		    fprintf(cf, "  // spill and unpin assert success internally\n");
+		    fprintf(cf, "  toku_maybe_spill_rollbacks(txn, log);\n");
+		    fprintf(cf, "  toku_rollback_log_unpin(txn, log);\n");
+		    fprintf(cf, "  return 0;\n}\n");
 	    });

    DO_ROLLBACKS(lt, {

--- a/newbrt/roll.c
+++ b/newbrt/roll.c
@@ -6,9 +6,23 @@
 /* rollback and rollforward routines. */

 #include "includes.h"
-#include "checkpoint.h"
 #include "xids.h"
-#include "roll.h"
+
+// functionality provided by roll.c is exposed by an autogenerated
+// header file, logheader.h
+//
+// this (poorly) explains the absence of "roll.h"
+
+// these flags control whether or not we send commit messages for
+// various operations
+
+// When a transaction is committed, should we send a BRT_COMMIT message
+// for each BRT_INSERT message sent earlier by the transaction?
+#define TOKU_DO_COMMIT_CMD_INSERT 0
+
+// When a transaction is committed, should we send a BRT_COMMIT message
+// for each BRT_DELETE_ANY message sent earlier by the transaction?
+#define TOKU_DO_COMMIT_CMD_DELETE 1

 int
 toku_commit_fdelete (u_int8_t   file_was_open,
@@ -305,7 +319,7 @@ toku_apply_rollinclude (TXNID      xid,
                        void *     yieldv,
                        LSN        oplsn,
                        apply_rollback_item func) {
-    int r;
+    int r = 0;
    struct roll_entry *item;
    int count=0;

@@ -316,14 +330,13 @@ toku_apply_rollinclude (TXNID      xid,
    BOOL found_head = FALSE;
    assert(next_log.b != ROLLBACK_NONE.b);
    while (next_log.b != ROLLBACK_NONE.b) {
-        ROLLBACK_LOG_NODE log;
        //pin log
-        r = toku_get_and_pin_rollback_log(txn, xid, last_sequence - 1, next_log, next_log_hash, &log);
-        assert(r==0);
+        ROLLBACK_LOG_NODE log;
+        toku_get_and_pin_rollback_log(txn, next_log, next_log_hash, &log);
+        toku_rollback_verify_contents(log, xid, last_sequence - 1);
        last_sequence = log->sequence;
        
-        r = toku_maybe_prefetch_older_rollback_log(txn, log);
-        assert(r==0);
+        toku_maybe_prefetch_previous_rollback_log(txn, log);

        while ((item=log->newest_logentry)) {
            log->newest_logentry = item->prev;
@@ -337,8 +350,8 @@ toku_apply_rollinclude (TXNID      xid,
            found_head = TRUE;
            assert(log->sequence == 0);
        }
-        next_log      = log->older;
-        next_log_hash = log->older_hash;
+        next_log      = log->previous;
+        next_log_hash = log->previous_hash;
        {
            //Clean up transaction structure to prevent
            //toku_txn_close from double-freeing
@@ -350,9 +363,7 @@ toku_apply_rollinclude (TXNID      xid,
                spilled_head_hash = next_log_hash;
            }
        }
-        //Unpins log
-        r = toku_delete_rollback_log(txn, log);
-        assert(r==0);
+        toku_rollback_log_unpin_and_remove(txn, log);
    }
    return r;
 }

--- a/newbrt/roll.h
+++ b/newbrt/roll.h
-/* -*- mode: C; c-basic-offset: 4 -*- */
-#ident "$Id$"
-#ident "Copyright (c) 2007-2010 Tokutek Inc.  All rights reserved."
-#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
-
-#ifndef TOKUDB_ROLL_H
-#define TOKUDB_ROLL_H
-
-#if defined(__cplusplus) || defined(__cilkplusplus)
-extern "C" {
-#endif
-
-// these flags control whether or not we send commit messages for
-// various operations
-
-// When a transaction is committed, should we send a BRT_COMMIT message
-// for each BRT_INSERT message sent earlier by the transaction?
-#define TOKU_DO_COMMIT_CMD_INSERT 0
-
-// When a transaction is committed, should we send a BRT_COMMIT message
-// for each BRT_DELETE_ANY message sent earlier by the transaction?
-#define TOKU_DO_COMMIT_CMD_DELETE 1
-
-// When a transaction is committed, should we send a BRT_COMMIT message
-// for each BRT_DELETE_BOTH message sent earlier by the transaction?
-#define TOKU_DO_COMMIT_CMD_DELETE_BOTH 1
-
-#if defined(__cplusplus) || defined(__cilkplusplus)
-};
-#endif
-
-#endif
-
--- a/newbrt/rollback.c
+++ b/newbrt/rollback.c
--- a/newbrt/rollback.h
+++ b/newbrt/rollback.h
@@ -11,56 +11,79 @@
 extern "C" {
 #endif

-// these routines in rollback.c
-
 void toku_poll_txn_progress_function(TOKUTXN txn, uint8_t is_commit, uint8_t stall_for_checkpoint);
 int toku_rollback_commit(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn);
 int toku_rollback_abort(TOKUTXN txn, YIELDF yield, void*yieldv, LSN lsn);
 void toku_rollback_txn_close (TOKUTXN txn);
-int toku_get_and_pin_rollback_log_for_new_entry (TOKUTXN txn, ROLLBACK_LOG_NODE *result);
-int toku_get_and_pin_rollback_log(TOKUTXN txn, TXNID xid, uint64_t sequence, BLOCKNUM name, uint32_t hash, ROLLBACK_LOG_NODE *result);
-int toku_maybe_prefetch_older_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log);
-int toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log);
-int toku_unpin_inprogress_rollback_log(TOKUTXN txn);
-int toku_delete_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log);

-typedef int(*apply_rollback_item)(TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
+// these functions assert internally that they succeed
+
+// get a rollback node this txn may use for a new entry. if there
+// is a current rollback node to use, pin it, otherwise create one.
+void toku_get_and_pin_rollback_log_for_new_entry(TOKUTXN txn, ROLLBACK_LOG_NODE *log);
+
+// get a specific rollback by blocknum and hash
+void toku_get_and_pin_rollback_log(TOKUTXN txn, BLOCKNUM blocknum, uint32_t hash, ROLLBACK_LOG_NODE *log);
+
+// unpin a rollback node from the cachetable
+void toku_rollback_log_unpin(TOKUTXN txn, ROLLBACK_LOG_NODE log);
+
+// assert that the given log's txnid and sequence match the ones given
+void toku_rollback_verify_contents(ROLLBACK_LOG_NODE log, TXNID txnid, uint64_t sequence);

+// if there is a previous rollback log for the given log node, prefetch it
+void toku_maybe_prefetch_previous_rollback_log(TOKUTXN txn, ROLLBACK_LOG_NODE log);
+
+// unpin and remove a rollback log from the cachetable
+void toku_rollback_log_unpin_and_remove(TOKUTXN txn, ROLLBACK_LOG_NODE log);
+
+typedef int(*apply_rollback_item)(TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
 int toku_commit_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);
 int toku_abort_rollback_item (TOKUTXN txn, struct roll_entry *item, YIELDF yield, void*yieldv, LSN lsn);

 void *toku_malloc_in_rollback(ROLLBACK_LOG_NODE log, size_t size);
 void *toku_memdup_in_rollback(ROLLBACK_LOG_NODE log, const void *v, size_t len);
-int toku_maybe_spill_rollbacks (TOKUTXN txn, ROLLBACK_LOG_NODE log);
+
+// given a transaction and a log node, and if the log is too full,
+// set the current rollback log to ROLLBACK_NONE and move the current
+// node onto the tail of the rollback node chain. further insertions
+// into the rollback log for this transaction will force the creation
+// of a new rollback log.
+//
+// this never unpins the rollback log if a spill occurs. the caller
+// is responsible for ensuring the given rollback node is unpinned
+// if necessary.
+void toku_maybe_spill_rollbacks(TOKUTXN txn, ROLLBACK_LOG_NODE log);

 int toku_txn_note_brt (TOKUTXN txn, BRT brt);
 int toku_txn_note_swap_brt (BRT live, BRT zombie);
 int toku_txn_note_close_brt (BRT brt);
 int toku_logger_txn_rollback_raw_count(TOKUTXN txn, u_int64_t *raw_count);

-int toku_txn_find_by_xid (BRT brt, TXNID xid, TOKUTXN *txnptr);
 int toku_find_pair_by_xid (OMTVALUE v, void *txnv);
 int toku_find_xid_by_xid (OMTVALUE v, void *xidv);

-// these routines in roll.c
-int toku_rollback_fileentries (int fd, TOKUTXN txn, YIELDF yield, void *yieldv, LSN lsn);
-int toku_commit_fileentries (int fd, TOKUTXN txn, YIELDF yield,void *yieldv, LSN lsn);
-
-//Heaviside function to find a TOKUTXN by TOKUTXN (used to find the index)
-int find_xid (OMTVALUE v, void *txnv);
-
+// A high-level rollback log is made up of a chain of rollback log nodes.
+// Each rollback log node is represented (separately) in the cachetable by 
+// this structure. Each portion of the rollback log chain has a block num
+// and a hash to identify it.
 struct rollback_log_node {
    int                layout_version;
    int                layout_version_original;
    int                layout_version_read_from_disk;
    uint32_t           build_id;      // build_id (svn rev number) of software that wrote this node to disk
    int                dirty;
-    TXNID              txnid;         // Which transaction made this?
-    uint64_t           sequence;      // Which rollback log in the sequence is this?
-    BLOCKNUM           thislogname;   // Which block number is this chunk of the log?
-    uint32_t           thishash;
-    BLOCKNUM           older;         // Which block number is the next oldest chunk of the log?
-    uint32_t           older_hash;
+    // to which transaction does this node belong?
+    TXNID              txnid;
+    // sequentially, where in the rollback log chain is this node? 
+    // the sequence is between 0 and totalnodes-1
+    uint64_t           sequence;
+    BLOCKNUM           blocknum; // on which block does this node live?
+    uint32_t           hash;
+    // which block number is the previous in the chain of rollback nodes 
+    // that make up this rollback log?
+    BLOCKNUM           previous; 
+    uint32_t           previous_hash;
    struct roll_entry *oldest_logentry;
    struct roll_entry *newest_logentry;
    MEMARENA           rollentry_arena;

--- a/newbrt/txn.c
+++ b/newbrt/txn.c
@@ -223,7 +223,6 @@ toku_txn_create_txn (
    result->current_rollback      = ROLLBACK_NONE;
    result->current_rollback_hash = 0;
    result->num_rollback_nodes = 0;
-    result->pinned_inprogress_rollback_log = NULL;
    result->snapshot_type = snapshot_type;
    result->snapshot_txnid64 = TXNID_NONE;
    result->container_db_txn = container_db_txn;

--- a/src/ydb_txn.c
+++ b/src/ydb_txn.c
@@ -285,26 +285,12 @@ static int
 locked_txn_commit_with_progress(DB_TXN *txn, u_int32_t flags,
                                TXN_PROGRESS_POLL_FUNCTION poll, void* poll_extra) {
    TOKUTXN ttxn = db_txn_struct_i(txn)->tokutxn;
-    //
-    // We must unpin rollback log, otherwise, another thread that tries to checkpoint during commit
-    // will grab the multi operation lock, and then not be able to complete the checkpoint because
-    // this thread has its rollback log pinned and is trying to grab the multi operation lock.
-    //
-    // We grab the ydb lock because the checkpoint thread also unpins inprogress rollback logs,
-    // so the ydb lock protects a race of both this thread and the checkpoint thread unpinning the
-    // inprogress rollback log. If we want, we can probably have the checkpoint thread to not
-    // unpin inprogress rollback logs, making this ydb lock grab unnecessary.
-    //
-    toku_ydb_lock();
-    int r = toku_unpin_inprogress_rollback_log(ttxn);
-    toku_ydb_unlock();
-    assert_zero(r);
    if (toku_txn_requires_checkpoint(ttxn)) {
        toku_checkpoint(txn->mgrp->i->cachetable, txn->mgrp->i->logger, NULL, NULL, NULL, NULL, TXN_COMMIT_CHECKPOINT);
    }
    toku_multi_operation_client_lock(); //Cannot checkpoint during a commit.
    toku_ydb_lock();
-    r = toku_txn_commit_only(txn, flags, poll, poll_extra, true); // the final 'true' says to release the multi_operation_client_lock
+    int r = toku_txn_commit_only(txn, flags, poll, poll_extra, true); // the final 'true' says to release the multi_operation_client_lock
    toku_ydb_unlock();
    toku_txn_destroy(txn);
    return r;