Commit abd5212b authored by Bradley C. Kuszmaul's avatar Bradley C. Kuszmaul Committed by Yoni Fogel

Merge r26784 into the main line.

Changes include patches to
 * Handle gcc 4.4.1's foibles.
 * Instrument {{{toku_os_pread}}}
 * Release the lock during I/Os for queries (#3039)
 * Simplification of the ydb lock (no more backoff)
 * Use {{{DB_TXN_NOSYNC}}} for several tests (which speeds them up).
 * Set the redzone to zero for several tests  (so they can run on machines with little free space). (#3111)
 * Add helgrind and valgrind suppressions to some tests.
 * Add dollar-id-dollar to several files.
 * Declare the schedule status values to be volatile.
 * Add a method to the toku {{{RWLOCK}}} to effectively grab the read lock and then release it quickly.  (With only one condition variable acquisition.)
Fixes #3039. close[t:3039]
Refs #3111,   #3112,   #3119,   #3125,   #3126,   #3128,   #3131,   #3133,   #3142,   #3152.
   [t:3111] [t:3112] [t:3119] [t:3125] [t:3126] [t:3128] [t:3131] [t:3133] [t:3142] [t:3152].


git-svn-id: file:///svn/toku/tokudb@26785 c7de825b-a66e-492c-adef-691d508d4ae1
parent cf3bbe92
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -279,6 +273,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -537,6 +532,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -281,6 +275,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -552,6 +547,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -281,6 +275,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -558,6 +553,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -281,6 +275,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -558,6 +553,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -282,6 +276,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -562,6 +557,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -73,6 +73,7 @@ enum {
TOKUDB_ACCEPT = -100009,
TOKUDB_MVCC_DICTIONARY_TOO_NEW = -100010,
TOKUDB_UPGRADE_FAILURE = -100011,
TOKUDB_TRY_AGAIN = -100012,
};
static void print_defines (void) {
......@@ -221,6 +222,7 @@ static void print_defines (void) {
dodefine(TOKUDB_ACCEPT);
dodefine(TOKUDB_MVCC_DICTIONARY_TOO_NEW);
dodefine(TOKUDB_UPGRADE_FAILURE);
dodefine(TOKUDB_TRY_AGAIN);
/* LOADER flags */
printf("/* LOADER flags */\n");
......@@ -455,19 +457,13 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf(" char creationtime[26]; /* time of environment creation */ \n");
printf(" char startuptime[26]; /* time of engine startup */ \n");
printf(" char now[26]; /* time of engine status query (i.e. now) */ \n");
printf(" u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */ \n");
printf(" u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */ \n");
printf(" u_int64_t processor_freq_mhz; /* clock frequency in MHz */ \n");
printf(" u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */ \n");
printf(" u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */ \n");
printf(" u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */ \n");
printf(" u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */ \n");
printf(" u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */ \n");
printf(" u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */ \n");
printf(" u_int64_t total_clients; /* total number of separate client threads that use ydb lock */ \n");
printf(" u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */ \n");
printf(" u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */ \n");
printf(" u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */ \n");
printf(" u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */\n");
printf(" u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */\n");
printf(" u_int32_t max_waiters; /* The maximum of num_waiters_now. */\n");
printf(" u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */\n");
printf(" u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */\n");
printf(" u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */\n");
printf(" u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */\n");
printf(" u_int32_t checkpoint_period; /* delay between automatic checkpoints */ \n");
printf(" u_int32_t checkpoint_footprint; /* state of checkpoint procedure */ \n");
printf(" char checkpoint_time_begin[26]; /* time of last checkpoint begin */ \n");
......@@ -749,6 +745,7 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf("int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) %s;\n", VISIBLE);
printf("int db_env_set_func_open (int (*)(const char *, int, int)) %s;\n", VISIBLE);
printf("int db_env_set_func_fclose (int (*)(FILE*)) %s;\n", VISIBLE);
printf("int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) %s;\n", VISIBLE);
printf("void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) %s;\n", VISIBLE);
printf("void db_env_set_checkpoint_callback (void (*)(void*), void*) %s;\n", VISIBLE);
printf("void db_env_set_checkpoint_callback2 (void (*)(void*), void*) %s;\n", VISIBLE);
......@@ -761,4 +758,3 @@ int main (int argc __attribute__((__unused__)), char *const argv[] __attribute__
printf("#endif\n");
return 0;
}
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -282,6 +276,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -496,6 +491,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -68,19 +68,13 @@ typedef struct __toku_engine_status {
char creationtime[26]; /* time of environment creation */
char startuptime[26]; /* time of engine startup */
char now[26]; /* time of engine status query (i.e. now) */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released? */
u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder if any)? */
u_int32_t max_waiters; /* The maximum of num_waiters_now. */
u_int64_t total_sleep_time; /* Total time spent (since the system was booted) sleeping (by the indexer) to give foreground threads a chance to work. */
u_int64_t max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
u_int64_t total_time_ydb_lock_held;/* Total time client threads held the ydb lock */
u_int64_t total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
u_int32_t checkpoint_period; /* delay between automatic checkpoints */
u_int32_t checkpoint_footprint; /* state of checkpoint procedure */
char checkpoint_time_begin[26]; /* time of last checkpoint begin */
......@@ -282,6 +276,7 @@ typedef enum {
#define TOKUDB_ACCEPT -100009
#define TOKUDB_MVCC_DICTIONARY_TOO_NEW -100010
#define TOKUDB_UPGRADE_FAILURE -100011
#define TOKUDB_TRY_AGAIN -100012
/* LOADER flags */
#define LOADER_USE_PUTS 1
typedef int (*generate_row_for_put_func)(DB *dest_db, DB *src_db, DBT *dest_key, DBT *dest_val, const DBT *src_key, const DBT *src_val);
......@@ -496,6 +491,7 @@ int db_env_set_func_fdopen (FILE* (*)(int, const char *)) __attribute__((__visib
int db_env_set_func_fopen (FILE* (*)(const char *, const char *)) __attribute__((__visibility__("default")));
int db_env_set_func_open (int (*)(const char *, int, int)) __attribute__((__visibility__("default")));
int db_env_set_func_fclose (int (*)(FILE*)) __attribute__((__visibility__("default")));
int db_env_set_func_pread (ssize_t (*)(int, void *, size_t, off_t)) __attribute__((__visibility__("default")));
void db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback (void (*)(void*), void*) __attribute__((__visibility__("default")));
void db_env_set_checkpoint_callback2 (void (*)(void*), void*) __attribute__((__visibility__("default")));
......
......@@ -107,6 +107,7 @@ static FILE * (*t_fopen)(const char *, const char *) = 0;
static int (*t_open)(const char *, int, int) = 0; // no implementation of variadic form until needed
static int (*t_fclose)(FILE *) = 0;
static ssize_t (*t_read)(int, void *, size_t) = 0;
static ssize_t (*t_pread)(int, void *, size_t, off_t) = 0;
int
toku_set_func_write (ssize_t (*write_fun)(int, const void *, size_t)) {
......@@ -164,6 +165,12 @@ toku_set_func_read (ssize_t (*read_fun)(int, void *, size_t)) {
return 0;
}
int
toku_set_func_pread (ssize_t (*pread_fun)(int, void *, size_t, off_t)) {
t_pread = pread_fun;
return 0;
}
void
toku_os_full_write (int fd, const void *buf, size_t len) {
const char *bp = (const char *) buf;
......@@ -320,6 +327,17 @@ toku_os_read(int fd, void *buf, size_t count) {
return r;
}
ssize_t
toku_os_pread (int fd, void *buf, size_t count, off_t offset) {
ssize_t r;
if (t_pread) {
r = t_pread(fd, buf, count, offset);
} else {
r = pread(fd, buf, count, offset);
}
return r;
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// fsync logic:
......
......@@ -1074,7 +1074,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
u_int8_t *XMALLOC_N(size, raw_block);
{
// read the (partially compressed) block
ssize_t rlen = pread(fd, raw_block, size, offset);
ssize_t rlen = toku_os_pread(fd, raw_block, size, offset);
lazy_assert((DISKOFF)rlen == size);
}
// get the layout_version
......@@ -1439,7 +1439,7 @@ deserialize_descriptor_from(int fd, struct brt_header *h, DESCRIPTOR desc) {
unsigned char *XMALLOC_N(size, dbuf);
{
lock_for_pwrite();
ssize_t r = pread(fd, dbuf, size, offset);
ssize_t r = toku_os_pread(fd, dbuf, size, offset);
lazy_assert(r==size);
unlock_for_pwrite();
}
......@@ -1525,7 +1525,7 @@ deserialize_brtheader (int fd, struct rbuf *rb, struct brt_header **brth) {
unsigned char *XMALLOC_N(translation_size_on_disk, tbuf);
{
// This cast is messed up in 32-bits if the block translation table is ever more than 4GB. But in that case, the translation table itself won't fit in main memory.
ssize_t r = pread(fd, tbuf, translation_size_on_disk, translation_address_on_disk);
ssize_t r = toku_os_pread(fd, tbuf, translation_size_on_disk, translation_address_on_disk);
lazy_assert(r==translation_size_on_disk);
}
unlock_for_pwrite();
......@@ -1614,7 +1614,7 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, str
4; // size
unsigned char prefix[prefix_size];
rb->buf = NULL;
int64_t n = pread(fd, prefix, prefix_size, offset_of_header);
int64_t n = toku_os_pread(fd, prefix, prefix_size, offset_of_header);
if (n==0) r = TOKUDB_DICTIONARY_NO_HEADER;
else if (n<0) {r = errno; lazy_assert(r!=0);}
else if (n!=prefix_size) r = EINVAL;
......@@ -1659,7 +1659,7 @@ deserialize_brtheader_from_fd_into_rbuf(int fd, toku_off_t offset_of_header, str
rb->buf = toku_xmalloc(rb->size);
}
if (r==0) {
n = pread(fd, rb->buf, rb->size, offset_of_header);
n = toku_os_pread(fd, rb->buf, rb->size, offset_of_header);
if (n==-1) {
r = errno;
lazy_assert(r!=0);
......
......@@ -4564,7 +4564,8 @@ brt_search_child(BRT brt, BRTNODE node, int childnum, brt_search_t *search, BRT_
BLOCKNUM childblocknum = BNC_BLOCKNUM(node,childnum);
u_int32_t fullhash = compute_child_fullhash(brt->cf, node, childnum);
{
int rr = toku_cachetable_get_and_pin(brt->cf, childblocknum, fullhash, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
int rr = toku_cachetable_get_and_pin_nonblocking(brt->cf, childblocknum, fullhash, &node_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
if (rr==TOKUDB_TRY_AGAIN) return rr;
lazy_assert_zero(rr);
}
......@@ -4660,6 +4661,8 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf,
{
int r, rr;
try_again:
lazy_assert(brt->h);
*root_put_counter = brt->h->root_put_counter;
......@@ -4690,6 +4693,8 @@ toku_brt_search (BRT brt, brt_search_t *search, BRT_GET_CALLBACK_FUNCTION getf,
rr = toku_unpin_brtnode(brt, node);
lazy_assert_zero(rr);
if (r==TOKUDB_TRY_AGAIN) goto try_again;
//Heaviside function (+direction) queries define only a lower or upper
//bound. Some queries require both an upper and lower bound.
//They do this by wrapping the BRT_GET_CALLBACK_FUNCTION with another
......
......@@ -159,6 +159,10 @@ struct cachetable {
OMT reserved_filenums;
char *env_dir;
BOOL set_env_dir; //Can only set env_dir once
// For releasing locks during I/O. These are named "ydb_lock_callback" but it could be viewed more generally as being used to release and reacquire locks while I/O is taking place.
void (*ydb_lock_callback)(void);
void (*ydb_unlock_callback)(void);
};
// Lock the cachetable
......@@ -324,6 +328,14 @@ toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir) {
ct->set_env_dir = TRUE;
}
void
toku_cachetable_set_lock_unlock_for_io (CACHETABLE ct, void (*ydb_lock_callback)(void), void (*ydb_unlock_callback)(void))
// Effect: Register the callbacks used to release and reacquire external locks
//  (e.g., the ydb lock) around blocking I/O: ydb_unlock_callback releases the lock
//  before the I/O starts and ydb_lock_callback reacquires it afterward.
//  Either callback may be NULL; the call sites check for NULL before invoking them.
{
    ct->ydb_lock_callback = ydb_lock_callback;
    ct->ydb_unlock_callback = ydb_unlock_callback;
}
//
// Increment the reference count
// MUST HOLD cachetable lock
......@@ -1128,6 +1140,7 @@ static int cachetable_fetch_pair(CACHETABLE ct, CACHEFILE cf, PAIR p) {
return 0;
}
p->state = CTPAIR_IDLE;
rwlock_write_unlock(&p->rwlock);
if (0) printf("%s:%d %"PRId64" complete\n", __FUNCTION__, __LINE__, key.b);
return 0;
......@@ -1415,9 +1428,9 @@ static CACHEKEY get_and_pin_key = {0};
static u_int32_t get_and_pin_fullhash = 0;
int toku_cachetable_get_and_pin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs) {
int toku_cachetable_get_and_pin (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs) {
CACHETABLE ct = cachefile->cachetable;
PAIR p;
int count=0;
......@@ -1625,10 +1638,76 @@ int toku_cachetable_unpin(CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash,
return r;
}
int toku_cachetable_get_and_pin_nonblocking (CACHEFILE cf, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep,
                                             CACHETABLE_FLUSH_CALLBACK flush_callback,
                                             CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs)
// Effect: If the block is in the cachetable and idle, pin it (read lock) and return 0.
//  Otherwise release the ydb lock (via ct->ydb_unlock_callback, if set), wait for the
//  in-flight I/O or perform the fetch (but don't pin the pair, since we'll just end up
//  pinning it again later), reacquire the ydb lock (via ct->ydb_lock_callback), and
//  return TOKUDB_TRY_AGAIN so the caller can retry from the top.
// Returns: 0 on a hit, TOKUDB_TRY_AGAIN after a wait/fetch, or the fetch's error code.
{
    CACHETABLE ct = cf->cachetable;
    cachetable_lock(ct);
    int count = 0;
    PAIR p;
    // Hash-chain lookup, same scheme as toku_cachetable_get_and_pin.
    for (p = ct->table[fullhash&(ct->table_size-1)]; p; p = p->hash_chain) {
        count++;
        if (p->key.b==key.b && p->cachefile==cf) {
            note_hash_count(count);
            // If I/O is currently happening (p->state==CTPAIR_READING or CTPAIR_WRITING) then we must wait for it.
            // If a checkpoint is pending (p->checkpoint_pending) then the write must be made to happen:
            //  that path (write_pair_for_checkpoint) locks p->rwlock, sets p->state=CTPAIR_WRITING, and
            //  eventually unlocks p->rwlock; fortunately it releases the cachetable mutex while the write
            //  is happening (and then reacquires it).
            // We want to drop the cachetable mutex while waiting:
            //  - For CTPAIR_READING that's easy: waiting on p->rwlock releases the mutex during the wait.
            //  - For CTPAIR_WRITING, whether we or someone else performs the write, cachetable_write_pair
            //    releases the mutex, so waiting on p->rwlock works there too.
            // Right now we hold the cachetable lock, so no one else is modifying p.
            switch (p->state) {
            case CTPAIR_INVALID: assert(0);
            case CTPAIR_READING:
            case CTPAIR_WRITING:
                // Drop the ydb lock for the duration of the wait (NULL-checked; see
                // toku_cachetable_set_lock_unlock_for_io).
                if (ct->ydb_unlock_callback) ct->ydb_unlock_callback();
                // We need to wait for the I/O to finish (i.e., until the read lock would be
                // grantable), but we'd only release the read lock immediately anyway since
                // we're going to retry.  So instead of lock-then-unlock we use this single
                // condition-variable wait.
                rwlock_read_lock_and_unlock(&p->rwlock, ct->mutex); // recall that this call releases and reacquires the ct->mutex while waiting.
                cachetable_unlock(ct);
                if (ct->ydb_lock_callback) ct->ydb_lock_callback();
                return TOKUDB_TRY_AGAIN;
            case CTPAIR_IDLE:
                // Hit: pin (read-lock) the pair and hand it back.
                rwlock_read_lock(&p->rwlock, ct->mutex);
                lru_touch(ct, p);
                *value = p->value;
                if (sizep) *sizep = p->size;
                cachetable_hit++;
                cachetable_unlock(ct);
                return 0;
            }
            assert(0); // cannot get here
        }
    }
    assert(p==0);
    // Not found: insert a placeholder pair in CTPAIR_READING state, fetch it with the
    // ydb lock released, and tell the caller to retry (the pair is left unpinned).
    p = cachetable_insert_at(ct, cf, key, zero_value, CTPAIR_READING, fullhash, zero_size, flush_callback, fetch_callback, extraargs, CACHETABLE_CLEAN);
    assert(p);
    rwlock_write_lock(&p->rwlock, ct->mutex);
    if (ct->ydb_unlock_callback) ct->ydb_unlock_callback();
    int r = cachetable_fetch_pair(ct, cf, p); // NOTE(review): appears to release p->rwlock itself on success — confirm against cachetable_fetch_pair
    cachetable_unlock(ct);
    if (ct->ydb_lock_callback) ct->ydb_lock_callback();
    if (r!=0) return r;
    else return TOKUDB_TRY_AGAIN;
}
int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
void *extraargs) {
void *extraargs)
// Effect: See the documentation for this function in cachetable.h
{
if (0) printf("%s:%d %"PRId64"\n", __FUNCTION__, __LINE__, key.b);
CACHETABLE ct = cf->cachetable;
cachetable_lock(ct);
......
......@@ -165,6 +165,12 @@ int toku_cachetable_get_and_pin(CACHEFILE, CACHEKEY, u_int32_t /*fullhash*/,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs);
// Effect: If the block is in the cachetable, then pin it and return it.
// Otherwise call the unlock callback, fetch the data (but don't pin it, since we'll just end up pinning it again later),
// and return TOKUDB_TRY_AGAIN.
int toku_cachetable_get_and_pin_nonblocking (CACHEFILE cachefile, CACHEKEY key, u_int32_t fullhash, void**value, long *sizep,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback, void *extraargs);
// Maybe get and pin a memory object.
// Effects: This function is identical to the get_and_pin function except that it
// will not attempt to fetch a memory object that is not in the cachetable.
......@@ -191,12 +197,28 @@ int toku_cachetable_unpin_and_remove (CACHEFILE, CACHEKEY); /* Removing somethin
// Effect: Remove an object from the cachetable. Don't write it back.
// Requires: The object must be pinned exactly once.
// Prefetch a memory object for a given key into the cachetable
// Returns: 0 if success
int toku_cachefile_prefetch(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
CACHETABLE_FLUSH_CALLBACK flush_callback,
CACHETABLE_FETCH_CALLBACK fetch_callback,
void *extraargs);
// Effect: Prefetch a memory object for a given key into the cachetable
// Precondition: The cachetable mutex is NOT held.
// Postcondition: The cachetable mutex is NOT held.
// Returns: 0 if success
// Implementation Note:
// 1) The pair's rwlock is acquired (for write) (there is not a deadlock here because the rwlock is a pthread_cond_wait using the cachetable mutex).
// Case A: Single-threaded.
// A1) Call cachetable_fetch_pair, which
// a) Obtains a readlock on the cachefile's fd (to prevent multiple readers at once)
// b) Unlocks the cachetable
// c) Does the fetch off disk.
// d) Locks the cachetable
// e) Unlocks the fd lock.
// f) Unlocks the pair rwlock.
// Case B: Multithreaded
// a) Enqueue a cachetable_reader into the workqueue.
// b) Unlock the cache table.
// c) The enqueue'd job later locks the cachetable, and calls cachetable_fetch_pair (doing the steps in A1 above).
int toku_cachetable_assert_all_unpinned (CACHETABLE);
......@@ -328,6 +350,11 @@ void toku_cachetable_set_env_dir(CACHETABLE ct, const char *env_dir);
char * toku_construct_full_name(int count, ...);
char * toku_cachetable_get_fname_in_cwd(CACHETABLE ct, const char * fname_in_env);
void toku_cachetable_set_lock_unlock_for_io (CACHETABLE ct, void (*ydb_lock_callback)(void), void (*ydb_unlock_callback)(void));
// Effect: When we do I/O we may need to release locks (e.g., the ydb lock). These callbacks release the lock and then reacquire it.
int toku_cachetable_local_checkpoint_for_commit(CACHETABLE ct, TOKUTXN txn, uint32_t n, CACHEFILE cachefiles[]);
// test-only function
......
......@@ -101,5 +101,3 @@ is_key_right_of_le_cursor(LE_CURSOR le_cursor, const DBT *key, DB *keycompare_db
}
return result;
}
......@@ -75,6 +75,20 @@ static inline void rwlock_read_lock(RWLOCK rwlock, toku_pthread_mutex_t *mutex)
rwlock->reader++;
}
static inline void rwlock_read_lock_and_unlock (RWLOCK rwlock, toku_pthread_mutex_t *mutex)
// Effect: Has the effect of obtaining a read lock and then unlocking it.
// Implementation note: This can be done faster than actually doing the lock/unlock:
//  we wait until no writer holds or wants the lock, but we never increment the
//  reader count, so there is nothing to undo afterwards (one condition-variable
//  wait instead of a full lock/unlock pair).
// Usage note: This is useful when we are waiting on someone who has the write lock,
//  but then we are just going to try again from the top (e.g., when releasing the
//  ydb lock during cachetable I/O).
// Expects: mutex is locked; it is released while waiting in toku_pthread_cond_wait.
{
    if (rwlock->writer || rwlock->want_write) {
        rwlock->want_read++; // register as a waiting reader (presumably so the writer signals wait_read on release — see the other read-lock paths)
        while (rwlock->writer || rwlock->want_write) {
            int r = toku_pthread_cond_wait(&rwlock->wait_read, mutex); assert(r == 0);
        }
        rwlock->want_read--;
    }
    // Don't increment reader: the caller never actually needs to hold the read lock.
}
// preferentially obtain a read lock (ignore request for write lock)
// expects: mutex is locked
......
......@@ -112,10 +112,13 @@ walk_tree(const char *fname, int n) {
int i;
for (i = 0; ; i++) {
error = le_cursor_next(cursor, &val);
error = TOKUDB_TRY_AGAIN;
while (error == TOKUDB_TRY_AGAIN) {
error = le_cursor_next(cursor, &val);
}
if (error != 0)
break;
LEAFENTRY le = (LEAFENTRY) val.data;
assert(le->type == LE_MVCC);
assert(le->keylen == sizeof (int));
......
{
compress_is_still_not_valgrind_clean_in_ubuntu_3
Memcheck:Param
write(buf)
obj:/lib/libpthread-2.10.1.so
fun:toku_os_write
fun:write_nonleaf_node
}
{
compress_is_still_not_valgrind_clean_in_ubuntu_2
Memcheck:Param
write(buf)
obj:/lib/libpthread-2.10.1.so
fun:toku_os_write
fun:finish_leafnode
}
{
compress_is_still_not_valgrind_clean_in_ubuntu
Memcheck:Param
pwrite64(buf)
obj:/lib/libpthread-2.10.1.so
fun:toku_os_full_pwrite
fun:toku_serialize_brtnode_to
}
{
compress_is_not_valgrind_clean
Memcheck:Cond
......
This diff is collapsed.
......@@ -17,6 +17,7 @@
db_env_set_func_fopen;
db_env_set_func_open;
db_env_set_func_fclose;
db_env_set_func_pread;
db_env_set_func_loader_fwrite;
db_env_set_checkpoint_callback;
db_env_set_checkpoint_callback2;
......
......@@ -676,11 +676,12 @@ recovery_fileops_unit.tdbrun: recovery_fileops_unit.tdb$(BINSUF) $(PTHREAD_LOCAL
# helgrind1 is supposed to fail.
helgrind1.tdbrun: TDBVGRIND=$(HGRIND) --log-file=helgrind1.tdb.deleteme
helgrind1.tdbrun: MAYBEINVERTER=$(INVERTER)
helgrind2.tdbrun: TDBVGRIND=$(HGRIND)
helgrind2.bdbrun: BDBVGRIND=$(HGRIND)
helgrind3.tdbrun: TDBVGRIND=$(HGRIND)
helgrind2.tdbrun helgrind3.tdbrun: TDBVGRIND=$(HGRIND)
helgrind2.tdbrun helgrind3.tdbrun: HGRIND+=--suppressions=helgrind.suppressions
helgrind3.bdbrun: BDBVGRIND=$(HGRIND)
test_groupcommit_count_hgrind.tdbrun: HGRIND+=--suppressions=helgrind.suppressions
test_groupcommit_count_hgrind.tdbrun: test_groupcommit_count.tdb$(BINSUF)
$(HGRIND) ./test_groupcommit_count.tdb$(BINSUF) $(VERBVERBOSE) -n 1 -p hgrind $(SUMMARIZE_CMD)
......@@ -707,6 +708,9 @@ test_thread_stack.%run: test_thread_stack.%$(BINSUF) $(PTHREAD_LOCAL)
./$< -a -thread_stack 16384 && \
./$< -a -thread_stack 16384 -resume $(SUMMARIZE_CMD)
preload-db-nested.tdbrun: VGRIND=
upgrade-test-4.tdbrun: VGRIND=
loader-stress-test.loader: $(patsubst %,loader-stress-test%.tdbrun, 0 1 2 3)
true
......@@ -773,6 +777,9 @@ CHECK_HOTINDEXER_UNDO_TESTS = $(patsubst %.test,%.run,$(HOTINDEXER_UNDO_TESTS))
hotindexer-undo-do-test.tdbrun: $(CHECK_HOTINDEXER_UNDO_TESTS)
true
test3039.tdbrun: test3039.tdb
( ($(VGRIND) ./$< -v -n 1000 ) && ./$< -v ) $(SUMMARIZE_CMD)
$(CHECK_HOTINDEXER_UNDO_TESTS): %.run: %.test hotindexer-undo-do-test.tdb$(BINSUF)
./run-hotindexer-undo-do-tests.bash $< $(SUMMARIZE_CMD)
......
......@@ -91,6 +91,8 @@ setup (void) {
r=env->set_lk_max_locks(env, N); CKERR(r);
#ifndef TOKUDB
r=env->set_lk_max_objects(env, N); CKERR(r);
#else
r=env->set_redzone(env, 0); CKERR(r);
#endif
env->set_errfile(env, stderr);
r=env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
......
......@@ -31,6 +31,7 @@ do_db_work(void) {
DBT key,data;
r=db_env_create(&env, 0); assert(r==0);
r = env->set_redzone(env, 0); CKERR(r);
env->set_errfile(env, error_file ? error_file : stderr);
// Don't set the lg bsize for the small experiment.
r=env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE|DB_THREAD, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
......@@ -75,6 +76,7 @@ do_db_work(void) {
// Repeat with more put operations
r=db_env_create(&env, 0); assert(r==0);
r = env->set_redzone(env, 0); CKERR(r);
env->set_errfile(env, error_file ? error_file : stderr);
r=env->set_lg_bsize(env, 4096); assert(r==0);
r=env->set_cachesize(env, 0, 1, 1); assert(r==0);
......
......@@ -22,6 +22,7 @@ static void initialize (void) {
// setup environment
{
r = db_env_create(&env, 0); assert(r == 0);
r = env->set_redzone(env, 0); CKERR(r);
env->set_errfile(env, stdout);
r = env->open(env, ENVDIR, DB_INIT_MPOOL + DB_PRIVATE + DB_CREATE, 0777);
assert(r == 0);
......
......@@ -22,6 +22,7 @@ static void initialize (void) {
// setup environment
{
r = db_env_create(&env, 0); assert(r == 0);
r = env->set_redzone(env, 0); CKERR(r);
env->set_errfile(env, stdout);
r = env->open(env, ENVDIR, DB_INIT_MPOOL + DB_PRIVATE + DB_CREATE, 0777);
assert(r == 0);
......
......@@ -389,6 +389,7 @@ run_test(char *envdir, char *testname) {
int r;
DB_ENV *env = NULL;
r = db_env_create(&env, 0); assert_zero(r);
r = env->set_redzone(env, 0); assert_zero(r);
r = env->set_generate_row_callback_for_put(env, put_callback); assert_zero(r);
......
......@@ -107,7 +107,7 @@ check_results_nested(DB ** dbs, const uint num_rows) {
}
r = cursor->c_close(cursor);
CKERR(r);
r = txn->commit(txn, 0);
r = txn->commit(txn, DB_TXN_NOSYNC);
CKERR(r);
}
if ( verbose ) {printf("ok");fflush(stdout);}
......@@ -177,7 +177,7 @@ nested_insert(DB ** dbs, uint depth, DB_TXN *parent_txn, uint k, uint generated
printf("abort k = %d, v= %d, depth = %d\n", k, v, depth);
}
else {
r = txn->commit(txn, 0);
r = txn->commit(txn, DB_TXN_NOSYNC);
CKERR(r);
if (verbose>=3)
printf("commit k = %d, v= %d, depth = %d\n", k, v, depth);
......
/* This is a performance test. Releasing lock during I/O should mean that given two threads doing queries,
* and one of them is in-memory and one of them is out of memory, then the in-memory one should not be slowed down by the out-of-memory one.
*
* Step 1: Create a dictionary that doesn't fit in main memory. Do it fast (sequential insertions).
* Step 2: Measure performance of in-memory requests.
* Step 3: Add a thread that does requests in parallel.
*/
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#ident "$Id$"
#include "test.h"
#include <string.h>
#include <toku_time.h>
#include <toku_pthread.h>
static const int envflags = DB_INIT_MPOOL|DB_CREATE|DB_THREAD |DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_TXN|DB_PRIVATE;
#define ROWSIZE 100
static const char dbname[] = "data.db";
static unsigned long long n_rows;
static DB_ENV *env = NULL;
static DB *db;
// BDB cannot handle big transactions by default (runs out of locks).
#ifdef TOKUDB
#define N_PER_XACTION 10000
#else
#define N_PER_XACTION 1000
#endif
// Step 1 of the test: build a dictionary of N rows (sequential 16-hex-digit keys,
// pseudo-random ~100-byte data) that does not fit in the deliberately tiny
// cachetable, so later point queries must hit disk.  Inserts are batched into
// transactions of N_PER_XACTION puts because BDB runs out of locks on huge
// transactions (see the #ifdef above).
static void create_db (u_int64_t N) {
    n_rows = N;
    { int r = system("rm -rf " ENVDIR); CKERR(r); }
    toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);
    { int r = db_env_create(&env, 0); CKERR(r); }
    env->set_errfile(env, stderr);
#ifdef TOKUDB
    env->set_redzone(env, 0); // let the test run on machines with little free disk space
#endif
    { int r = env->set_cachesize(env, 0, 400*4096, 1); CKERR(r); } // tiny cache: forces out-of-memory behavior
    { int r = env->open(env, ENVDIR, envflags, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r); }
    DB_TXN *txn;
    { int r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); }
    { int r = db_create(&db, env, 0); CKERR(r); }
    { int r = db->set_pagesize(db, 4096); CKERR(r); }
    { int r = db->open(db, txn, dbname, NULL, DB_BTREE, DB_CREATE, 0666); CKERR(r); }
    { int r = txn->commit(txn, DB_TXN_NOSYNC); CKERR(r); }
    { int r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); }
    u_int64_t n_since_commit = 0;
    for (unsigned long long i=0; i<N; i++) {
        if (n_since_commit++ > N_PER_XACTION) {
            { int r = txn->commit(txn, DB_TXN_NOSYNC); CKERR(r); }
            { int r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); }
            // BUGFIX: reset the batch counter (as reader_thread does).  Without this,
            // after the first batch every single put got its own commit, defeating the
            // N_PER_XACTION batching entirely.
            n_since_commit = 0;
        }
        char key[20];
        char data[200];
        snprintf(key, sizeof(key), "%016llx", i);
        snprintf(data, sizeof(data), "%08lx%08lx%66s", random(), random()%16, "");
        DBT keyd, datad;
        {
            int r = db->put(db, txn, dbt_init(&keyd, key, strlen(key)+1), dbt_init(&datad, data, strlen(data)+1), DB_YESOVERWRITE);
            CKERR(r);
        }
    }
    //printf("n_rows=%lld\n", n_rows);
    { int r = txn->commit(txn, DB_TXN_NOSYNC); CKERR(r); }
}
// Per-thread argument/result record handed to reader_thread() as its void* arg.
struct reader_thread_state {
    /* output (written by the thread before it exits) */
    double elapsed_time;            // wall-clock seconds spent in the read loop
    unsigned long long n_did_read;  // number of db->get calls completed
    /* input (set before the thread is created) */
    signed long long n_to_read; // Negative if we just run forever
    int do_local;               // nonzero: reread a small fixed set of rows (stays in memory); zero: uniform-random rows
    /* communicate to the thread while running */
    volatile int finish;        // set by another thread to ask this one to stop
};
static
void* reader_thread (void *arg)
// Thread body for the query-performance measurement.  arg is a struct
// reader_thread_state*.  Repeatedly does db->get point queries — on a small fixed
// set of N_DISTINCT rows if do_local, else on uniform-random rows — until either
// n_to_read reads are done (if nonnegative) or finish is set.  Records the read
// count and elapsed wall-clock time into the state struct.  Returns NULL.
{
    struct timeval start_time, end_time;
    gettimeofday(&start_time, 0);
    DB_TXN *txn;
    struct reader_thread_state *rs = (struct reader_thread_state *)arg;
    { int r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); }
    char key[20];
    char data [200];
    // DB_DBT_USERMEM: let db->get fill our stack buffers instead of allocating.
    DBT keyd = { .data = key, .size = 0, .ulen = sizeof(key), .flags = DB_DBT_USERMEM };
    DBT datad = { .data = data, .size = 0, .ulen = sizeof(data), .flags = DB_DBT_USERMEM };
#define N_DISTINCT 16
    unsigned long long vals[N_DISTINCT];
    if (rs->do_local) {
        // Pick the small working set up front; rereading only these rows keeps the
        // "local" thread in memory.
        for (int i=0; i<N_DISTINCT; i++) {
            vals[i] = random()%n_rows;
        }
    }
    u_int64_t n_since_commit = 0;
    long long n_read_so_far = 0;
    while ((!rs->finish) && ((rs->n_to_read < 0) || (n_read_so_far < rs->n_to_read))) {
        // Commit every N_PER_XACTION reads to keep transactions small.
        if (n_since_commit++ > N_PER_XACTION) {
            { int r = txn->commit(txn, DB_TXN_NOSYNC); CKERR(r); }
            { int r = env->txn_begin(env, NULL, &txn, 0); CKERR(r); }
            n_since_commit = 0;
        }
        long long value;
        if (rs->do_local) {
            long which = random()%N_DISTINCT;
            value = vals[which];
            //printf("value=%lld\n", value);
        } else {
            value = random()%n_rows;
        }
        // Keys were written by create_db as "%016llx" of the row number.
        snprintf(key, sizeof(key), "%016llx", value);
        keyd.size = strlen(key)+1;
        int r = db->get(db, txn, &keyd, &datad, 0);
        CKERR(r);
        rs->n_did_read++;
        n_read_so_far ++;
    }
    { int r = txn->commit(txn, DB_TXN_NOSYNC); CKERR(r); }
    gettimeofday(&end_time, 0);
    rs->elapsed_time = toku_tdiff(&end_time, &start_time);
    return NULL;
}
// Steps 2/3 of the test: run a "local" reader thread (queries a small in-memory set
// of rows, N reads total) and, if do_nonlocal, a concurrent "nonlocal" reader
// (uniform-random rows, mostly cache misses) that runs until the local one is done.
// With the lock-release-during-I/O fix, the local thread should not be slowed down
// much by the nonlocal one.  Prints per-thread and total timings when verbose.
static
void do_threads (unsigned long long N, int do_nonlocal) {
    toku_pthread_t ths[2];
    struct reader_thread_state rstates[2] = {{.n_to_read = N,
                                              .do_local = 1,
                                              .finish = 0},
                                             {.n_to_read = -1,   // run until told to finish
                                              .do_local = 0,
                                              .finish = 0}};
    int n_to_create = do_nonlocal ? 2 : 1;
    for (int i=0; i<n_to_create; i++) {
        int r = toku_pthread_create(&ths[i], 0, reader_thread, (void*)&rstates[i]);
        CKERR(r);
    }
    for (int i=0; i<n_to_create; i++) {
        void *retval;
        int r = toku_pthread_join(ths[i], &retval);
        CKERR(r);
        assert(retval==0);
        if (verbose) {
            // BUGFIX: n_did_read is unsigned long long, so use %llu (the old %lld was a
            // signed/unsigned printf format mismatch).
            printf("%9s thread time = %8.2fs on %9llu reads (%.3f us/read)\n",
                   (i==0 ? "local" : "nonlocal"),
                   rstates[i].elapsed_time, rstates[i].n_did_read, rstates[i].elapsed_time/rstates[i].n_did_read * 1e6);
        }
        // As soon as the local thread (joined first) is done, tell the unbounded
        // nonlocal thread to stop.
        rstates[1].finish = 1;
    }
    if (verbose && do_nonlocal) {
        printf("total %9llu reads (%.3f us/read)\n",
               rstates[0].n_did_read + rstates[1].n_did_read,
               (rstates[0].elapsed_time)/(rstates[0].n_did_read + rstates[1].n_did_read) * 1e6);
    }
}
// Count of pread() calls made through my_pread; shared by all reader threads.
static volatile unsigned long long n_preads;

// Replacement pread() installed via db_env_set_func_pread: atomically counts each
// call (several threads read concurrently) and adds ~1ms of delay to simulate a
// slow disk, then forwards to the real pread().
static ssize_t my_pread (int fd, void *buf, size_t count, off_t offset) {
    __sync_fetch_and_add(&n_preads, 1);
    usleep(1000); // pretend the disk is slow: one millisecond per read
    return pread(fd, buf, count, offset);
}
unsigned long N_default = 100000;
unsigned long N;
// Parse command-line options for this test: -v / -q raise/lower verbosity,
// -n <rowcount> sets the dictionary size N (default N_default).  On any bad
// argument, prints a usage message to stderr and exits with status 1.
static void my_parse_args (int argc, char * const argv[]) {
    const char *progname = argv[0];
    verbose = 0;
    N = N_default;
    for (argc--, argv++; argc > 0; argc--, argv++) {
        const char *arg = argv[0];
        if (strcmp(arg, "-v") == 0) {
            verbose++;
        } else if (strcmp(arg, "-q") == 0) {
            if (verbose > 0) verbose--;
        } else if (strcmp(arg, "-n") == 0) {
            // -n takes a decimal row count as the next argument.
            argc--; argv++;
            if (argc == 0) goto usage;
            errno = 0;
            char *end;
            N = strtol(argv[0], &end, 10);
            if (errno != 0 || *end != 0) goto usage; // reject trailing junk or overflow
        } else {
        usage:
            fprintf(stderr, "Usage:\n %s [-v] [-q] [-n <rowcount> (default %ld)]\n", progname, N_default);
            fprintf(stderr, " -n 10000 is probably good for valgrind.\n");
            exit(1);
        }
    }
}
// Test entry point (invoked by the harness's main in test.h).
// Builds an N-row dictionary that exceeds the cachetable, then measures in-memory
// query throughput alone (twice) and with a competing out-of-memory reader
// (do_threads with do_nonlocal=1).  M = 10*N reads per measurement.
int test_main (int argc, char * const argv[]) {
    my_parse_args(argc, argv);
    unsigned long long M = N*10;
    // Install the instrumented pread so disk reads are counted (and slowed down).
    db_env_set_func_pread(my_pread);
    create_db (N);
    // BUGFIX: n_preads is unsigned long long, so print with %llu (the old %lld was a
    // signed/unsigned printf format mismatch).
    if (verbose) printf("%llu preads\n", n_preads);
    do_threads (M, 0);
    if (verbose) printf("%llu preads\n", n_preads);
    do_threads (M, 0);
    if (verbose) printf("%llu preads\n", n_preads);
    do_threads (M, 1);
    if (verbose) printf("%llu preads\n", n_preads);
    { int r = db->close(db, 0); CKERR(r); }
    { int r = env->close(env, 0); CKERR(r); }
    if (verbose) printf("%llu preads\n", n_preads);
    return 0;
}
......@@ -3,6 +3,7 @@
#define YDB_INTERNAL_H
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "$Id$"
#include <db.h>
#include "../newbrt/brttypes.h"
......@@ -106,19 +107,13 @@ struct __toku_db_env_internal {
********************************************************* */
typedef struct {
u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released */
u_int64_t max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
u_int64_t processor_freq_mhz; /* clock frequency in MHz */
u_int64_t max_requested_sleep; /* max sleep time requested, can be larger than max possible */
u_int64_t times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
u_int64_t total_sleepers; /* total number of times a client slept for ydb lock scheduling */
u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
u_int64_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
u_int64_t total_waiters; /* total number of times a client thread waited for ydb lock */
u_int64_t total_clients; /* total number of separate client threads that use ydb lock */
u_int64_t time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
u_int64_t max_time_ydb_lock_held; /* max time a client thread held the ydb lock */
u_int64_t total_time_ydb_lock_held;/* total time client threads held the ydb lock */
volatile u_int64_t ydb_lock_ctr; /* how many times has ydb lock been taken/released. This is precise since it is updated only when the lock is held. */
volatile u_int32_t num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder). This is precise since it is updated with a fetch-and-add. */
volatile u_int32_t max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock. This is precise (updated only when the lock is held) but may be running a little behind (while waiting for the lock it hasn't been updated). */
volatile u_int64_t total_sleep_time; /* total time spent sleeping for ydb lock scheduling (useconds). This adds up over many clients. This is precise since it is updated with an atomic fetch-and-add. */
volatile u_int64_t max_time_ydb_lock_held; /* max time the ydb lock was held (in microseconds). This is precise since it is updated only when the lock is held. */
volatile u_int64_t total_time_ydb_lock_held;/* total time the ydb lock has been held (in microseconds). */
volatile u_int64_t total_time_since_start; /* total time since the ydb lock was initialized (in microseconds) This is only updated when the lock is accessed (so if you don't acquire the lock this doesn't increase), and it is updated precisely (even though it isn't updated continuously). */
} SCHEDULE_STATUS_S, *SCHEDULE_STATUS;
......
......@@ -902,6 +902,7 @@ toku_env_open(DB_ENV * env, const char *home, u_int32_t flags, int mode) {
r = toku_brt_create_cachetable(&env->i->cachetable, env->i->cachetable_size, ZERO_LSN, env->i->logger);
if (r!=0) goto died2;
toku_cachetable_set_lock_unlock_for_io(env->i->cachetable, toku_ydb_lock, toku_ydb_unlock);
toku_cachetable_set_env_dir(env->i->cachetable, env->i->dir);
......@@ -1752,19 +1753,14 @@ env_get_engine_status(DB_ENV * env, ENGINE_STATUS * engstat, char * env_panic_st
{
SCHEDULE_STATUS_S schedstat;
toku_ydb_lock_get_status(&schedstat);
engstat->ydb_lock_ctr = schedstat.ydb_lock_ctr; /* how many times has ydb lock been taken/released */
engstat->max_possible_sleep = schedstat.max_possible_sleep; /* max possible sleep time for ydb lock scheduling (constant) */
engstat->processor_freq_mhz = schedstat.processor_freq_mhz; /* clock frequency in MHz */
engstat->max_requested_sleep = schedstat.max_requested_sleep; /* max sleep time requested, can be larger than max possible */
engstat->times_max_sleep_used = schedstat.times_max_sleep_used; /* number of times the max_possible_sleep was used to sleep */
engstat->total_sleepers = schedstat.total_sleepers; /* total number of times a client slept for ydb lock scheduling */
engstat->total_sleep_time = schedstat.total_sleep_time; /* total time spent sleeping for ydb lock scheduling */
engstat->max_waiters = schedstat.max_waiters; /* max number of simultaneous client threads kept waiting for ydb lock */
engstat->total_waiters = schedstat.total_waiters; /* total number of times a client thread waited for ydb lock */
engstat->total_clients = schedstat.total_clients; /* total number of separate client threads that use ydb lock */
engstat->time_ydb_lock_held_unavailable = schedstat.time_ydb_lock_held_unavailable; /* number of times a thread migrated and theld is unavailable */
engstat->total_time_ydb_lock_held = schedstat.total_time_ydb_lock_held;/* total time client threads held the ydb lock */
engstat->max_time_ydb_lock_held = schedstat.max_time_ydb_lock_held; /* max time client threads held the ydb lock */
engstat->ydb_lock_ctr = schedstat.ydb_lock_ctr; /* How many times has ydb lock been taken/released? */
engstat->num_waiters_now = schedstat.num_waiters_now; /* How many are waiting on the ydb lock right now (including the current lock holder, if any)? */
engstat->max_waiters = schedstat.max_waiters; /* The maximum of num_waiters_now (since the system booted). */
engstat->total_sleep_time = schedstat.total_sleep_time; /* The total time spent (since the system booted) sleeping (by the indexer) to give foreground threads a chance to work .*/
engstat->max_time_ydb_lock_held = schedstat.max_time_ydb_lock_held; /* Maximum time that the ydb lock was held. */
engstat->total_time_ydb_lock_held = schedstat.total_time_ydb_lock_held; /* Total time client threads held the ydb lock */
engstat->total_time_since_start = schedstat.total_time_since_start; /* Total time since the lock was created. Use this as total_time_ydb_lock_held/total_time_since_start to get a ratio. */
}
{
LE_STATUS_S lestat; // Rice's vampire
......@@ -1988,18 +1984,12 @@ env_get_engine_status_text(DB_ENV * env, char * buff, int bufsiz) {
n += snprintf(buff + n, bufsiz - n, "startuptime %s \n", engstat.startuptime);
n += snprintf(buff + n, bufsiz - n, "now %s \n", engstat.now);
n += snprintf(buff + n, bufsiz - n, "ydb_lock_ctr %"PRIu64"\n", engstat.ydb_lock_ctr);
n += snprintf(buff + n, bufsiz - n, "max_possible_sleep %"PRIu64"\n", engstat.max_possible_sleep);
n += snprintf(buff + n, bufsiz - n, "processor_freq_mhz %"PRIu64"\n", engstat.processor_freq_mhz);
n += snprintf(buff + n, bufsiz - n, "max_requested_sleep %"PRIu64"\n", engstat.max_requested_sleep);
n += snprintf(buff + n, bufsiz - n, "times_max_sleep_used %"PRIu64"\n", engstat.times_max_sleep_used);
n += snprintf(buff + n, bufsiz - n, "total_sleepers %"PRIu64"\n", engstat.total_sleepers);
n += snprintf(buff + n, bufsiz - n, "num_waiters_now %"PRIu32"\n", engstat.num_waiters_now);
n += snprintf(buff + n, bufsiz - n, "max_waiters %"PRIu32"\n", engstat.max_waiters);
n += snprintf(buff + n, bufsiz - n, "total_sleep_time %"PRIu64"\n", engstat.total_sleep_time);
n += snprintf(buff + n, bufsiz - n, "max_waiters %"PRIu64"\n", engstat.max_waiters);
n += snprintf(buff + n, bufsiz - n, "total_waiters %"PRIu64"\n", engstat.total_waiters);
n += snprintf(buff + n, bufsiz - n, "total_clients %"PRIu64"\n", engstat.total_clients);
n += snprintf(buff + n, bufsiz - n, "time_ydb_lock_held_unavailable %"PRIu64"\n", engstat.time_ydb_lock_held_unavailable);
n += snprintf(buff + n, bufsiz - n, "max_time_ydb_lock_held %"PRIu64"\n", engstat.max_time_ydb_lock_held);
n += snprintf(buff + n, bufsiz - n, "total_time_ydb_lock_held %"PRIu64"\n", engstat.total_time_ydb_lock_held);
n += snprintf(buff + n, bufsiz - n, "total_time_since_start %"PRIu64"\n", engstat.total_time_since_start);
n += snprintf(buff + n, bufsiz - n, "le_max_committed_xr %"PRIu64"\n", engstat.le_max_committed_xr);
n += snprintf(buff + n, bufsiz - n, "le_max_provisional_xr %"PRIu64"\n", engstat.le_max_provisional_xr);
n += snprintf(buff + n, bufsiz - n, "le_expanded %"PRIu64"\n", engstat.le_expanded);
......@@ -5945,6 +5935,11 @@ db_env_set_func_fclose (int (*fclose_function)(FILE*)) {
return toku_set_func_fclose(fclose_function);
}
int
db_env_set_func_pread (ssize_t (*fun)(int, void *, size_t, off_t)) {
return toku_set_func_pread(fun);
}
void
db_env_set_func_loader_fwrite (size_t (*fwrite_fun)(const void*,size_t,size_t,FILE*)) {
brtloader_set_os_fwrite(fwrite_fun);
......
......@@ -98,10 +98,12 @@ endif
# -Wno-deprecated is needed on gcc 4.4.{1,2} to make the #ident complaints go away.
# -Wno-strict-aliasing is needed on gcc 4.4.{1,2} to make certain gratuitous warnings go away.
# -Wno-unused-result is needed on gcc 4.4.1 to make warnings about ignoring the results of certain system calls go away.
# Gcc 4.4.1-4ubuntu9 cannot handle --combine, but 4.4.4 can. But 4.5 cannot.
ifeq ($(GCCVERSION),4.4.2)
GCC_VERSION_SPECIFIC = -Wno-deprecated
else ifeq ($(GCCVERSION),4.4.1)
GCC_VERSION_SPECIFIC = -Wno-deprecated -Wno-unused-result
COMBINE := 0
endif
WALL = $(GCC_VERSION_SPECIFIC) -Wall -Wextra -Wcast-align -Wbad-function-cast -Wno-missing-noreturn -Wstrict-prototypes -Wmissing-prototypes -Wmissing-declarations
......
......@@ -174,6 +174,7 @@ int toku_os_open(const char *path, int oflag, int mode);
int toku_os_close(int fd);
int toku_os_fclose(FILE * stream);
ssize_t toku_os_read(int fd, void *buf, size_t count);
ssize_t toku_os_pread(int fd, void *buf, size_t count, off_t offset);
// wrapper around fsync
int toku_file_fsync_without_accounting(int fd);
......@@ -202,7 +203,7 @@ int toku_set_func_fopen (FILE * (*)(const char *, const char *));
int toku_set_func_open (int (*)(const char *, int, int)); // variadic form not implemented until needed
int toku_set_func_fclose(int (*)(FILE*));
int toku_set_func_read(ssize_t (*)(int, void *, size_t));
int toku_set_func_pread (ssize_t (*)(int, void *, size_t, off_t));
int toku_portability_init (void);
int toku_portability_destroy (void);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment