Commit 6df82bdb, authored by Bradley C. Kuszmaul, committed by Yoni Fogel

Merge in 1591 and fix conflicts. Fixes #1591.

{{{
svn merge -r10512:10803 https://svn.tokutek.com/tokudb/toku/tokudb.1591
}}}


git-svn-id: file:///svn/toku/tokudb@10804 c7de825b-a66e-492c-adef-691d508d4ae1
parent 0aa7cd63
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -115,12 +115,6 @@ typedef enum { ...@@ -115,12 +115,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[60]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[8]; void* __toku_dummy0[8];
...@@ -180,15 +174,16 @@ struct __toku_db_lsn { ...@@ -180,15 +174,16 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy0[25]; void* __toku_dummy0[24];
char __toku_dummy1[96]; char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=236 size=4, 64=bit offset=376 size=8 */ void *api_internal; /* 32-bit offset=236 size=4, 64=bit offset=376 size=8 */
void* __toku_dummy2[5]; void* __toku_dummy2[5];
......
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -117,12 +117,6 @@ typedef enum { ...@@ -117,12 +117,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[64]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[10]; void* __toku_dummy0[10];
...@@ -190,15 +184,16 @@ struct __toku_db_lsn { ...@@ -190,15 +184,16 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy0[28]; void* __toku_dummy0[27];
char __toku_dummy1[96]; char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=248 size=4, 64=bit offset=400 size=8 */ void *api_internal; /* 32-bit offset=248 size=4, 64=bit offset=400 size=8 */
void* __toku_dummy2[5]; void* __toku_dummy2[5];
......
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -118,12 +118,6 @@ typedef enum { ...@@ -118,12 +118,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[60]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[10]; void* __toku_dummy0[10];
...@@ -193,15 +187,16 @@ struct __toku_db_lsn { ...@@ -193,15 +187,16 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy0[30]; void* __toku_dummy0[29];
char __toku_dummy1[96]; char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=256 size=4, 64=bit offset=416 size=8 */ void *api_internal; /* 32-bit offset=256 size=4, 64=bit offset=416 size=8 */
void* __toku_dummy2[5]; void* __toku_dummy2[5];
......
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -118,12 +118,6 @@ typedef enum { ...@@ -118,12 +118,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[60]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[12]; void* __toku_dummy0[12];
...@@ -192,15 +186,16 @@ struct __toku_db_lsn { ...@@ -192,15 +186,16 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=16 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=20 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy0[33]; void* __toku_dummy0[32];
char __toku_dummy1[96]; char __toku_dummy1[96];
void *api_internal; /* 32-bit offset=268 size=4, 64=bit offset=440 size=8 */ void *api_internal; /* 32-bit offset=268 size=4, 64=bit offset=440 size=8 */
void* __toku_dummy2[5]; void* __toku_dummy2[5];
......
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -120,12 +120,6 @@ typedef enum { ...@@ -120,12 +120,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[64]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[12]; void* __toku_dummy0[12];
...@@ -195,16 +189,17 @@ struct __toku_db_lsn { ...@@ -195,16 +189,17 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
char __toku_dummy0[8]; char __toku_dummy0[8];
void *app_private; /* 32-bit offset=20 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=20 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=24 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=24 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*); int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy1[37]; void* __toku_dummy1[36];
char __toku_dummy2[80]; char __toku_dummy2[80];
void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */ void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */
void* __toku_dummy3[5]; void* __toku_dummy3[5];
......
...@@ -288,7 +288,7 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un ...@@ -288,7 +288,7 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un
"typedef int64_t toku_off_t;\n" "typedef int64_t toku_off_t;\n"
"#endif\n"); "#endif\n");
printf("typedef struct __toku_db_btree_stat DB_BTREE_STAT;\n"); //printf("typedef struct __toku_db_btree_stat DB_BTREE_STAT;\n");
printf("typedef struct __toku_db_env DB_ENV;\n"); printf("typedef struct __toku_db_env DB_ENV;\n");
printf("typedef struct __toku_db_key_range DB_KEY_RANGE;\n"); printf("typedef struct __toku_db_key_range DB_KEY_RANGE;\n");
printf("typedef struct __toku_db_lsn DB_LSN;\n"); printf("typedef struct __toku_db_lsn DB_LSN;\n");
...@@ -302,6 +302,10 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un ...@@ -302,6 +302,10 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un
printf("typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);\n"); printf("typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);\n");
printf("typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);\n"); printf("typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);\n");
printf("typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);\n"); printf("typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);\n");
//stat64
printf("typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;\n");
print_dbtype(); print_dbtype();
// print_db_notices(); // print_db_notices();
print_defines(); print_defines();
...@@ -309,7 +313,8 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un ...@@ -309,7 +313,8 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un
printf("/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/\n"); printf("/* in wrap mode, top-level function txn_begin is renamed, but the field isn't renamed, so we have to hack it here.*/\n");
printf("#ifdef _TOKUDB_WRAP_H\n#undef txn_begin\n#endif\n"); printf("#ifdef _TOKUDB_WRAP_H\n#undef txn_begin\n#endif\n");
assert(sizeof(db_btree_stat_fields32)==sizeof(db_btree_stat_fields64)); assert(sizeof(db_btree_stat_fields32)==sizeof(db_btree_stat_fields64));
print_struct("db_btree_stat", 0, db_btree_stat_fields32, db_btree_stat_fields64, sizeof(db_btree_stat_fields32)/sizeof(db_btree_stat_fields32[0]), 0); // Don't produce db_btree_stat records.
//print_struct("db_btree_stat", 0, db_btree_stat_fields32, db_btree_stat_fields64, sizeof(db_btree_stat_fields32)/sizeof(db_btree_stat_fields32[0]), 0);
assert(sizeof(db_env_fields32)==sizeof(db_env_fields64)); assert(sizeof(db_env_fields32)==sizeof(db_env_fields64));
print_struct("db_env", 1, db_env_fields32, db_env_fields64, sizeof(db_env_fields32)/sizeof(db_env_fields32[0]), 0); print_struct("db_env", 1, db_env_fields32, db_env_fields64, sizeof(db_env_fields32)/sizeof(db_env_fields32[0]), 0);
...@@ -322,6 +327,7 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un ...@@ -322,6 +327,7 @@ int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__un
assert(sizeof(db_fields32)==sizeof(db_fields64)); assert(sizeof(db_fields32)==sizeof(db_fields64));
{ {
const char *extra[]={"int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact)", const char *extra[]={"int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact)",
"int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *)",
"int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*)", "int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*)",
"int (*pre_acquire_table_lock)(DB*, DB_TXN*)", "int (*pre_acquire_table_lock)(DB*, DB_TXN*)",
"const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/", "const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/",
......
/* BDB offsets on a 64-bit machine */
/* Version numbers and banner string that accompany the 64-bit offset tables
 * in this file (major 4, minor 7). */
#define DB_VERSION_MAJOR_64 4
#define DB_VERSION_MINOR_64 7
/* NOTE(review): "Compatability" is the spelling used inside this string
 * literal; it is left untouched because changing it would change the string. */
#define DB_VERSION_STRING_64 "Berkeley DB Compatability Header 4.7"
/*
 * 64-bit layout table for the btree statistics struct ("db_btree_stat").
 * Each record is {C declaration text, byte offset, byte size}. The final
 * record has a null name and carries the size of the whole struct.
 */
struct fieldinfo db_btree_stat_fields64[] = {
{"u_int32_t bt_nkeys", 12, 4},
{"u_int32_t bt_ndata", 16, 4},
{0, 84, 84} /* size of whole struct */
};
/*
 * 64-bit layout table for the environment handle ("db_env", i.e. DB_ENV).
 * Each record is {C declaration text, byte offset, byte size}; every listed
 * member is pointer-sized (8 bytes). Records appear in ascending offset
 * order. The final record has a null name and carries the size of the whole
 * struct.
 */
struct fieldinfo db_env_fields64[] = {
{"void *app_private", 216, 8},
{"void *api1_internal", 224, 8},
{"int (*close) (DB_ENV *, u_int32_t)", 408, 8},
{"void (*err) (const DB_ENV *, int, const char *, ...)", 432, 8},
{"int (*get_cachesize) (DB_ENV *, u_int32_t *, u_int32_t *, int *)", 472, 8},
{"int (*get_flags) (DB_ENV *, u_int32_t *)", 520, 8},
{"int (*get_lg_max) (DB_ENV *, u_int32_t*)", 568, 8},
{"int (*get_lk_max_locks) (DB_ENV *, u_int32_t *)", 608, 8},
{"int (*log_archive) (DB_ENV *, char **[], u_int32_t)", 800, 8},
{"int (*log_flush) (DB_ENV *, const DB_LSN *)", 824, 8},
{"int (*open) (DB_ENV *, const char *, u_int32_t, int)", 1048, 8},
{"int (*set_cachesize) (DB_ENV *, u_int32_t, u_int32_t, int)", 1328, 8},
{"int (*set_data_dir) (DB_ENV *, const char *)", 1336, 8},
{"void (*set_errcall) (DB_ENV *, void (*)(const DB_ENV *, const char *, const char *))", 1352, 8},
{"void (*set_errfile) (DB_ENV *, FILE*)", 1360, 8},
{"void (*set_errpfx) (DB_ENV *, const char *)", 1368, 8},
{"int (*set_flags) (DB_ENV *, u_int32_t, int)", 1392, 8},
{"int (*set_lg_bsize) (DB_ENV *, u_int32_t)", 1416, 8},
{"int (*set_lg_dir) (DB_ENV *, const char *)", 1424, 8},
{"int (*set_lg_max) (DB_ENV *, u_int32_t)", 1440, 8},
{"int (*set_lk_detect) (DB_ENV *, u_int32_t)", 1464, 8},
{"int (*set_lk_max_locks) (DB_ENV *, u_int32_t)", 1480, 8},
{"int (*set_tmp_dir) (DB_ENV *, const char *)", 1600, 8},
{"int (*set_verbose) (DB_ENV *, u_int32_t, int)", 1624, 8},
{"int (*txn_begin) (DB_ENV *, DB_TXN *, DB_TXN **, u_int32_t)", 1640, 8},
{"int (*txn_checkpoint) (DB_ENV *, u_int32_t, u_int32_t, u_int32_t)", 1648, 8},
{"int (*txn_stat) (DB_ENV *, DB_TXN_STAT **, u_int32_t)", 1664, 8},
{0, 1688, 1688} /* size of whole struct */
};
/*
 * 64-bit layout table for the key-range result struct ("db_key_range",
 * i.e. DB_KEY_RANGE). Each record is {C declaration text, byte offset,
 * byte size}. The final record has a null name and carries the size of the
 * whole struct.
 *
 * The struct is three 8-byte doubles at offsets 0, 8 and 16, so its total
 * size is 16 + 8 = 24 bytes.
 */
struct fieldinfo db_key_range_fields64[] = {
{"double less", 0, 8},
{"double equal", 8, 8},
{"double greater", 16, 8},
{0, 24, 24} /* size of whole struct */
};
/*
 * 64-bit layout table for the log sequence number struct ("db_lsn", i.e.
 * DB_LSN). No individual members are listed; the only record is the
 * null-named terminator carrying the size of the whole struct (8 bytes).
 */
struct fieldinfo db_lsn_fields64[] = {
{0, 8, 8} /* size of whole struct */
};
/*
 * 64-bit layout table for the database handle ("db", i.e. DB).
 * Each record is {C declaration text, byte offset, byte size}; every listed
 * member is pointer-sized (8 bytes). Records appear in ascending offset
 * order. The final record has a null name and carries the size of the whole
 * struct.
 */
struct fieldinfo db_fields64[] = {
{"void *app_private", 32, 8},
{"DB_ENV *dbenv", 40, 8},
{"void *api_internal", 488, 8},
{"int (*close) (DB*, u_int32_t)", 544, 8},
{"int (*cursor) (DB *, DB_TXN *, DBC **, u_int32_t)", 560, 8},
{"int (*del) (DB *, DB_TXN *, DBT *, u_int32_t)", 568, 8},
{"int (*fd) (DB *, int *)", 600, 8},
{"int (*get) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t)", 608, 8},
{"int (*get_flags) (DB *, u_int32_t *)", 688, 8},
{"int (*get_pagesize) (DB *, u_int32_t *)", 752, 8},
{"int (*key_range) (DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)", 832, 8},
{"int (*open) (DB *, DB_TXN *, const char *, const char *, DBTYPE, u_int32_t, int)", 840, 8},
{"int (*put) (DB *, DB_TXN *, DBT *, DBT *, u_int32_t)", 856, 8},
{"int (*remove) (DB *, const char *, const char *, u_int32_t)", 864, 8},
{"int (*rename) (DB *, const char *, const char *, const char *, u_int32_t)", 872, 8},
{"int (*set_bt_compare) (DB *, int (*)(DB *, const DBT *, const DBT *))", 896, 8},
{"int (*set_dup_compare) (DB *, int (*)(DB *, const DBT *, const DBT *))", 928, 8},
{"void (*set_errfile) (DB *, FILE*)", 952, 8},
{"int (*set_flags) (DB *, u_int32_t)", 976, 8},
{"int (*set_pagesize) (DB *, u_int32_t)", 1040, 8},
{"int (*stat) (DB *, void *, u_int32_t)", 1104, 8},
{"int (*truncate) (DB *, DB_TXN *, u_int32_t *, u_int32_t)", 1128, 8},
{"int (*verify) (DB *, const char *, const char *, FILE *, u_int32_t)", 1144, 8},
{0, 1208, 1208} /* size of whole struct */
};
/*
 * 64-bit layout table for the active-transaction record ("db_txn_active",
 * i.e. DB_TXN_ACTIVE). Each record is {C declaration text, byte offset,
 * byte size}. The final record has a null name and carries the size of the
 * whole struct.
 */
struct fieldinfo db_txn_active_fields64[] = {
{"u_int32_t txnid", 0, 4},
{"DB_LSN lsn", 24, 8},
{0, 232, 232} /* size of whole struct */
};
/*
 * 64-bit layout table for the transaction handle ("db_txn", i.e. DB_TXN).
 * Each record is {C declaration text, byte offset, byte size}. The
 * declaration text may itself embed a C comment, as the mgrp entry does.
 * The final record has a null name and carries the size of the whole struct.
 */
struct fieldinfo db_txn_fields64[] = {
{"DB_ENV *mgrp /*In TokuDB, mgrp is a DB_ENV not a DB_TXNMGR*/", 0, 8},
{"DB_TXN *parent", 8, 8},
{"void *api_internal", 176, 8},
{"int (*abort) (DB_TXN *)", 200, 8},
{"int (*commit) (DB_TXN*, u_int32_t)", 208, 8},
{"u_int32_t (*id) (DB_TXN *)", 232, 8},
{0, 280, 280} /* size of whole struct */
};
/*
 * 64-bit layout table for the transaction statistics struct ("db_txn_stat",
 * i.e. DB_TXN_STAT). Each record is {C declaration text, byte offset,
 * byte size}. The final record has a null name and carries the size of the
 * whole struct.
 */
struct fieldinfo db_txn_stat_fields64[] = {
{"u_int32_t st_nactive", 44, 4},
{"DB_TXN_ACTIVE *st_txnarray", 64, 8},
{0, 88, 88} /* size of whole struct */
};
/*
 * 64-bit layout table for the cursor handle ("dbc", i.e. DBC).
 * Each record is {C declaration text, byte offset, byte size}; every listed
 * member is pointer-sized (8 bytes). The final record has a null name and
 * carries the size of the whole struct.
 */
struct fieldinfo dbc_fields64[] = {
{"DB *dbp", 0, 8},
{"int (*c_close) (DBC *)", 408, 8},
{"int (*c_count) (DBC *, db_recno_t *, u_int32_t)", 416, 8},
{"int (*c_del) (DBC *, u_int32_t)", 424, 8},
{"int (*c_get) (DBC *, DBT *, DBT *, u_int32_t)", 440, 8},
{0, 528, 528} /* size of whole struct */
};
/*
 * 64-bit layout table for the key/data descriptor ("dbt", i.e. DBT).
 * Each record is {C declaration text, byte offset, byte size}. The final
 * record has a null name and carries the size of the whole struct.
 */
struct fieldinfo dbt_fields64[] = {
{"void*data", 0, 8},
{"u_int32_t size", 8, 4},
{"u_int32_t ulen", 12, 4},
{"u_int32_t flags", 32, 4},
{0, 40, 40} /* size of whole struct */
};
...@@ -21,7 +21,6 @@ extern "C" { ...@@ -21,7 +21,6 @@ extern "C" {
#define TOKU_OFF_T_DEFINED #define TOKU_OFF_T_DEFINED
typedef int64_t toku_off_t; typedef int64_t toku_off_t;
#endif #endif
typedef struct __toku_db_btree_stat DB_BTREE_STAT;
typedef struct __toku_db_env DB_ENV; typedef struct __toku_db_env DB_ENV;
typedef struct __toku_db_key_range DB_KEY_RANGE; typedef struct __toku_db_key_range DB_KEY_RANGE;
typedef struct __toku_db_lsn DB_LSN; typedef struct __toku_db_lsn DB_LSN;
...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t; ...@@ -35,6 +34,7 @@ typedef u_int32_t db_recno_t;
typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*); typedef int(*YDB_CALLBACK_FUNCTION)(DBT const*, DBT const*, void*);
typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h); typedef int(*YDB_HEAVISIDE_CALLBACK_FUNCTION)(DBT const *key, DBT const *value, void *extra_f, int r_h);
typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h); typedef int(*YDB_HEAVISIDE_FUNCTION)(const DBT *key, const DBT *value, void *extra_h);
typedef struct __toku_db_btree_stat64 { u_int64_t bt_nkeys, bt_ndata, bt_dsize; } DB_BTREE_STAT64;
typedef enum { typedef enum {
DB_BTREE=1, DB_BTREE=1,
DB_UNKNOWN=5 DB_UNKNOWN=5
...@@ -120,12 +120,6 @@ typedef enum { ...@@ -120,12 +120,6 @@ typedef enum {
#ifdef _TOKUDB_WRAP_H #ifdef _TOKUDB_WRAP_H
#undef txn_begin #undef txn_begin
#endif #endif
struct __toku_db_btree_stat {
char __toku_dummy0[12];
u_int32_t bt_nkeys; /* 32-bit offset=12 size=4, 64=bit offset=12 size=4 */
u_int32_t bt_ndata; /* 32-bit offset=16 size=4, 64=bit offset=16 size=4 */
char __toku_dummy1[64]; /* Padding at the end */
};
struct __toku_db_env { struct __toku_db_env {
struct __toku_db_env_internal *i; struct __toku_db_env_internal *i;
void* __toku_dummy0[12]; void* __toku_dummy0[12];
...@@ -195,16 +189,17 @@ struct __toku_db_lsn { ...@@ -195,16 +189,17 @@ struct __toku_db_lsn {
struct __toku_db { struct __toku_db {
struct __toku_db_internal *i; struct __toku_db_internal *i;
int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact); int (*key_range64)(DB*, DB_TXN *, DBT *, u_int64_t *less, u_int64_t *equal, u_int64_t *greater, int *is_exact);
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*); int (*stat64)(DB *, DB_TXN *, DB_BTREE_STAT64 *);
char __toku_dummy0[8]; char __toku_dummy0[8];
void *app_private; /* 32-bit offset=20 size=4, 64=bit offset=32 size=8 */ void *app_private; /* 32-bit offset=20 size=4, 64=bit offset=32 size=8 */
DB_ENV *dbenv; /* 32-bit offset=24 size=4, 64=bit offset=40 size=8 */ DB_ENV *dbenv; /* 32-bit offset=24 size=4, 64=bit offset=40 size=8 */
int (*pre_acquire_read_lock)(DB*, DB_TXN*, const DBT*, const DBT*, const DBT*, const DBT*);
int (*pre_acquire_table_lock)(DB*, DB_TXN*); int (*pre_acquire_table_lock)(DB*, DB_TXN*);
const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/; const DBT* (*dbt_pos_infty)(void) /* Return the special DBT that refers to positive infinity in the lock table.*/;
const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/; const DBT* (*dbt_neg_infty)(void)/* Return the special DBT that refers to negative infinity in the lock table.*/;
int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */; int (*delboth) (DB*, DB_TXN*, DBT*, DBT*, u_int32_t) /* Delete the key/value pair. */;
int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */; int (*row_size_supported) (DB*, u_int32_t) /* Test whether a row size is supported. */;
void* __toku_dummy1[37]; void* __toku_dummy1[36];
char __toku_dummy2[80]; char __toku_dummy2[80];
void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */ void *api_internal; /* 32-bit offset=276 size=4, 64=bit offset=464 size=8 */
void* __toku_dummy3[5]; void* __toku_dummy3[5];
......
...@@ -38,9 +38,35 @@ enum { BUFFER_HEADER_SIZE = (4 // height// ...@@ -38,9 +38,35 @@ enum { BUFFER_HEADER_SIZE = (4 // height//
+ TREE_FANOUT * 8 // children + TREE_FANOUT * 8 // children
) }; ) };
/* Row statistics for a subtree.  One of these is kept per child of a nonleaf
 * node (the subtree_estimates field of brtnode_nonleaf_childinfo) and one per
 * leaf (leaf_stats). */
struct subtree_estimates {
    // estimate number of rows in the tree by counting the number of rows
    // in the leaves.  The stuff in the internal nodes is likely to be off O(1).
    u_int64_t nkeys;  // number of distinct keys.
    u_int64_t ndata;  // number of key-data pairs (previously leafentry_estimate)
    u_int64_t dsize;  // total size of leafentries
    BOOL      exact;  // are the estimates exact?
};
static struct subtree_estimates const zero_estimates __attribute__((__unused__)) = {0,0,0,TRUE};
/* Subtract the counters of *b from *a, field by field.
 * Each field saturates at zero instead of wrapping around when b's value is
 * larger than a's.  The `exact` flag of *a is left untouched. */
static inline void __attribute__((__unused__))
subtract_estimates (struct subtree_estimates *a, struct subtree_estimates *b) {
    a->nkeys = (a->nkeys >= b->nkeys) ? (a->nkeys - b->nkeys) : 0;
    a->ndata = (a->ndata >= b->ndata) ? (a->ndata - b->ndata) : 0;
    a->dsize = (a->dsize >= b->dsize) ? (a->dsize - b->dsize) : 0;
}
/* Add the counters of *b into *a, field by field.
 * The `exact` flag of *a is left untouched. */
static inline void __attribute__((__unused__))
add_estimates (struct subtree_estimates *a, struct subtree_estimates *b) {
    a->nkeys = a->nkeys + b->nkeys;   // distinct keys
    a->ndata = a->ndata + b->ndata;   // key-data pairs
    a->dsize = a->dsize + b->dsize;   // bytes of leafentries
}
struct brtnode_nonleaf_childinfo { struct brtnode_nonleaf_childinfo {
u_int32_t subtree_fingerprint; u_int32_t subtree_fingerprint;
u_int64_t leafentry_estimate; // estimate how many leafentries are below us. struct subtree_estimates subtree_estimates;
BLOCKNUM blocknum; BLOCKNUM blocknum;
BOOL have_fullhash; // do we have the full hash? BOOL have_fullhash; // do we have the full hash?
u_int32_t fullhash; // the fullhash of the child u_int32_t fullhash; // the fullhash of the child
...@@ -81,7 +107,7 @@ struct brtnode { ...@@ -81,7 +107,7 @@ struct brtnode {
struct brtnode_nonleaf_childinfo *childinfos; /* One extra so we can grow */ struct brtnode_nonleaf_childinfo *childinfos; /* One extra so we can grow */
#define BNC_SUBTREE_FINGERPRINT(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint) #define BNC_SUBTREE_FINGERPRINT(node,i) ((node)->u.n.childinfos[i].subtree_fingerprint)
#define BNC_SUBTREE_LEAFENTRY_ESTIMATE(node,i) ((node)->u.n.childinfos[i].leafentry_estimate) #define BNC_SUBTREE_ESTIMATES(node,i) ((node)->u.n.childinfos[i].subtree_estimates)
#define BNC_BLOCKNUM(node,i) ((node)->u.n.childinfos[i].blocknum) #define BNC_BLOCKNUM(node,i) ((node)->u.n.childinfos[i].blocknum)
#define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer) #define BNC_BUFFER(node,i) ((node)->u.n.childinfos[i].buffer)
#define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer) #define BNC_NBYTESINBUF(node,i) ((node)->u.n.childinfos[i].n_bytes_in_buffer)
...@@ -94,6 +120,7 @@ struct brtnode { ...@@ -94,6 +120,7 @@ struct brtnode {
However, in the absence of duplicate keys, child 1's keys *are* > childkeys[0]. */ However, in the absence of duplicate keys, child 1's keys *are* > childkeys[0]. */
} n; } n;
struct leaf { struct leaf {
struct subtree_estimates leaf_stats; // actually it is exact.
OMT buffer; OMT buffer;
LEAFLOCK leaflock; LEAFLOCK leaflock;
unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */ unsigned int n_bytes_in_buffer; /* How many bytes to represent the OMT (including the per-key overheads, but not including the overheads for the node. */
...@@ -278,7 +305,7 @@ enum brt_layout_version_e { ...@@ -278,7 +305,7 @@ enum brt_layout_version_e {
BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatase flags #333 BRT_LAYOUT_VERSION_7 = 7, // Diff from 6 to 7: Add exact-bit to leafentry_estimate #818, add magic to header #22, add per-subdatase flags #333
BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6 BRT_LAYOUT_VERSION_8 = 8, // Diff from 7 to 8: Use murmur instead of crc32. We are going to make a simplification and stop supporting version 7 and before. Current As of Beta 1.0.6
BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression. BRT_LAYOUT_VERSION_9 = 9, // Diff from 8 to 9: Variable-sized blocks and compression.
BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order BRT_LAYOUT_VERSION_10 = 10, // Diff from 9 to 10: Variable number of compressed sub-blocks per block, disk byte order == intel byte order, Subtree estimates instead of just leafentry estimates.
BRT_ANTEULTIMATE_VERSION, // the version after the most recent version BRT_ANTEULTIMATE_VERSION, // the version after the most recent version
BRT_LAYOUT_VERSION = BRT_ANTEULTIMATE_VERSION-1 // A hack so I don't have to change this line. BRT_LAYOUT_VERSION = BRT_ANTEULTIMATE_VERSION-1 // A hack so I don't have to change this line.
}; };
......
...@@ -139,7 +139,7 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) { ...@@ -139,7 +139,7 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
for (i=0; i<node->u.n.n_children-1; i++) { for (i=0; i<node->u.n.n_children-1; i++) {
csize+=toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]); csize+=toku_brtnode_pivot_key_len(node, node->u.n.childkeys[i]);
} }
size+=(8+4+4+8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and the leafentry_estimate. */ size+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and 1 for the exact bit for the estimates. */
int n_buffers = node->u.n.n_children; int n_buffers = node->u.n.n_children;
assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1); assert(0 <= n_buffers && n_buffers < TREE_FANOUT+1);
for (i=0; i< n_buffers; i++) { for (i=0; i< n_buffers; i++) {
...@@ -159,6 +159,7 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) { ...@@ -159,6 +159,7 @@ static unsigned int toku_serialize_brtnode_size_slow (BRTNODE node) {
&hsize); &hsize);
assert(hsize<=node->u.l.n_bytes_in_buffer); assert(hsize<=node->u.l.n_bytes_in_buffer);
hsize+=4; /* add n entries in buffer table. */ hsize+=4; /* add n entries in buffer table. */
hsize+=3*8; /* add the three leaf stats, but no exact bit. */
return size+hsize; return size+hsize;
} }
} }
...@@ -174,10 +175,11 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) { ...@@ -174,10 +175,11 @@ unsigned int toku_serialize_brtnode_size (BRTNODE node) {
if (node->flags & TOKU_DB_DUPSORT) result += 4*(node->u.n.n_children-1); /* data lengths */ if (node->flags & TOKU_DB_DUPSORT) result += 4*(node->u.n.n_children-1); /* data lengths */
assert(node->u.n.totalchildkeylens < (1<<30)); assert(node->u.n.totalchildkeylens < (1<<30));
result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */ result+=node->u.n.totalchildkeylens; /* the lengths of the pivot keys, without their key lengths. */
result+=(8+4+4+8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and the leafentry_estimate. */ result+=(8+4+4+1+3*8)*(node->u.n.n_children); /* For each child, a child offset, a count for the number of hash table entries, the subtree fingerprint, and 3*8 for the subtree estimates and one for the exact bit. */
result+=node->u.n.n_bytes_in_buffers; result+=node->u.n.n_bytes_in_buffers;
} else { } else {
result+=4; /* n_entries in buffer table. */ result+=4; /* n_entries in buffer table. */
result+=3*8; /* the three leaf stats. */
result+=node->u.l.n_bytes_in_buffer; result+=node->u.l.n_bytes_in_buffer;
if (toku_memory_check) { if (toku_memory_check) {
unsigned int slowresult = toku_serialize_brtnode_size_slow(node); unsigned int slowresult = toku_serialize_brtnode_size_slow(node);
...@@ -330,7 +332,11 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b ...@@ -330,7 +332,11 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
wbuf_int(&w, node->u.n.n_children); wbuf_int(&w, node->u.n.n_children);
for (i=0; i<node->u.n.n_children; i++) { for (i=0; i<node->u.n.n_children; i++) {
wbuf_uint(&w, BNC_SUBTREE_FINGERPRINT(node, i)); wbuf_uint(&w, BNC_SUBTREE_FINGERPRINT(node, i));
wbuf_ulonglong(&w, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i)); struct subtree_estimates *se = &(BNC_SUBTREE_ESTIMATES(node, i));
wbuf_ulonglong(&w, se->nkeys);
wbuf_ulonglong(&w, se->ndata);
wbuf_ulonglong(&w, se->dsize);
wbuf_char (&w, se->exact);
} }
//printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone); //printf("%s:%d w.ndone=%d\n", __FILE__, __LINE__, w.ndone);
for (i=0; i<node->u.n.n_children-1; i++) { for (i=0; i<node->u.n.n_children-1; i++) {
...@@ -369,6 +375,9 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b ...@@ -369,6 +375,9 @@ int toku_serialize_brtnode_to (int fd, BLOCKNUM blocknum, BRTNODE node, struct b
} }
} else { } else {
//printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer)); //printf("%s:%d writing node %lld n_entries=%d\n", __FILE__, __LINE__, node->thisnodename, toku_gpma_n_entries(node->u.l.buffer));
wbuf_ulonglong(&w, node->u.l.leaf_stats.nkeys);
wbuf_ulonglong(&w, node->u.l.leaf_stats.ndata);
wbuf_ulonglong(&w, node->u.l.leaf_stats.dsize);
wbuf_uint(&w, toku_omt_size(node->u.l.buffer)); wbuf_uint(&w, toku_omt_size(node->u.l.buffer));
toku_omt_iterate(node->u.l.buffer, wbufwriteleafentry, &w); toku_omt_iterate(node->u.l.buffer, wbufwriteleafentry, &w);
} }
...@@ -676,9 +685,7 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash ...@@ -676,9 +685,7 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
result->layout_version = rbuf_int(&rc); result->layout_version = rbuf_int(&rc);
{ {
switch (result->layout_version) { switch (result->layout_version) {
case BRT_LAYOUT_VERSION_10: case BRT_LAYOUT_VERSION_10: goto ok_layout_version;
case BRT_LAYOUT_VERSION_9:
goto ok_layout_version;
// Don't support older versions. // Don't support older versions.
} }
r=toku_db_badformat(); r=toku_db_badformat();
...@@ -711,7 +718,11 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash ...@@ -711,7 +718,11 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
u_int32_t childfp = rbuf_int(&rc); u_int32_t childfp = rbuf_int(&rc);
BNC_SUBTREE_FINGERPRINT(result, i)= childfp; BNC_SUBTREE_FINGERPRINT(result, i)= childfp;
check_subtree_fingerprint += childfp; check_subtree_fingerprint += childfp;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(result, i)=rbuf_ulonglong(&rc); struct subtree_estimates *se = &(BNC_SUBTREE_ESTIMATES(result, i));
se->nkeys = rbuf_ulonglong(&rc);
se->ndata = rbuf_ulonglong(&rc);
se->dsize = rbuf_ulonglong(&rc);
se->exact = rbuf_char(&rc);
} }
for (i=0; i<result->u.n.n_children-1; i++) { for (i=0; i<result->u.n.n_children-1; i++) {
if (result->flags & TOKU_DB_DUPSORT) { if (result->flags & TOKU_DB_DUPSORT) {
...@@ -782,6 +793,10 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash ...@@ -782,6 +793,10 @@ int toku_deserialize_brtnode_from (int fd, BLOCKNUM blocknum, u_int32_t fullhash
} }
} }
} else { } else {
result->u.l.leaf_stats.nkeys = rbuf_ulonglong(&rc);
result->u.l.leaf_stats.ndata = rbuf_ulonglong(&rc);
result->u.l.leaf_stats.dsize = rbuf_ulonglong(&rc);
result->u.l.leaf_stats.exact = TRUE;
int n_in_buf = rbuf_int(&rc); int n_in_buf = rbuf_int(&rc);
result->u.l.n_bytes_in_buffer = 0; result->u.l.n_bytes_in_buffer = 0;
result->u.l.seqinsert = 0; result->u.l.seqinsert = 0;
...@@ -1055,6 +1070,7 @@ deserialize_brtheader (u_int32_t size, int fd, DISKOFF off, struct brt_header ** ...@@ -1055,6 +1070,7 @@ deserialize_brtheader (u_int32_t size, int fd, DISKOFF off, struct brt_header **
assert(byte_order_stored == toku_byte_order_host); assert(byte_order_stored == toku_byte_order_host);
h->nodesize = rbuf_int(&rc); h->nodesize = rbuf_int(&rc);
assert(h->layout_version==BRT_LAYOUT_VERSION_10);
BLOCKNUM free_blocks = rbuf_blocknum(&rc); BLOCKNUM free_blocks = rbuf_blocknum(&rc);
BLOCKNUM unused_blocks = rbuf_blocknum(&rc); BLOCKNUM unused_blocks = rbuf_blocknum(&rc);
h->n_named_roots = rbuf_int(&rc); h->n_named_roots = rbuf_int(&rc);
......
...@@ -27,7 +27,7 @@ int toku_testsetup_nonleaf (BRT brt, int height, BLOCKNUM *blocknum, int n_child ...@@ -27,7 +27,7 @@ int toku_testsetup_nonleaf (BRT brt, int height, BLOCKNUM *blocknum, int n_child
int i; int i;
for (i=0; i<n_children; i++) { for (i=0; i<n_children; i++) {
node->u.n.childinfos[i] = (struct brtnode_nonleaf_childinfo){ .subtree_fingerprint = subtree_fingerprints[i], node->u.n.childinfos[i] = (struct brtnode_nonleaf_childinfo){ .subtree_fingerprint = subtree_fingerprints[i],
.leafentry_estimate = 0, .subtree_estimates = zero_estimates,
.blocknum = children[i], .blocknum = children[i],
.n_bytes_in_buffer = 0 }; .n_bytes_in_buffer = 0 };
r = toku_fifo_create(&BNC_BUFFER(node,i)); if (r!=0) return r; r = toku_fifo_create(&BNC_BUFFER(node,i)); if (r!=0) return r;
......
...@@ -254,6 +254,50 @@ static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnu ...@@ -254,6 +254,50 @@ static u_int32_t compute_child_fullhash (CACHEFILE cf, BRTNODE node, int childnu
abort(); return 0; abort(); return 0;
} }
// State handed to fill_leafnode_estimates() through the `void*` argument of
// toku_omt_iterate().  It is initialized positionally by its users, so the
// field order matters.
struct fill_leafnode_estimates_state {
struct subtree_estimates *e; // totals being accumulated (nkeys, ndata, dsize are incremented)
OMTVALUE prevval; // value seen on the previous callback; NULL before the first one
BRTNODE node; // node whose flags are tested for TOKU_DB_DUPSORT
};
// toku_omt_iterate() callback: fold one leafentry into the running totals.
//   val  the leafentry being visited
//   idx  position of the leafentry (unused)
//   vs   a struct fill_leafnode_estimates_state*
// Every entry counts toward ndata and adds its key and value lengths to dsize.
// It counts toward nkeys unless the node is DUPSORT and the key is
// byte-for-byte identical to the previously visited entry's key.
// Always returns 0.
static int
fill_leafnode_estimates (OMTVALUE val, u_int32_t UU(idx), void *vs)
{
    struct fill_leafnode_estimates_state *state = vs;
    LEAFENTRY cur  = val;
    LEAFENTRY prev = state->prevval;

    state->e->ndata++;
    state->e->dsize += le_any_keylen(cur) + le_any_vallen(cur);

    // Only a DUPSORT node with a previous entry can repeat a key.
    int repeats_prev_key = 0;
    if (prev != NULL && 0 != (state->node->flags & TOKU_DB_DUPSORT)) {
        if (le_any_keylen(cur) == le_any_keylen(prev)
            && memcmp(le_any_key(cur), le_any_key(prev), le_any_keylen(cur))==0) { // really should use comparison function
            repeats_prev_key = 1;
        }
    }
    if (!repeats_prev_key) {
        state->e->nkeys++;
    }

    state->prevval = cur;
    return 0;
}
// Recompute a leaf's statistics from scratch by walking every leafentry in
// node->u.l.buffer.  Starts from zero_estimates and returns the totals by
// value; the node's stored leaf_stats is not modified.
static struct subtree_estimates
calc_leaf_stats (BRTNODE node) {
    struct subtree_estimates totals = zero_estimates;
    struct fill_leafnode_estimates_state walk = { .e       = &totals,
                                                  .prevval = (OMTVALUE)NULL,
                                                  .node    = node };
    toku_omt_iterate(node->u.l.buffer, fill_leafnode_estimates, &walk);
    return totals;
}
// Debugging check: for a leaf (height <= 0), assert that the stored leaf_stats
// agree with a fresh recount of the buffer and that they are marked exact.
// Does nothing for nodes with height > 0.
static void __attribute__((__unused__))
brt_leaf_check_leaf_stats (BRTNODE node)
{
    static int count=0;
    count++; // never read in this function; NOTE(review): presumably a hook for conditional breakpoints
    if (!(node->height>0)) {
        struct subtree_estimates e = calc_leaf_stats(node);
        assert(e.ndata == node->u.l.leaf_stats.ndata);
        assert(e.nkeys == node->u.l.leaf_stats.nkeys);
        assert(e.dsize == node->u.l.leaf_stats.dsize);
        assert(node->u.l.leaf_stats.exact);
    }
}
// This should be done incrementally in most cases.
static void static void
fixup_child_fingerprint (BRTNODE node, int childnum_of_node, BRTNODE child) fixup_child_fingerprint (BRTNODE node, int childnum_of_node, BRTNODE child)
// Effect: Sum the child fingerprint (and leafentry estimates) and store them in NODE. // Effect: Sum the child fingerprint (and leafentry estimates) and store them in NODE.
...@@ -264,21 +308,32 @@ fixup_child_fingerprint (BRTNODE node, int childnum_of_node, BRTNODE child) ...@@ -264,21 +308,32 @@ fixup_child_fingerprint (BRTNODE node, int childnum_of_node, BRTNODE child)
// brt The brt (not used now but it will be for logger) // brt The brt (not used now but it will be for logger)
// logger The logger (not used now but it will be for logger) // logger The logger (not used now but it will be for logger)
{ {
u_int64_t leafentry_estimate = 0; struct subtree_estimates estimates = zero_estimates;
u_int32_t sum = child->local_fingerprint; u_int32_t sum = child->local_fingerprint;
estimates.exact = TRUE;
if (child->height>0) { if (child->height>0) {
int i; int i;
for (i=0; i<child->u.n.n_children; i++) { for (i=0; i<child->u.n.n_children; i++) {
sum += BNC_SUBTREE_FINGERPRINT(child,i); sum += BNC_SUBTREE_FINGERPRINT(child,i);
leafentry_estimate += BNC_SUBTREE_LEAFENTRY_ESTIMATE(child,i); struct subtree_estimates *child_se = &BNC_SUBTREE_ESTIMATES(child,i);
estimates.nkeys += child_se->nkeys;
estimates.ndata += child_se->ndata;
estimates.dsize += child_se->dsize;
if (!child_se->exact) estimates.exact = FALSE;
if (toku_fifo_n_entries(BNC_BUFFER(child,i))!=0) estimates.exact=FALSE;
} }
} else { } else {
leafentry_estimate = toku_omt_size(child->u.l.buffer); estimates = child->u.l.leaf_stats;
#ifdef SLOWSLOW
assert(estimates.ndata == child->u.l.leaf_stats.ndata);
struct fill_leafnode_estimates_state s = {&estimates, (OMTVALUE)NULL};
toku_omt_iterate(child->u.l.buffer, fill_leafnode_estimates, &s);
#endif
} }
// Don't try to get fancy about not modifying the fingerprint if it didn't change. // Don't try to get fancy about not modifying the fingerprint if it didn't change.
// We only call this function if we have reason to believe that the child's fingerprint did change. // We only call this function if we have reason to believe that the child's fingerprint did change.
BNC_SUBTREE_FINGERPRINT(node,childnum_of_node)=sum; BNC_SUBTREE_FINGERPRINT(node,childnum_of_node)=sum;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(node,childnum_of_node)=leafentry_estimate; BNC_SUBTREE_ESTIMATES(node,childnum_of_node)=estimates;
node->dirty=1; node->dirty=1;
} }
...@@ -286,6 +341,7 @@ static inline void ...@@ -286,6 +341,7 @@ static inline void
verify_local_fingerprint_nonleaf (BRTNODE node) verify_local_fingerprint_nonleaf (BRTNODE node)
{ {
if (0) { if (0) {
//brt_leaf_check_leaf_stats(node);
static int count=0; count++; static int count=0; count++;
u_int32_t fp=0; u_int32_t fp=0;
int i; int i;
...@@ -310,16 +366,17 @@ toku_verify_estimates (BRT t, BRTNODE node) { ...@@ -310,16 +366,17 @@ toku_verify_estimates (BRT t, BRTNODE node) {
int r = toku_cachetable_get_and_pin(t->cf, childblocknum, fullhash, &childnode_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t->h); int r = toku_cachetable_get_and_pin(t->cf, childblocknum, fullhash, &childnode_v, NULL, toku_brtnode_flush_callback, toku_brtnode_fetch_callback, t->h);
assert(r==0); assert(r==0);
BRTNODE childnode = childnode_v; BRTNODE childnode = childnode_v;
// we'll just do this estimate
u_int64_t child_estimate = 0; u_int64_t child_estimate = 0;
if (childnode->height==0) { if (childnode->height==0) {
child_estimate = toku_omt_size(childnode->u.l.buffer); child_estimate = toku_omt_size(childnode->u.l.buffer);
} else { } else {
int i; int i;
for (i=0; i<childnode->u.n.n_children; i++) { for (i=0; i<childnode->u.n.n_children; i++) {
child_estimate += BNC_SUBTREE_LEAFENTRY_ESTIMATE(childnode, i); child_estimate += BNC_SUBTREE_ESTIMATES(childnode, i).ndata;
} }
} }
assert(BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, childnum)==child_estimate); assert(BNC_SUBTREE_ESTIMATES(node, childnum).ndata==child_estimate);
toku_unpin_brtnode(t, childnode); toku_unpin_brtnode(t, childnode);
} }
} }
...@@ -621,6 +678,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height) ...@@ -621,6 +678,7 @@ initialize_empty_brtnode (BRT t, BRTNODE n, BLOCKNUM nodename, int height)
n->u.n.childinfos=0; n->u.n.childinfos=0;
n->u.n.childkeys=0; n->u.n.childkeys=0;
} else { } else {
n->u.l.leaf_stats = zero_estimates;
int r; int r;
r = toku_omt_create(&n->u.l.buffer); r = toku_omt_create(&n->u.l.buffer);
assert(r==0); assert(r==0);
...@@ -687,8 +745,8 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r ...@@ -687,8 +745,8 @@ brt_init_new_root(BRT brt, BRTNODE nodea, BRTNODE nodeb, DBT splitk, CACHEKEY *r
BNC_NBYTESINBUF(newroot, 1)=0; BNC_NBYTESINBUF(newroot, 1)=0;
BNC_SUBTREE_FINGERPRINT(newroot, 0)=0; BNC_SUBTREE_FINGERPRINT(newroot, 0)=0;
BNC_SUBTREE_FINGERPRINT(newroot, 1)=0; BNC_SUBTREE_FINGERPRINT(newroot, 1)=0;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(newroot, 0)=0; BNC_SUBTREE_ESTIMATES(newroot, 0)=zero_estimates;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(newroot, 1)=0; BNC_SUBTREE_ESTIMATES(newroot, 1)=zero_estimates;
verify_local_fingerprint_nonleaf(nodea); verify_local_fingerprint_nonleaf(nodea);
verify_local_fingerprint_nonleaf(nodeb); verify_local_fingerprint_nonleaf(nodeb);
fixup_child_fingerprint(newroot, 0, nodea); fixup_child_fingerprint(newroot, 0, nodea);
...@@ -792,20 +850,48 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) ...@@ -792,20 +850,48 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
u_int32_t i; u_int32_t i;
u_int32_t diff_fp = 0; u_int32_t diff_fp = 0;
u_int32_t diff_size = 0; u_int32_t diff_size = 0;
struct subtree_estimates diff_est = zero_estimates;
LEAFENTRY free_us[n_leafentries-break_at];
for (i=break_at; i<n_leafentries; i++) { for (i=break_at; i<n_leafentries; i++) {
LEAFENTRY prevle = (i>0) ? leafentries[i-1] : 0;
LEAFENTRY oldle = leafentries[i]; LEAFENTRY oldle = leafentries[i];
LEAFENTRY newle = toku_mempool_malloc(&B->u.l.buffer_mempool, leafentry_memsize(oldle), 1); LEAFENTRY newle = toku_mempool_malloc(&B->u.l.buffer_mempool, leafentry_memsize(oldle), 1);
assert(newle!=0); // it's a fresh mpool, so this should always work. assert(newle!=0); // it's a fresh mpool, so this should always work.
BOOL key_is_unique;
{
DBT xdbt,ydbt;
if (t->flags & TOKU_DB_DUPSORT) key_is_unique=TRUE;
else if (prevle==NULL) key_is_unique=TRUE;
else if (t->compare_fun(t->db,
toku_fill_dbt(&xdbt, le_any_key(prevle), le_any_keylen(prevle)),
toku_fill_dbt(&ydbt, le_any_key(oldle), le_any_keylen(oldle)))
==0) {
key_is_unique=FALSE;
} else {
key_is_unique=TRUE;
}
}
if (key_is_unique) diff_est.nkeys++;
diff_est.ndata++;
diff_est.dsize += le_any_keylen(oldle) + le_any_vallen(oldle);
//printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(oldle)+ le_any_vallen(oldle), diff_est.dsize);
diff_fp += toku_le_crc(oldle); diff_fp += toku_le_crc(oldle);
diff_size += OMT_ITEM_OVERHEAD + leafentry_disksize(oldle); diff_size += OMT_ITEM_OVERHEAD + leafentry_disksize(oldle);
memcpy(newle, oldle, leafentry_memsize(oldle)); memcpy(newle, oldle, leafentry_memsize(oldle));
toku_mempool_mfree(&node->u.l.buffer_mempool, oldle, leafentry_memsize(oldle)); free_us[i-break_at] = oldle; // don't free the old leafentries yet, since we compare them in the other iterations of the loops
leafentries[i] = newle; leafentries[i] = newle;
} }
for (i=break_at; i<n_leafentries; i++) {
LEAFENTRY oldle = free_us[i-break_at];
toku_mempool_mfree(&node->u.l.buffer_mempool, oldle, leafentry_memsize(oldle));
}
node->local_fingerprint -= node->rand4fingerprint * diff_fp; node->local_fingerprint -= node->rand4fingerprint * diff_fp;
B ->local_fingerprint += B ->rand4fingerprint * diff_fp; B ->local_fingerprint += B ->rand4fingerprint * diff_fp;
node->u.l.n_bytes_in_buffer -= diff_size; node->u.l.n_bytes_in_buffer -= diff_size;
B ->u.l.n_bytes_in_buffer += diff_size; B ->u.l.n_bytes_in_buffer += diff_size;
subtract_estimates(&node->u.l.leaf_stats, &diff_est);
add_estimates (&B->u.l.leaf_stats, &diff_est);
//printf("%s:%d After subtracint and adding got %lu and %lu\n", __FILE__, __LINE__, node->u.l.leaf_stats.dsize, B->u.l.leaf_stats.dsize);
} }
if ((r = toku_omt_create_from_sorted_array(&B->u.l.buffer, leafentries+break_at, n_leafentries-break_at))) return r; if ((r = toku_omt_create_from_sorted_array(&B->u.l.buffer, leafentries+break_at, n_leafentries-break_at))) return r;
if ((r = toku_omt_create_steal_sorted_array(&node->u.l.buffer, &leafentries, break_at, n_leafentries))) return r; if ((r = toku_omt_create_steal_sorted_array(&node->u.l.buffer, &leafentries, break_at, n_leafentries))) return r;
...@@ -815,6 +901,9 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk) ...@@ -815,6 +901,9 @@ brtleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *splitk)
toku_verify_all_in_mempool(B); toku_verify_all_in_mempool(B);
toku_omt_destroy(&old_omt); toku_omt_destroy(&old_omt);
node->u.l.leaf_stats = calc_leaf_stats(node);
B ->u.l.leaf_stats = calc_leaf_stats(B );
} }
//toku_verify_gpma(node->u.l.buffer); //toku_verify_gpma(node->u.l.buffer);
...@@ -873,8 +962,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -873,8 +962,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
B->u.n.n_children =n_children_in_b; B->u.n.n_children =n_children_in_b;
if (0) { if (0) {
printf("%s:%d %p (%" PRId64 ") splits, old estimates:", __FILE__, __LINE__, node, node->thisnodename.b); printf("%s:%d %p (%" PRId64 ") splits, old estimates:", __FILE__, __LINE__, node, node->thisnodename.b);
int i; //int i;
for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i)); //for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i));
printf("\n"); printf("\n");
} }
...@@ -890,7 +979,7 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -890,7 +979,7 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
if (r!=0) return r; if (r!=0) return r;
BNC_NBYTESINBUF(B,i)=0; BNC_NBYTESINBUF(B,i)=0;
BNC_SUBTREE_FINGERPRINT(B,i)=0; BNC_SUBTREE_FINGERPRINT(B,i)=0;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(B,i)=0; BNC_SUBTREE_ESTIMATES(B,i)=zero_estimates;
} }
verify_local_fingerprint_nonleaf(node); verify_local_fingerprint_nonleaf(node);
...@@ -948,8 +1037,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl ...@@ -948,8 +1037,8 @@ brt_nonleaf_split (BRT t, BRTNODE node, BRTNODE *nodea, BRTNODE *nodeb, DBT *spl
BNC_SUBTREE_FINGERPRINT(B, targchild) = BNC_SUBTREE_FINGERPRINT(node, i); BNC_SUBTREE_FINGERPRINT(B, targchild) = BNC_SUBTREE_FINGERPRINT(node, i);
BNC_SUBTREE_FINGERPRINT(node, i) = 0; BNC_SUBTREE_FINGERPRINT(node, i) = 0;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(B, targchild) = BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i); BNC_SUBTREE_ESTIMATES(B, targchild) = BNC_SUBTREE_ESTIMATES(node, i);
BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i) = 0; BNC_SUBTREE_ESTIMATES(node, i) = zero_estimates;
assert(BNC_NBYTESINBUF(node, i) == 0); assert(BNC_NBYTESINBUF(node, i) == 0);
} }
...@@ -1022,7 +1111,7 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -1022,7 +1111,7 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum,
XREALLOC_N(node->u.n.n_children+1, node->u.n.childkeys); XREALLOC_N(node->u.n.n_children+1, node->u.n.childkeys);
// Slide the children over. // Slide the children over.
BNC_SUBTREE_FINGERPRINT (node, node->u.n.n_children+1)=0; BNC_SUBTREE_FINGERPRINT (node, node->u.n.n_children+1)=0;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, node->u.n.n_children+1)=0; BNC_SUBTREE_ESTIMATES (node, node->u.n.n_children+1)=zero_estimates;
for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) { for (cnum=node->u.n.n_children; cnum>childnum+1; cnum--) {
node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1]; node->u.n.childinfos[cnum] = node->u.n.childinfos[cnum-1];
} }
...@@ -1033,8 +1122,8 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum, ...@@ -1033,8 +1122,8 @@ handle_split_of_child (BRT t, BRTNODE node, int childnum,
BNC_HAVE_FULLHASH(node, childnum+1) = TRUE; BNC_HAVE_FULLHASH(node, childnum+1) = TRUE;
BNC_FULLHASH(node, childnum+1) = childb->fullhash; BNC_FULLHASH(node, childnum+1) = childb->fullhash;
// BNC_SUBTREE_FINGERPRINT(node, childnum)=0; // leave the subtreefingerprint alone for the child, so we can log the change // BNC_SUBTREE_FINGERPRINT(node, childnum)=0; // leave the subtreefingerprint alone for the child, so we can log the change
BNC_SUBTREE_FINGERPRINT (node, childnum+1)=0; BNC_SUBTREE_FINGERPRINT(node, childnum+1)=0;
BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, childnum+1)=0; BNC_SUBTREE_ESTIMATES (node, childnum+1)=zero_estimates;
fixup_child_fingerprint(node, childnum, childa); fixup_child_fingerprint(node, childnum, childa);
fixup_child_fingerprint(node, childnum+1, childb); fixup_child_fingerprint(node, childnum+1, childb);
r=toku_fifo_create(&BNC_BUFFER(node,childnum+1)); assert(r==0); r=toku_fifo_create(&BNC_BUFFER(node,childnum+1)); assert(r==0);
...@@ -1096,8 +1185,8 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react) ...@@ -1096,8 +1185,8 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react)
{ {
if (0) { if (0) {
printf("%s:%d Node %" PRId64 "->u.n.n_children=%d estimates=", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children); printf("%s:%d Node %" PRId64 "->u.n.n_children=%d estimates=", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children);
int i; //int i;
for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i)); //for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i));
printf("\n"); printf("\n");
} }
assert(node->height>0); assert(node->height>0);
...@@ -1146,8 +1235,8 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react) ...@@ -1146,8 +1235,8 @@ brt_split_child (BRT t, BRTNODE node, int childnum, BOOL *did_react)
int r = handle_split_of_child (t, node, childnum, nodea, nodeb, &splitk); int r = handle_split_of_child (t, node, childnum, nodea, nodeb, &splitk);
if (0) { if (0) {
printf("%s:%d Node %" PRId64 "->u.n.n_children=%d estimates=", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children); printf("%s:%d Node %" PRId64 "->u.n.n_children=%d estimates=", __FILE__, __LINE__, node->thisnodename.b, node->u.n.n_children);
int i; //int i;
for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i)); //for (i=0; i<node->u.n.n_children; i++) printf(" %" PRIu64, BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i));
printf("\n"); printf("\n");
} }
return r; return r;
...@@ -1416,6 +1505,37 @@ apply_cmd_to_leaf (BRT_CMD cmd, ...@@ -1416,6 +1505,37 @@ apply_cmd_to_leaf (BRT_CMD cmd,
abort(); return 0; abort(); return 0;
} }
// Return 1 if the leafentry stored at position idx of node's buffer has
// exactly the same key bytes as le, and 0 otherwise.
// The fetch at idx is asserted to succeed.
static int
other_key_matches (BRTNODE node, u_int32_t idx, LEAFENTRY le)
{
    OMTVALUE fetched = 0;
    int r = toku_omt_fetch(node->u.l.buffer, idx, &fetched, (OMTCURSOR)NULL);
    assert(r==0);

    LEAFENTRY neighbor = fetched;
    u_int32_t neighbor_keylen = le_any_keylen(neighbor);
    if (neighbor_keylen != le_any_keylen(le)) {
        return 0;
    }
    // really should use comparison function
    return (memcmp(le_any_key(neighbor), le_any_key(le), neighbor_keylen)==0) ? 1 : 0;
}
/* Adjust node's leaf_stats.nkeys to account for leafentry le at position idx.
 *
 *   direction  amount added to nkeys when le's key is not shared with a
 *              neighbour (callers visible in this file pass +1 after an
 *              insert at idx and -1 before a delete at idx)
 *
 * When the node has TOKU_DB_DUPSORT set, the entries at idx-1 and idx+1 (where
 * they exist) are compared with le; if either carries the same key, nkeys is
 * left unchanged because the key is still (or was already) represented.
 * Without TOKU_DB_DUPSORT the neighbours are not examined.
 *
 * Asserts afterwards that the leaf statistics are flagged exact.
 */
static void
maybe_bump_nkeys (BRTNODE node, u_int32_t idx, LEAFENTRY le, int direction) {
    int shares_key_with_neighbor = 0;

    if ((node->flags & TOKU_DB_DUPSORT) != 0) {
        // Left neighbour, if any.
        if (idx>0 && other_key_matches(node, idx-1, le)) {
            shares_key_with_neighbor = 1;
        }
        // Right neighbour, if any.
        if (idx+1<toku_omt_size(node->u.l.buffer) && other_key_matches(node, idx+1, le)) {
            shares_key_with_neighbor = 1;
        }
    }

    node->u.l.leaf_stats.nkeys += shares_key_with_neighbor ? 0 : direction;
    assert(node->u.l.leaf_stats.exact);
}
static int static int
brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd, brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
u_int32_t idx, LEAFENTRY le) u_int32_t idx, LEAFENTRY le)
...@@ -1423,6 +1543,8 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd, ...@@ -1423,6 +1543,8 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
// idx is the location where it goes // idx is the location where it goes
// le is old leafentry // le is old leafentry
{ {
// brt_leaf_check_leaf_stats(node);
u_int32_t newlen=0, newdisksize=0; u_int32_t newlen=0, newdisksize=0;
LEAFENTRY new_le=0; LEAFENTRY new_le=0;
void *maybe_free = 0; void *maybe_free = 0;
...@@ -1443,9 +1565,22 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd, ...@@ -1443,9 +1565,22 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
if (le && new_le) { if (le && new_le) {
// If we are replacing a leafentry, then the counts on the estimates remain unchanged, but the size might change
{
u_int32_t oldlen = le_any_vallen(le);
assert(node->u.l.leaf_stats.dsize >= oldlen);
assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
node->u.l.leaf_stats.dsize -= oldlen;
node->u.l.leaf_stats.dsize += le_any_vallen(new_le); // add it in two pieces to avoid ugly overflow
assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
}
node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le); node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(le); node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(le);
//printf("%s:%d Added %u-%u got %lu\n", __FILE__, __LINE__, le_any_keylen(new_le), le_any_vallen(le), node->u.l.leaf_stats.dsize);
// the ndata and nkeys remains unchanged
u_int32_t size = leafentry_memsize(le); u_int32_t size = leafentry_memsize(le);
// This mfree must occur after the mempool_malloc so that when the mempool is compressed everything is accounted for. // This mfree must occur after the mempool_malloc so that when the mempool is compressed everything is accounted for.
...@@ -1461,19 +1596,38 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd, ...@@ -1461,19 +1596,38 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
if (le) { if (le) {
// It's there, note that it's gone and remove it from the mempool // It's there, note that it's gone and remove it from the mempool
// Figure out if one of the other keys is the same key
maybe_bump_nkeys(node, idx, le, -1);
if ((r = toku_omt_delete_at(node->u.l.buffer, idx))) goto return_r; if ((r = toku_omt_delete_at(node->u.l.buffer, idx))) goto return_r;
node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le); node->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + leafentry_disksize(le);
node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(le); node->local_fingerprint -= node->rand4fingerprint * toku_le_crc(le);
{
u_int32_t oldlen = le_any_vallen(le) + le_any_keylen(le);
assert(node->u.l.leaf_stats.dsize >= oldlen);
node->u.l.leaf_stats.dsize -= oldlen;
}
assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
node->u.l.leaf_stats.ndata --;
toku_mempool_mfree(&node->u.l.buffer_mempool, 0, leafentry_memsize(le)); // Must pass 0, since le may be no good any more. toku_mempool_mfree(&node->u.l.buffer_mempool, 0, leafentry_memsize(le)); // Must pass 0, since le may be no good any more.
} }
if (new_le) { if (new_le) {
if ((r = toku_omt_insert_at(node->u.l.buffer, new_le, idx))) goto return_r; if ((r = toku_omt_insert_at(node->u.l.buffer, new_le, idx))) goto return_r;
node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize; node->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + newdisksize;
node->local_fingerprint += node->rand4fingerprint*toku_le_crc(new_le); node->local_fingerprint += node->rand4fingerprint*toku_le_crc(new_le);
node->u.l.leaf_stats.dsize += le_any_vallen(new_le) + le_any_keylen(new_le);
assert(node->u.l.leaf_stats.dsize < (1U<<31)); // make sure we didn't underflow
node->u.l.leaf_stats.ndata ++;
// Look at the key to the left and the one to the right. If both are different then increment nkeys.
maybe_bump_nkeys(node, idx, new_le, +1);
} }
} }
r=0; r=0;
...@@ -1482,6 +1636,8 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd, ...@@ -1482,6 +1636,8 @@ brt_leaf_apply_cmd_once (BRTNODE node, BRT_CMD cmd,
if (maybe_free) toku_free(maybe_free); // if (maybe_free) toku_free(maybe_free); //
// brt_leaf_check_leaf_stats(node);
return r; return r;
} }
...@@ -1854,16 +2010,31 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) { ...@@ -1854,16 +2010,31 @@ merge_leaf_nodes (BRTNODE a, BRTNODE b) {
LEAFENTRY new_le = mempool_malloc_from_omt(omta, &a->u.l.buffer_mempool, le_size, 0); LEAFENTRY new_le = mempool_malloc_from_omt(omta, &a->u.l.buffer_mempool, le_size, 0);
assert(new_le); assert(new_le);
memcpy(new_le, le, le_size); memcpy(new_le, le, le_size);
int r = toku_omt_insert_at(omta, new_le, toku_omt_size(a->u.l.buffer)); int idx = toku_omt_size(a->u.l.buffer);
int r = toku_omt_insert_at(omta, new_le, idx);
assert(r==0); assert(r==0);
a->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size; a->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size;
a->local_fingerprint += a->rand4fingerprint * le_crc; a->local_fingerprint += a->rand4fingerprint * le_crc;
a->u.l.leaf_stats.ndata++;
maybe_bump_nkeys(a, idx, new_le, +1);
a->u.l.leaf_stats.dsize+= le_any_keylen(le) + le_any_vallen(le);
//printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+le_any_vallen(le), a->u.l.leaf_stats.dsize);
} }
{ {
maybe_bump_nkeys(b, 0, le, -1);
int r = toku_omt_delete_at(omtb, 0); int r = toku_omt_delete_at(omtb, 0);
assert(r==0); assert(r==0);
b->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + le_size; b->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + le_size;
b->local_fingerprint -= b->rand4fingerprint * le_crc; b->local_fingerprint -= b->rand4fingerprint * le_crc;
b->u.l.leaf_stats.ndata--;
b->u.l.leaf_stats.dsize-= le_any_keylen(le) + le_any_vallen(le);
//printf("%s:%d Subed %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+le_any_vallen(le), b->u.l.leaf_stats.dsize);
assert(b->u.l.leaf_stats.ndata < 1U<<31);
assert(b->u.l.leaf_stats.nkeys < 1U<<31);
assert(b->u.l.leaf_stats.dsize < 1U<<31);
toku_mempool_mfree(&b->u.l.buffer_mempool, 0, le_size); toku_mempool_mfree(&b->u.l.buffer_mempool, 0, le_size);
} }
} }
...@@ -1896,17 +2067,31 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk) ...@@ -1896,17 +2067,31 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
memcpy(new_le, le, le_size); memcpy(new_le, le, le_size);
int r = toku_omt_insert_at(omtto, new_le, to_idx); int r = toku_omt_insert_at(omtto, new_le, to_idx);
assert(r==0); assert(r==0);
maybe_bump_nkeys(to, to_idx, le, +1);
to ->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size; to ->u.l.n_bytes_in_buffer += OMT_ITEM_OVERHEAD + le_size;
to ->local_fingerprint += to->rand4fingerprint * le_crc; to ->local_fingerprint += to->rand4fingerprint * le_crc;
to->u.l.leaf_stats.ndata++;
to->u.l.leaf_stats.dsize+= le_any_keylen(le) + le_any_vallen(le);
//printf("%s:%d Added %u got %lu\n", __FILE__, __LINE__, le_any_keylen(le)+ le_any_vallen(le), to->u.l.leaf_stats.dsize);
} }
{ {
maybe_bump_nkeys(from, from_idx, le, -1);
int r = toku_omt_delete_at(omtfrom, from_idx); int r = toku_omt_delete_at(omtfrom, from_idx);
assert(r==0); assert(r==0);
from->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + le_size; from->u.l.n_bytes_in_buffer -= OMT_ITEM_OVERHEAD + le_size;
from->local_fingerprint -= from->rand4fingerprint * le_crc; from->local_fingerprint -= from->rand4fingerprint * le_crc;
from->u.l.leaf_stats.ndata--;
from->u.l.leaf_stats.dsize-= le_any_keylen(le) + le_any_vallen(le);
assert(from->u.l.leaf_stats.ndata < 1U<<31);
assert(from->u.l.leaf_stats.nkeys < 1U<<31);
//printf("%s:%d Removed %u get %lu\n", __FILE__, __LINE__, le_any_keylen(le)+ le_any_vallen(le), from->u.l.leaf_stats.dsize);
toku_mempool_mfree(&from->u.l.buffer_mempool, 0, le_size); toku_mempool_mfree(&from->u.l.buffer_mempool, 0, le_size);
} }
} }
assert(from->u.l.leaf_stats.dsize < 1U<<31);
assert(toku_omt_size(a->u.l.buffer)>0); assert(toku_omt_size(a->u.l.buffer)>0);
{ {
LEAFENTRY le = fetch_from_buf(a->u.l.buffer, toku_omt_size(a->u.l.buffer)-1); LEAFENTRY le = fetch_from_buf(a->u.l.buffer, toku_omt_size(a->u.l.buffer)-1);
...@@ -1924,7 +2109,8 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk) ...@@ -1924,7 +2109,8 @@ balance_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair **splitk)
static int static int
maybe_merge_pinned_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair *parent_splitk, BOOL *did_merge, struct kv_pair **splitk) maybe_merge_pinned_leaf_nodes (BRTNODE parent, int childnum_of_parent,
BRTNODE a, BRTNODE b, struct kv_pair *parent_splitk, BOOL *did_merge, struct kv_pair **splitk)
// Effect: Either merge a and b into one one node (merge them into a) and set *did_merge = TRUE. (We do this if the resulting node is not fissible) // Effect: Either merge a and b into one one node (merge them into a) and set *did_merge = TRUE. (We do this if the resulting node is not fissible)
// or distribute the leafentries evenly between a and b. (If a and be are already evenly distributed, we may do nothing.) // or distribute the leafentries evenly between a and b. (If a and be are already evenly distributed, we may do nothing.)
{ {
...@@ -1940,14 +2126,19 @@ maybe_merge_pinned_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair *parent_spli ...@@ -1940,14 +2126,19 @@ maybe_merge_pinned_leaf_nodes (BRTNODE a, BRTNODE b, struct kv_pair *parent_spli
} }
// one is less than 1/4 of a node, and together they are more than 3/4 of a node. // one is less than 1/4 of a node, and together they are more than 3/4 of a node.
toku_free(parent_splitk); // We don't need the parent_splitk any more. If we need a splitk (if we don't merge) we'll malloc a new one. toku_free(parent_splitk); // We don't need the parent_splitk any more. If we need a splitk (if we don't merge) we'll malloc a new one.
return balance_leaf_nodes(a, b, splitk); int r = balance_leaf_nodes(a, b, splitk);
if (r != 0) return r;
} else { } else {
// we are merging them. // we are merging them.
*did_merge = TRUE; *did_merge = TRUE;
*splitk = 0; *splitk = 0;
toku_free(parent_splitk); // if we are merging, the splitk gets freed. toku_free(parent_splitk); // if we are merging, the splitk gets freed.
return merge_leaf_nodes(a, b); int r = merge_leaf_nodes(a, b);
if (r != 0) return r;
} }
fixup_child_fingerprint(parent, childnum_of_parent, a);
fixup_child_fingerprint(parent, childnum_of_parent+1, b);
return 0;
} }
static int static int
...@@ -2028,7 +2219,7 @@ maybe_merge_pinned_nodes (BRT t, ...@@ -2028,7 +2219,7 @@ maybe_merge_pinned_nodes (BRT t,
verify_local_fingerprint_nonleaf(a); verify_local_fingerprint_nonleaf(a);
parent->dirty = 1; // just to make sure parent->dirty = 1; // just to make sure
if (a->height == 0) { if (a->height == 0) {
return maybe_merge_pinned_leaf_nodes(a, b, parent_splitk, did_merge, splitk); return maybe_merge_pinned_leaf_nodes(parent, childnum_of_parent, a, b, parent_splitk, did_merge, splitk);
} else { } else {
int r = maybe_merge_pinned_nonleaf_nodes(t, parent, childnum_of_parent, parent_splitk, a, b, did_merge, splitk); int r = maybe_merge_pinned_nonleaf_nodes(t, parent, childnum_of_parent, parent_splitk, a, b, did_merge, splitk);
verify_local_fingerprint_nonleaf(a); verify_local_fingerprint_nonleaf(a);
...@@ -2194,6 +2385,8 @@ brt_handle_maybe_reactive_child_at_root (BRT brt, CACHEKEY *rootp, BRTNODE *node ...@@ -2194,6 +2385,8 @@ brt_handle_maybe_reactive_child_at_root (BRT brt, CACHEKEY *rootp, BRTNODE *node
int r = brt_nonleaf_split(brt, node, &nodea, &nodeb, &splitk); int r = brt_nonleaf_split(brt, node, &nodea, &nodeb, &splitk);
if (r!=0) return r; if (r!=0) return r;
} }
//verify_local_fingerprint_nonleaf(nodea);
//verify_local_fingerprint_nonleaf(nodeb);
return brt_init_new_root(brt, nodea, nodeb, splitk, rootp, logger, nodep); return brt_init_new_root(brt, nodea, nodeb, splitk, rootp, logger, nodep);
} }
case RE_FUSIBLE: case RE_FUSIBLE:
...@@ -4278,7 +4471,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, u_int32_t fu ...@@ -4278,7 +4471,7 @@ static void toku_brt_keyrange_internal (BRT brt, CACHEKEY nodename, u_int32_t fu
for (i=0; i<node->u.n.n_children; i++) { for (i=0; i<node->u.n.n_children; i++) {
int prevcomp = (i==0) ? -1 : compares[i-1]; int prevcomp = (i==0) ? -1 : compares[i-1];
int nextcomp = (i+1 >= n_keys) ? 1 : compares[i]; int nextcomp = (i+1 >= n_keys) ? 1 : compares[i];
int subest = BNC_SUBTREE_LEAFENTRY_ESTIMATE(node, i); int subest = BNC_SUBTREE_ESTIMATES(node, i).ndata;
if (nextcomp < 0) { if (nextcomp < 0) {
// We're definitely looking too far to the left // We're definitely looking too far to the left
*less += subest; *less += subest;
...@@ -4335,6 +4528,39 @@ int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u ...@@ -4335,6 +4528,39 @@ int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u
return 0; return 0;
} }
/* Fetch key/data/size statistics for the tree by inspecting only its root node.
 *
 *   brt    tree to query; brt->h is asserted to be non-NULL
 *   txn    not used by this function (wrapped in UU())
 *   nkeys, ndata, dsize
 *          outputs; written only if the root node was pinned successfully
 *
 * If the root is a leaf (height 0) the values are copied from its leaf_stats.
 * Otherwise they are the sums of the per-child subtree estimates recorded in
 * the root.
 *
 * Returns 0 on success, or the nonzero result of the cachetable pin or unpin
 * call that failed.
 */
int toku_brt_stat64 (BRT brt, TOKUTXN UU(txn), u_int64_t *nkeys, u_int64_t *ndata, u_int64_t *dsize) {
    assert(brt->h);

    u_int32_t root_hash;
    CACHEKEY *root_ptr = toku_calculate_root_offset_pointer(brt, &root_hash);
    CACHEKEY root_key = *root_ptr;

    void *pinned;
    int pin_result = toku_cachetable_get_and_pin(brt->cf, root_key, root_hash,
                                                 &pinned, NULL,
                                                 toku_brtnode_flush_callback, toku_brtnode_fetch_callback, brt->h);
    if (pin_result!=0) return pin_result;

    BRTNODE root_node = pinned;
    if (root_node->height==0) {
        *nkeys = root_node->u.l.leaf_stats.nkeys;
        *ndata = root_node->u.l.leaf_stats.ndata;
        *dsize = root_node->u.l.leaf_stats.dsize;
    } else {
        *nkeys = *ndata = *dsize = 0;
        int child;
        for (child=0; child<root_node->u.n.n_children; child++) {
            *nkeys += BNC_SUBTREE_ESTIMATES(root_node, child).nkeys;
            *ndata += BNC_SUBTREE_ESTIMATES(root_node, child).ndata;
            *dsize += BNC_SUBTREE_ESTIMATES(root_node, child).dsize;
        }
    }

    // The node was only read, so it is released as clean; the unpin result is the function result.
    return toku_cachetable_unpin(brt->cf, root_key, root_hash, CACHETABLE_CLEAN, 0);
}
/* ********************* debugging dump ************************ */ /* ********************* debugging dump ************************ */
static int static int
toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen) { toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lorange, ITEMLEN lolen, bytevec hirange, ITEMLEN hilen) {
...@@ -4363,7 +4589,13 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo ...@@ -4363,7 +4589,13 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo
fprintf(file, "\n"); fprintf(file, "\n");
} }
for (i=0; i< node->u.n.n_children; i++) { for (i=0; i< node->u.n.n_children; i++) {
fprintf(file, "%*schild %d buffered (%d entries):\n", depth+1, "", i, toku_fifo_n_entries(BNC_BUFFER(node,i))); fprintf(file, "%*schild %d buffered (%d entries):", depth+1, "", i, toku_fifo_n_entries(BNC_BUFFER(node,i)));
{
struct subtree_estimates *e = &BNC_SUBTREE_ESTIMATES(node, i);
fprintf(file, " est={n=%" PRIu64 " k=%" PRIu64 " s=%" PRIu64 " e=%d}",
e->ndata, e->nkeys, e->dsize, e->exact);
}
fprintf(file, "\n");
FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid, FIFO_ITERATE(BNC_BUFFER(node,i), key, keylen, data, datalen, type, xid,
{ {
data=data; datalen=datalen; keylen=keylen; data=data; datalen=datalen; keylen=keylen;
...@@ -4389,9 +4621,13 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo ...@@ -4389,9 +4621,13 @@ toku_dump_brtnode (FILE *file, BRT brt, BLOCKNUM blocknum, int depth, bytevec lo
fprintf(file, "%*sNode %" PRId64 " nodesize=%u height=%d n_bytes_in_buffer=%u keyrange (key only)=", fprintf(file, "%*sNode %" PRId64 " nodesize=%u height=%d n_bytes_in_buffer=%u keyrange (key only)=",
depth, "", blocknum.b, node->nodesize, node->height, node->u.l.n_bytes_in_buffer); depth, "", blocknum.b, node->nodesize, node->height, node->u.l.n_bytes_in_buffer);
if (lorange) { toku_print_BYTESTRING(file, lolen, (void*)lorange); } else { fprintf(file, "-\\infty"); } fprintf(file, " "); if (lorange) { toku_print_BYTESTRING(file, lolen, (void*)lorange); } else { fprintf(file, "-\\infty"); } fprintf(file, " ");
if (hirange) { toku_print_BYTESTRING(file, hilen, (void*)hirange); } else { fprintf(file, "\\infty"); } fprintf(file, "\n"); if (hirange) { toku_print_BYTESTRING(file, hilen, (void*)hirange); } else { fprintf(file, "\\infty"); }
fprintf(file, " est={n=%" PRIu64 " k=%" PRIu64 " s=%" PRIu64 " e=%d}",
node->u.l.leaf_stats.ndata, node->u.l.leaf_stats.nkeys, node->u.l.leaf_stats.dsize, node->u.l.leaf_stats.exact);
fprintf(file, "\n");
int size = toku_omt_size(node->u.l.buffer); int size = toku_omt_size(node->u.l.buffer);
int i; int i;
if (0)
for (i=0; i<size; i++) { for (i=0; i<size; i++) {
OMTVALUE v = 0; OMTVALUE v = 0;
r = toku_omt_fetch(node->u.l.buffer, i, &v, 0); r = toku_omt_fetch(node->u.l.buffer, i, &v, 0);
......
...@@ -125,6 +125,7 @@ enum brt_header_flags { ...@@ -125,6 +125,7 @@ enum brt_header_flags {
}; };
int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater); int toku_brt_keyrange (BRT brt, DBT *key, u_int64_t *less, u_int64_t *equal, u_int64_t *greater);
int toku_brt_stat64 (BRT, TOKUTXN, u_int64_t *nkeys, u_int64_t *ndata, u_int64_t *dsize);
void toku_brt_init(void); void toku_brt_init(void);
void toku_brt_destroy(void); void toku_brt_destroy(void);
......
...@@ -118,7 +118,8 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) { ...@@ -118,7 +118,8 @@ dump_node (int f, BLOCKNUM blocknum, struct brt_header *h) {
printf(" subleafentry_estimates={"); printf(" subleafentry_estimates={");
for (i=0; i<n->u.n.n_children; i++) { for (i=0; i<n->u.n.n_children; i++) {
if (i>0) printf(" "); if (i>0) printf(" ");
printf("%llu", (unsigned long long)(BNC_SUBTREE_LEAFENTRY_ESTIMATE(n, i))); struct subtree_estimates *est = &(BNC_SUBTREE_ESTIMATES(n, i));
printf("{nkey=%" PRIu64 " ndata=%" PRIu64 " dsize=%" PRIu64 "}", est->nkeys, est->ndata, est->dsize);
} }
printf("}\n"); printf("}\n");
printf(" pivots:\n"); printf(" pivots:\n");
......
...@@ -36,8 +36,14 @@ static void test_serialize(void) { ...@@ -36,8 +36,14 @@ static void test_serialize(void) {
BNC_BLOCKNUM(&sn, 1).b = 35; BNC_BLOCKNUM(&sn, 1).b = 35;
BNC_SUBTREE_FINGERPRINT(&sn, 0) = random(); BNC_SUBTREE_FINGERPRINT(&sn, 0) = random();
BNC_SUBTREE_FINGERPRINT(&sn, 1) = random(); BNC_SUBTREE_FINGERPRINT(&sn, 1) = random();
BNC_SUBTREE_LEAFENTRY_ESTIMATE(&sn, 0) = random() + (((long long)random())<<32); BNC_SUBTREE_ESTIMATES(&sn, 0).ndata = random() + (((long long)random())<<32);
BNC_SUBTREE_LEAFENTRY_ESTIMATE(&sn, 1) = random() + (((long long)random())<<32); BNC_SUBTREE_ESTIMATES(&sn, 1).ndata = random() + (((long long)random())<<32);
BNC_SUBTREE_ESTIMATES(&sn, 0).nkeys = random() + (((long long)random())<<32);
BNC_SUBTREE_ESTIMATES(&sn, 1).nkeys = random() + (((long long)random())<<32);
BNC_SUBTREE_ESTIMATES(&sn, 0).dsize = random() + (((long long)random())<<32);
BNC_SUBTREE_ESTIMATES(&sn, 1).dsize = random() + (((long long)random())<<32);
BNC_SUBTREE_ESTIMATES(&sn, 0).exact = random()%2;
BNC_SUBTREE_ESTIMATES(&sn, 1).exact = random()%2;
r = toku_fifo_create(&BNC_BUFFER(&sn,0)); assert(r==0); r = toku_fifo_create(&BNC_BUFFER(&sn,0)); assert(r==0);
r = toku_fifo_create(&BNC_BUFFER(&sn,1)); assert(r==0); r = toku_fifo_create(&BNC_BUFFER(&sn,1)); assert(r==0);
r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, (TXNID)0); assert(r==0); sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, (TXNID)0, "a", 2, "aval", 5); r = toku_fifo_enq(BNC_BUFFER(&sn,0), "a", 2, "aval", 5, BRT_NONE, (TXNID)0); assert(r==0); sn.local_fingerprint += randval*toku_calc_fingerprint_cmd(BRT_NONE, (TXNID)0, "a", 2, "aval", 5);
...@@ -85,7 +91,9 @@ static void test_serialize(void) { ...@@ -85,7 +91,9 @@ static void test_serialize(void) {
int i; int i;
for (i=0; i<2; i++) { for (i=0; i<2; i++) {
assert(BNC_SUBTREE_FINGERPRINT(dn, i)==BNC_SUBTREE_FINGERPRINT(&sn, i)); assert(BNC_SUBTREE_FINGERPRINT(dn, i)==BNC_SUBTREE_FINGERPRINT(&sn, i));
assert(BNC_SUBTREE_LEAFENTRY_ESTIMATE(dn, i)==BNC_SUBTREE_LEAFENTRY_ESTIMATE(&sn, i)); assert(BNC_SUBTREE_ESTIMATES(dn, i).nkeys==BNC_SUBTREE_ESTIMATES(&sn, i).nkeys);
assert(BNC_SUBTREE_ESTIMATES(dn, i).ndata==BNC_SUBTREE_ESTIMATES(&sn, i).ndata);
assert(BNC_SUBTREE_ESTIMATES(dn, i).dsize==BNC_SUBTREE_ESTIMATES(&sn, i).dsize);
} }
assert(dn->local_fingerprint==sn.local_fingerprint); assert(dn->local_fingerprint==sn.local_fingerprint);
} }
......
...@@ -66,6 +66,7 @@ BDB_DONTRUN_TESTS = \ ...@@ -66,6 +66,7 @@ BDB_DONTRUN_TESTS = \
keyrange-dupsort \ keyrange-dupsort \
keyrange-dupsort-unflat \ keyrange-dupsort-unflat \
manyfiles \ manyfiles \
stat64 \
test938c \ test938c \
test1324 \ test1324 \
helgrind1 \ helgrind1 \
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include "test.h"
#include <db.h>
#include <sys/stat.h>
/* Insert N distinct key/value pairs into a freshly created database and check
 * that DB->stat64 reports exactly N keys, N data items, and a data size equal
 * to the summed byte lengths of all keys and values (each length includes the
 * terminating NUL, since that is what is stored).
 *
 *   N  number of pairs to insert; pair i is ("hello<i>", "there<i>")
 */
static void
test_stat64 (unsigned int N)
{
    int r;

    // Start from an empty environment directory; a failed setup step fails the test.
    r = system("rm -rf " ENVDIR);                              CKERR(r);
    r = toku_os_mkdir(ENVDIR, S_IRWXU+S_IRWXG+S_IRWXO);        CKERR(r);

    DB_ENV *env;
    DB *db;
    DB_TXN *txn;
    r = db_env_create(&env, 0);                                CKERR(r);
    r = env->open(env, ENVDIR, DB_INIT_LOCK|DB_INIT_LOG|DB_INIT_MPOOL|DB_INIT_TXN|DB_CREATE|DB_PRIVATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
    r = db_create(&db, env, 0);                                CKERR(r);

    // Create the dictionary inside its own transaction.
    {
        r=env->txn_begin(env, 0, &txn, 0);                     assert(r==0);
        r=db->open(db, txn, "foo.db", 0, DB_BTREE, DB_CREATE, S_IRWXU+S_IRWXG+S_IRWXO); CKERR(r);
        r=txn->commit(txn, 0);                                 assert(r==0);
    }

    // Insert the pairs, tracking the expected total of key+value bytes.
    r=env->txn_begin(env, 0, &txn, 0);                         CKERR(r);
    unsigned int i;
    u_int64_t dsize=0;
    for (i=0; i<N; i++) {
        char hello[30], there[30];
        // i is unsigned, so it must be formatted with %u rather than %d.
        snprintf(hello, sizeof(hello), "hello%u", i);
        snprintf(there, sizeof(there), "there%u", i);
        DBT key, val;
        r=db->put(db, txn,
                  dbt_init(&key, hello, strlen(hello)+1),
                  dbt_init(&val, there, strlen(there)+1),
                  DB_YESOVERWRITE);
        CKERR(r);
        // Only count the pair once the put is known to have succeeded.
        dsize += strlen(hello)+1 + strlen(there)+1;
    }
    r=txn->commit(txn, 0);                                     CKERR(r);

    // Query the statistics in a new transaction and compare with expectations.
    r=env->txn_begin(env, 0, &txn, 0);                         CKERR(r);
    DB_BTREE_STAT64 s;
    r=db->stat64(db, txn, &s);                                 CKERR(r);
    if (verbose) {
        printf("nkeys=%" PRIu64 "\nndata=%" PRIu64 "\ndsize=%" PRIu64 "\n",
               s.bt_nkeys, s.bt_ndata, s.bt_dsize);
        printf("expected dsize=%" PRIu64 "\n", dsize);
    }
    assert(s.bt_nkeys==N);
    assert(s.bt_ndata==N);
    assert(s.bt_dsize==dsize);
    r=txn->commit(txn, 0);                                     CKERR(r);

    r=db->close(db, 0);                                        CKERR(r);
    r=env->close(env, 0);                                      CKERR(r);
}
/* Test entry point: parse the command-line arguments, then run test_stat64
 * first with 40000 and then with 400000 rows. Always returns 0; failures
 * surface through the assertions inside test_stat64.
 */
int
test_main (int argc, const char *argv[])
{
    static const unsigned int row_counts[] = { 40000, 400000 };
    unsigned int which;

    parse_args(argc, argv);
    for (which = 0; which < sizeof(row_counts)/sizeof(row_counts[0]); which++) {
        test_stat64(row_counts[which]);
    }
    return 0;
}
...@@ -3156,14 +3156,17 @@ static int toku_db_set_pagesize(DB *db, u_int32_t pagesize) { ...@@ -3156,14 +3156,17 @@ static int toku_db_set_pagesize(DB *db, u_int32_t pagesize) {
return r; return r;
} }
#if 0 static int toku_db_stat64(DB * db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
static int toku_db_stat(DB * db, void *v, u_int32_t flags) {
HANDLE_PANICKED_DB(db); HANDLE_PANICKED_DB(db);
v=v; flags=flags; return toku_brt_stat64(db->i->brt, txn->i->tokutxn, &s->bt_nkeys, &s->bt_ndata, &s->bt_dsize);
toku_ydb_barf(); }
abort(); static int locked_db_stat64 (DB *db, DB_TXN *txn, DB_BTREE_STAT64 *s) {
toku_ydb_lock();
int r = toku_db_stat64(db, txn, s);
toku_ydb_unlock();
return r;
} }
#endif
static int toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) { static int toku_db_key_range64(DB* db, DB_TXN* txn __attribute__((__unused__)), DBT* key, u_int64_t* less, u_int64_t* equal, u_int64_t* greater, int* is_exact) {
HANDLE_PANICKED_DB(db); HANDLE_PANICKED_DB(db);
...@@ -3471,7 +3474,7 @@ static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) { ...@@ -3471,7 +3474,7 @@ static int toku_db_create(DB ** db, DB_ENV * env, u_int32_t flags) {
SDB(set_pagesize); SDB(set_pagesize);
SDB(set_flags); SDB(set_flags);
SDB(get_flags); SDB(get_flags);
// SDB(stat); SDB(stat64);
SDB(fd); SDB(fd);
SDB(pre_acquire_read_lock); SDB(pre_acquire_read_lock);
SDB(pre_acquire_table_lock); SDB(pre_acquire_table_lock);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment