Commit dda9e4e1 authored by Rich Prohaska

save the hashtables. addresses #250

git-svn-id: file:///svn/tokudb@1598 c7de825b-a66e-492c-adef-691d508d4ae1
parent 46d4357e
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
/* Hash table with chaining. */
#include "hashtable.h"
#include "memory.h"
#include "primes.h"
// #include "../include/ydb-constants.h"
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "key.h"
#include "yerror.h"
#include "hashfun.h"
/* Consistency check for a key/data list.
   Walks the whole chain from the head and returns nonzero when the tail
   pointer is the last element reached (both null for an empty list),
   zero otherwise. */
static int hashelt_list_verify(struct hashelt_list *helist) {
    HASHELT last = 0;
    HASHELT cur;
    for (cur = helist->head; cur; cur = cur->next) {
        last = cur;
    }
    return helist->tail == last;
}
/* Put a key/data list into the empty state: no head and no tail. */
static inline void hashelt_list_init(struct hashelt_list *helist) {
    helist->tail = 0;
    helist->head = 0;
}
/* Append element e at the end of the list so that it becomes the new tail
   (and also the head when the list was empty).
   The list invariant is asserted both on entry and on exit. */
static inline void hashelt_list_append(struct hashelt_list *helist, HASHELT e) {
    assert(hashelt_list_verify(helist));
    HASHELT old_tail = helist->tail;
    e->next = 0;
    if (old_tail == 0) {
        /* Empty list: the new element is the head as well. */
        helist->head = e;
    } else {
        old_tail->next = e;
    }
    helist->tail = e;
    assert(hashelt_list_verify(helist));
}
/* Detach and return the first element of the list, or return null when the
   list is empty.  The returned element's next pointer is left as it was. */
static inline HASHELT hashelt_list_pop(struct hashelt_list *helist) {
    assert(hashelt_list_verify(helist));
    HASHELT first = helist->head;
    if (first == 0) {
        return first;
    }
    helist->head = first->next;
    if (first->next == 0) {
        /* That was the only element, so the list is now empty. */
        helist->tail = 0;
    }
    assert(hashelt_list_verify(helist));
    return first;
}
/* Return the first element of the list without removing it
   (null when the list is empty). */
static inline HASHELT hashelt_list_peek(struct hashelt_list *helist) {
    HASHELT first = helist->head;
    return first;
}
/* Create an empty hash table that allows duplicate keys.
   The bucket array is sized with the first prime returned by toku_get_prime.
   On success stores the new table in *h and returns 0.
   Returns -1, leaving *h untouched, if either the table header or the
   bucket array cannot be allocated. */
int toku_hashtable_create (HASHTABLE *h) {
    HASHTABLE MALLOC(tab);
    unsigned int i;
    if (tab==0) return -1;
    tab->n_keys=0;
    tab->primeidx=0;
    tab->arraysize=toku_get_prime(tab->primeidx);
    assert(sizeof(*tab->array)==sizeof(void*));
    tab->array = toku_calloc(tab->arraysize, sizeof(*tab->array));
    if (tab->array==0) {
        /* Do not leak the header, and do not touch a null bucket array below. */
        toku_free_n(tab, sizeof(*tab));
        return -1;
    }
    for (i=0; i<tab->arraysize; i++) tab->array[i]=0;
    tab->allow_dups = 1;
    *h=tab;
    return 0;
}
/* Record whether duplicate keys are accepted by later inserts
   (nonzero: allowed, zero: rejected).  Always returns 0. */
int toku_hashtable_set_dups (HASHTABLE tab, unsigned int allow_dups) {
    int r = 0;
    tab->allow_dups = allow_dups;
    return r;
}
/* Locate the duplicate-list node holding KEY in the bucket selected by HASH.
   On return *dup_ptr is the matching node, or null when the key is absent.
   *prev_ptr is the address of the link that points at the match; when there
   is no match it is the address of the null link ending the bucket chain.
   Every node in the chain is expected to hold at least one element. */
static void hash_find_internal (HASHTABLE tab, unsigned int hash, const unsigned char *key, ITEMLEN keylen, HASHDUP *dup_ptr, HASHDUP **prev_ptr) {
    HASHDUP *link = &tab->array[hash % tab->arraysize];
    HASHDUP found = 0;
    while (*link) {
        HASHDUP cand = *link;
        HASHELT first = hashelt_list_peek(&cand->kdlist);
        assert(first);
        /* All elements of one node share a key, so comparing the first suffices. */
        if (keylen==first->keylen && memcmp(key, first->keyval, keylen)==0) {
            found = cand;
            break;
        }
        link = &cand->next;
    }
    *prev_ptr = link;
    *dup_ptr = found;
}
/* Look up the idx-th entry (0-based, in insertion order) stored under KEY.
   Returns 0 and fills *data, *datalen and *type on success.
   Returns -1 when the key is absent, and -2 when the key exists but has
   no entry at position idx.  An idx <= 0 selects the first entry.
   *data points into the hashtable's own storage. */
int toku_hash_find_idx (HASHTABLE tab, bytevec key, ITEMLEN keylen, int idx, bytevec *data, ITEMLEN *datalen, int *type) {
    HASHDUP dup, *prev;
    hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev);
    if (dup==0) {
        return -1;
    }
    HASHELT he = hashelt_list_peek(&dup->kdlist);
    int remaining = idx;
    while (remaining > 0) {
        he = he->next;
        if (he == 0) {
            return -2;
        }
        remaining--;
    }
    *data = &he->keyval[he->keylen];
    *datalen = he->vallen;
    *type = he->type;
    return 0;
}
/* Look up the first entry stored under KEY.
   Returns 0 and fills *data, *datalen and *type when the key is present,
   -1 otherwise.  *data points into the hashtable's own storage. */
int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type) {
    HASHDUP dup, *prev;
    hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev);
    if (dup==0) {
        return -1;
    }
    HASHELT first = hashelt_list_peek(&dup->kdlist);
    *data = &first->keyval[first->keylen];
    *datalen = first->vallen;
    *type = first->type;
    return 0;
}
/* Resize the bucket array to the prime at index (primeidx + primeindexdelta)
   and move every duplicate-list node into its new bucket.
   The delta is declared unsigned, but callers in this file pass +1 and -1;
   the sum is stored in an int and asserted to be non-negative.
   Each node is rehashed using the hash cached in its first element.
   Allocation failure is caught by assert only.  Always returns 0. */
int toku_hash_rehash_everything (HASHTABLE tab, unsigned int primeindexdelta) {
    int next_primeidx = primeindexdelta+tab->primeidx;
    assert(next_primeidx>=0);
    unsigned int next_size = toku_get_prime(next_primeidx);
    HASHDUP *next_array = toku_calloc(next_size, sizeof(*tab->array));
    assert(next_array!=0);
    tab->primeidx=next_primeidx;
    unsigned int b;
    for (b=0; b<next_size; b++) {
        next_array[b]=0;
    }
    for (b=0; b<tab->arraysize; b++) {
        HASHDUP dup = tab->array[b];
        while (dup!=0) {
            HASHDUP rest = dup->next;
            HASHELT first = hashelt_list_peek(&dup->kdlist); assert(first);
            unsigned int slot = first->hash%next_size;
            /* Push the node onto the front of its new bucket. */
            dup->next = next_array[slot];
            next_array[slot] = dup;
            dup = rest;
        }
        tab->array[b] = 0;
    }
    toku_free(tab->array);
    tab->array=next_array;
    tab->arraysize=next_size;
    return 0;
}
/* Insert a copy of the key/value pair, tagged with TYPE.
   A key not yet present gets a new duplicate-list node at the front of its
   bucket.  A key already present gets the pair appended to its list when
   duplicates are allowed; otherwise BRT_ALREADY_THERE is returned and
   nothing is stored.
   Key and value bytes are copied into one allocation together with the
   element header.  Allocation failures are caught by assert only.
   Once the number of stored pairs exceeds the bucket count the table grows
   by one prime step and the rehash result is returned; otherwise BRT_OK. */
int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *val, ITEMLEN vallen, int type)
{
    unsigned int hk = hash_key (key,keylen);
    HASHDUP dup,*prev_ptr;
    hash_find_internal(tab, hk, key, keylen, &dup, &prev_ptr);
    if (dup != 0) {
        if (!tab->allow_dups) {
            return BRT_ALREADY_THERE;
        }
    } else {
        unsigned int bucket = hk%tab->arraysize;
        dup = toku_malloc(sizeof *dup);
        assert(dup);
        hashelt_list_init(&dup->kdlist);
        dup->next = tab->array[bucket];
        tab->array[bucket]=dup;
    }
    HASHELT elt=toku_malloc(sizeof(*elt)+keylen+vallen);
    assert(elt); // ?????
    elt->hash = hk;
    elt->type = type;
    elt->keylen = keylen;
    elt->vallen = vallen;
    /* Layout of keyval: key bytes first, value bytes right after them. */
    memmove(&elt->keyval[0], key, keylen);
    memmove(&elt->keyval[keylen], val, vallen);
    hashelt_list_append(&dup->kdlist, elt);
    tab->n_keys++;
    if (tab->n_keys <= tab->arraysize) {
        return BRT_OK;
    }
    return toku_hash_rehash_everything(tab, +1);
}
/* Remove the oldest entry stored under KEY.
   Returns DB_NOTFOUND when the key is absent.  Otherwise frees the entry,
   drops the duplicate-list node too if that was its last entry, and
   returns BRT_OK -- or the rehash result when the table shrinks because
   fewer than a quarter of the buckets' worth of entries remain. */
int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen) {
    HASHDUP dup, *prev_ptr;
    hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr);
    if (dup==0) {
        return DB_NOTFOUND;
    }
    assert(*prev_ptr==dup);
    HASHELT victim = hashelt_list_pop(&dup->kdlist);
    assert(victim);
    toku_free_n(victim, sizeof(*victim)+victim->keylen+victim->vallen);
    tab->n_keys--;
    if (hashelt_list_peek(&dup->kdlist) == 0) {
        /* No entries left under this key: unlink the node from its bucket. */
        *prev_ptr = dup->next;
        toku_free_n(dup, sizeof *dup);
    }
    if (tab->primeidx>0 && (tab->n_keys * 4 < tab->arraysize)) {
        return toku_hash_rehash_everything(tab, -1);
    }
    return BRT_OK;
}
/* Remove every entry stored under KEY.
   Returns DB_NOTFOUND when the key is absent.  Otherwise unlinks the
   duplicate-list node, frees all of its entries and the node itself, and
   returns BRT_OK -- or the rehash result when the table shrinks. */
int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen) {
    HASHDUP dup, *prev_ptr;
    hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr);
    if (dup==0) {
        return DB_NOTFOUND;
    }
    assert(*prev_ptr==dup);
    /* Take the node out of the bucket chain first ... */
    *prev_ptr = dup->next;
    /* ... then release each key/data pair it holds. */
    HASHELT victim = hashelt_list_pop(&dup->kdlist);
    while (victim != 0) {
        toku_free_n(victim, sizeof(*victim)+victim->keylen+victim->vallen);
        tab->n_keys--;
        victim = hashelt_list_pop(&dup->kdlist);
    }
    toku_free_n(dup, sizeof *dup);
    if (tab->primeidx>0 && (tab->n_keys * 4 < tab->arraysize)) {
        return toku_hash_rehash_everything(tab, -1);
    }
    return BRT_OK;
}
/* Pick an entry starting from a caller-supplied random number.
   Scanning begins at bucket (*randomnumber % arraysize) and wraps around
   until a non-empty bucket is found.  The first entry of that bucket's
   first node is reported through the out-parameters, the bucket index is
   written back to *randomnumber, and 0 is returned.
   Returns -1 when every bucket is empty.
   The returned key and data pointers refer to the hashtable's own storage. */
int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, int *type, long int *randomnumber) {
    unsigned int probes;
    unsigned int slot = (*randomnumber)%h->arraysize;
    for (probes=0; probes<h->arraysize; probes++, slot++) {
        if (slot>=h->arraysize) slot=0;
        HASHDUP dup=h->array[slot];
        if (dup == 0) {
            continue;
        }
        HASHELT first = hashelt_list_peek(&dup->kdlist); assert(first);
        *key = &first->keyval[0];
        *keylen = first->keylen;
        *data = &first->keyval[first->keylen];
        *datalen = first->vallen;
        *type = first->type;
        *randomnumber = slot;
        return 0;
    }
    return -1;
}
/* Disabled code: hashtable_find_last scans the whole table and reports the
   key/data pair whose key compares greatest under toku_keycompare, returning
   0 on success and -1 when the table is empty.
   NOTE(review): this is stale -- it invokes HASHTABLE_ITERATE with six
   arguments, while the macro defined in hashtable.h takes seven (it has a
   typevar parameter).  It would need updating before being re-enabled. */
#if 0
int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen) {
bytevec best_k=0, best_d;
ITEMLEN best_kl, best_dl;
HASHTABLE_ITERATE(h, this_k, this_kl, this_d, this_dl,
({
if (best_k==0 || toku_keycompare(best_k, best_kl, this_k, this_kl)<0) {
best_k = this_k;
best_kl = this_kl;
best_d = this_d;
best_dl = this_dl;
}
}));
if (best_k) {
*key = best_k;
*keylen = best_kl;
*data = best_d;
*datalen = best_dl;
return 0;
} else {
return -1;
}
}
#endif
/* Call f once for every key/data pair in the table, duplicates included,
   passing args through unchanged as the last argument.
   Pairs are visited bucket by bucket, and within one key in list order. */
void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key, ITEMLEN keylen, bytevec data, ITEMLEN datalen, int type, void*args), void* args) {
    HASHTABLE_ITERATE(tab, k, klen, d, dlen, ty,
                      f(k, klen, d, dlen, ty, args));
}
/* Number of key/data pairs currently stored (the table's n_keys counter). */
int toku_hashtable_n_entries(HASHTABLE tab) {
    int count = tab->n_keys;
    return count;
}
/* Free every element of the chain that starts at elt (null is accepted).
   The key and value bytes live inside each element's own allocation, so
   they are released together with the element.
   Implemented as a loop rather than by recursion so that stack usage does
   not grow with the length of the chain. */
static void hasheltlist_free (HASHELT elt) {
    while (elt != 0) {
        /* Save the successor before the element is released. */
        HASHELT next = elt->next;
        toku_free_n(elt, sizeof(*elt)+elt->keylen+elt->vallen);
        elt = next;
    }
}
/* Destroy the hashtable: clear it (which frees every stored element and
   duplicate-list node), free the bucket array and the table header, and
   set the caller's handle to null. */
void toku_hashtable_free(HASHTABLE *tab) {
    HASHTABLE t = *tab;
    toku_hashtable_clear(t);
    toku_free(t->array);
    toku_free_n(t, sizeof(*t));
    *tab=0;
}
/* Remove every entry from the table: free all elements and duplicate-list
   nodes, null out every bucket, and reset the entry count to zero.
   The bucket array itself is kept at its current size. */
void toku_hashtable_clear(HASHTABLE tab) {
    unsigned int b;
    for (b=0; b<tab->arraysize; b++) {
        HASHDUP dup;
        HASHDUP nextdup;
        for (dup = tab->array[b]; dup; dup = nextdup) {
            /* Read the link before the node is freed. */
            nextdup = dup->next;
            hasheltlist_free(hashelt_list_peek(&dup->kdlist));
            toku_free_n(dup, sizeof *dup);
        }
        tab->array[b]=0;
    }
    tab->n_keys = 0;
}
#ifndef HASHTABLE_H
#define HASHTABLE_H
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include "brttypes.h"
/* Hash table with chaining. */
/* The keys and values are byte sequences. */
/* The keys and values are malloc'd by the hashtable. */
/* Duplicate keys are allowed by default and are stored in a FIFO list */
/* Opaque-style handle to a hash table. */
typedef struct hashtable *HASHTABLE;
/* Create an empty table that allows duplicates and store it in the argument.
   Returns 0 on success, -1 if the table cannot be allocated. */
int toku_hashtable_create (HASHTABLE*);
/* Configure the hash table for duplicate keys.
allow_dups != 0 -> duplications allowed, allow_dups == 0 -> no duplicates.
Always returns 0. */
int toku_hashtable_set_dups (HASHTABLE, unsigned int allow_dups);
/* Return 0 if the key is found in the hashtable, -1 otherwise.
   When the key has duplicates, the first (oldest) entry is reported. */
/* Warning: The data returned points to the internals of the hashtable. It is set to "const" to try to prevent you from messing it up. */
int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type);
/* Match on key, then index into that key's duplicates (idx 0 is the oldest).
   Returns 0 on success, -1 if the key is absent, -2 if the key has no
   duplicate at position idx.  The data pointer refers to hashtable internals. */
int toku_hash_find_idx (HASHTABLE tab, bytevec key, ITEMLEN keylen, int idx, bytevec *data, ITEMLEN *datalen, int *type);
/* Insert the key/data pair into the hash table.
If the key is not in the hash table then insert it.
If the key already exists and duplicates are allowed then append it to the list of duplicates.
If the key already exists and duplicates are not allowed then return an error (BRT_ALREADY_THERE).
The key and data bytes are copied into the table. */
int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type);
/* Delete the first entry with the given key.
It is OK to delete something that isn't there; DB_NOTFOUND is returned in that case. */
int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen);
/* Delete all entries with the given key.  Returns DB_NOTFOUND if the key is absent. */
int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen);
/* Free all entries, the bucket array and the table itself, then set *tab to null. */
void toku_hashtable_free(HASHTABLE *tab);
/* Number of key/data pairs currently stored. */
int toku_hashtable_n_entries(HASHTABLE);
/* Remove and free all entries; the table remains usable. */
void toku_hashtable_clear(HASHTABLE);
/* Report the first entry of the first non-empty bucket found when scanning
   circularly from bucket (*randomnumber % arraysize).  The chosen bucket
   index is written back to *randomnumber.  Returns 0 on success, -1 if the
   table has no entries.  Key and data point to hashtable internals. */
int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen, int *type, long int *randomnumber);
//int hashtable_find_last(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytevec *data, ITEMLEN *datalen);
/* One stored key/data pair.  The header and the key and value bytes are
   allocated together as a single block. */
typedef struct hashelt *HASHELT;
struct hashelt {
HASHELT next; /* next pair in the same duplicate list, or null */
unsigned int hash; /* hash of the key, cached at insert time and reused when rehashing */
int type; /* caller-supplied tag, stored and returned unchanged */
ITEMLEN keylen; /* number of key bytes in keyval */
ITEMLEN vallen; /* number of value bytes in keyval */
char keyval[]; /* the first KEYLEN bytes are the key. The next bytes are the value. */
};
/* Singly linked FIFO list of key/data pairs: new pairs are appended at the
   tail and removed from the head.  Both pointers are null when empty. */
struct hashelt_list {
HASHELT head; /* oldest pair, or null */
HASHELT tail; /* newest pair, or null */
};
/* Bucket-chain node.  Each node holds the list of all key/data pairs that
   share one key. */
typedef struct hashdup *HASHDUP;
struct hashdup {
HASHDUP next; /* next node (a different key) in the same bucket, or null */
struct hashelt_list kdlist; /* the pairs stored under this node's key */
};
/* The hash table proper: an array of bucket chains plus bookkeeping. */
struct hashtable {
HASHDUP *array; /* arraysize buckets, each a chain of hashdup nodes (null when empty) */
unsigned int n_keys; /* number of key/data pairs stored, duplicates counted individually */
unsigned int arraysize; /* number of buckets; set from toku_get_prime(primeidx) */
unsigned int primeidx; /* index of the current bucket count in the primes table */
unsigned int allow_dups; /* nonzero: duplicate keys allowed; zero: inserts of existing keys fail */
};
/* Call f once for every key/data pair in the table (duplicates included).
   The final void* argument is passed through to f unchanged.
   You cannot add or delete elements from the hashtable while iterating. */
void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, void*), void*);
// Run BODY once for every key/data pair in TABLE, duplicates included.
// Inside BODY the five named variables are in scope:
//   keyvar (const char *), keylenvar (ITEMLEN), datavar (const char *),
//   datalenvar (ITEMLEN) and typevar (int).
// The pointers refer to the hashtable's own storage.
// TABLE is evaluated exactly once, so any expression of type HASHTABLE may be
// passed (for example "&some_struct" or an expression with side effects).
// You cannot add or delete elements from the hashtable inside BODY.
// The names hi_table, hi_counter, hi_dup and hi_he are used internally.
// If you don't want to use something, do something like use "key __attribute__((__unused__))" for keyvar.
#define HASHTABLE_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,typevar,body) ({ \
    HASHTABLE hi_table = (table); \
    unsigned int hi_counter; \
    for (hi_counter=0; hi_counter<hi_table->arraysize; hi_counter++) { \
        HASHDUP hi_dup; \
        for (hi_dup=hi_table->array[hi_counter]; hi_dup; hi_dup=hi_dup->next) { \
            HASHELT hi_he; \
            for (hi_he=hi_dup->kdlist.head; hi_he; hi_he=hi_he->next) { \
                const char *keyvar = &hi_he->keyval[0]; \
                ITEMLEN keylenvar = hi_he->keylen; \
                const char *datavar = &hi_he->keyval[hi_he->keylen]; \
                ITEMLEN datalenvar = hi_he->vallen; \
                int typevar = hi_he->type; \
                body; \
            }}}})
#endif
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2007 Tokutek Inc. All rights reserved."
#include "key.h"
#include "hashtable.h"
#include "memory.h"
#include "primes.h"
#include <stdlib.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
/* Check one key/data pair produced by the tests: the key must be "k<num>"
   and the data "d<num>" with the same digits, both lengths must include the
   terminating NUL, and <num> must appear in data[0..N-1] at a position that
   has not been seen yet.  Marks that position in saw[].
   Aborts with a message if <num> is not in data[]. */
void verify_hash_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl,
                           int N, int *data, char *saw) {
    char *keystr = (char*)kv_v;
    char *datastr = (char*)dv_v;
    assert(keystr[0]=='k');
    assert(datastr[0]=='d');
    assert(strcmp(keystr+1, datastr+1)==0);
    assert(strlen(keystr)+1==kl);
    assert(strlen(datastr)+1==dl);
    int num = atoi(keystr+1);
    int pos = 0;
    while (pos<N && data[pos]!=num) {
        pos++;
    }
    if (pos>=N) {
        fprintf(stderr, "%s isn't there\n", keystr); abort();
    }
    assert(!saw[pos]);
    saw[pos]=1;
}
/* Check one key/data pair visited while iterating the table under test:
   the key must be "k<num>" and the data "d<num>" with the same digits, both
   lengths must include the terminating NUL, the type tag must be 0, and
   <num> must appear in data[0..N-1] at a position not seen before.
   Marks that position in saw[].  Aborts with a message if <num> is not in
   data[]. */
void verify_htable_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl, int type,
                             int N, int *data, char *saw) {
    char *keystr = (char*)kv_v;
    char *datastr = (char*)dv_v;
    assert(keystr[0]=='k');
    assert(datastr[0]=='d');
    assert(strcmp(keystr+1, datastr+1)==0);
    assert(strlen(keystr)+1==kl);
    assert(strlen(datastr)+1==dl);
    assert(type == 0);
    int num = atoi(keystr+1);
    int pos = 0;
    while (pos<N && data[pos]!=num) {
        pos++;
    }
    if (pos>=N) {
        fprintf(stderr, "%s isn't there\n", keystr); abort();
    }
    assert(!saw[pos]);
    saw[pos]=1;
}
/* Verify that the table holds exactly the N numbers in data[]:
   clear saw[], let verify_htable_instance tick off every pair found while
   iterating, then require that every position was ticked. */
void verify_htable (HASHTABLE htable, int N, int *data, char *saw) {
    int pos = 0;
    while (pos<N) {
        saw[pos++]=0;
    }
    HASHTABLE_ITERATE(htable, kv, kl, dv, dl, type,
                      verify_htable_instance (kv, kl, dv, dl, type,
                                              N, data, saw));
    for (pos=0; pos<N; pos++) {
        assert(saw[pos]);
    }
}
/* Randomized insertion test.
   Runs n_ops rounds.  In each round a random draw selects "delete" (not
   implemented), "insert" or "look up" (not implemented).  An insert adds
   a fresh pair "k<num>" / "d<num>" for a random number not used before.
   After every round the full table contents are verified against the
   numbers inserted so far. */
void test0 (void) {
    int r, i, j;
    HASHTABLE htable;
    int n_ops=1000;
    int *data=malloc(sizeof(*data)*n_ops);
    char*saw =malloc(sizeof(*saw)*n_ops);
    int data_n = 0;
    assert(data!=0);
    assert(saw!=0);
    r = toku_hashtable_create(&htable); assert(r==0);
    assert(toku_hashtable_n_entries(htable)==0);
    for (i=0; i<n_ops; i++) {
        if (random()%4==1) {
            // Delete something random
        } else if (random()%2 == 0) {
            // Insert something
            int ra;
            int already_used;
            char kv[100], dv[100];
            /* Keep drawing until the number has not been inserted before. */
            do {
                ra = random()%(1<<30);
                already_used = 0;
                for (j=0; j<data_n; j++) {
                    if (ra==data[j]) {
                        already_used = 1;
                        break;
                    }
                }
            } while (already_used);
            snprintf(kv, sizeof kv, "k%d", ra);
            snprintf(dv, sizeof dv, "d%d", ra);
            r = toku_hash_insert(htable, kv, strlen(kv)+1, dv, strlen(dv)+1, 0);
            assert(r==0);
            data[data_n++]=ra;
        } else {
            // Look up something
        }
        verify_htable(htable, data_n, data, saw);
    }
    toku_hashtable_free(&htable);
    free(data);
    free(saw);
}
/* Insert/random-pick/delete test.
   One hundred times: insert four pairs with random keys, then four times
   pick an entry with toku_hashtable_random_pick and delete it by key.
   Every pick and every delete must succeed. */
void test1(void) {
    HASHTABLE table;
    int round, r;
    r = toku_hashtable_create(&table); assert(r==0);
    for (round=0; round<100; round++) {
        char keys[4][100], vals[4][100];
        int slot;
        for (slot=0; slot<4; slot++) {
            snprintf(keys[slot], 100, "k%ld", (long)(random()));
            snprintf(vals[slot], 100, "v%d", slot);
            toku_hash_insert(table, keys[slot], strlen(keys[slot])+1, vals[slot], strlen(vals[slot])+1, 0);
        }
        for (slot=0; slot<4; slot++) {
            bytevec picked_key, picked_val;
            ITEMLEN picked_keylen, picked_vallen;
            int picked_type;
            long int randnum=random();
            r = toku_hashtable_random_pick(table, &picked_key, &picked_keylen, &picked_val, &picked_vallen, &picked_type, &randnum);
            assert(r==0);
            r = toku_hash_delete(table, picked_key, picked_keylen);
            assert(r==0);
        }
    }
    toku_hashtable_free(&table);
}
/* Test a table configured to reject duplicates.
   Inserts n distinct keys (htonl(i)) with value i and type i, checks that
   each can be found with the right length, type and value, then checks that
   inserting each key a second time fails.  Finally frees the table and
   checks that the handle was nulled. */
void test_insert_nodup(int n) {
    HASHTABLE t;
    int r = toku_hashtable_create(&t);
    assert(r == 0);
    toku_hashtable_set_dups(t, 0);
    int keys[n], vals[n];
    int i;
    /* Phase 1: every first insert succeeds. */
    for (i=0; i<n; i++) {
        keys[i] = htonl(i);
        vals[i] = i;
        r = toku_hash_insert(t, &keys[i], sizeof keys[i], &vals[i], sizeof vals[i], i);
        assert(r == 0);
    }
    /* Phase 2: every key is found with its own value and type. */
    for (i=0; i<n; i++) {
        bytevec found; ITEMLEN foundlen; int foundtype;
        int vv;
        r = toku_hash_find(t, &keys[i], sizeof keys[i], &found, &foundlen, &foundtype);
        assert(r == 0);
        assert(foundlen == sizeof vals[i]);
        assert(foundtype == i);
        memcpy(&vv, found, foundlen);
        assert(vv == vals[i]);
    }
    /* Phase 3: inserting the same keys again must be refused. */
    for (i=0; i<n; i++) {
        keys[i] = htonl(i);
        vals[i] = i;
        r = toku_hash_insert(t, &keys[i], sizeof keys[i], &vals[i], sizeof vals[i], i);
        assert(r != 0);
    }
    toku_hashtable_free(&t);
    assert(t == 0);
}
/* Test a table that allows duplicates.
   Inserts n distinct keys, then n entries that all share one extra key
   (htonl(n + n/2)) with values and types 0..n-1.  Checks the distinct keys,
   then repeatedly finds the first entry under the shared key, checks that
   entries come back in insertion order, and deletes -- either one entry at
   a time or all at once when do_delete_all is set.  The loop must run n
   times with single deletes and exactly once with delete-all. */
void test_insert_dup(int n, int do_delete_all) {
    HASHTABLE t;
    int r = toku_hashtable_create(&t);
    assert(r == 0);
    toku_hashtable_set_dups(t, 1);
    int keys[n], vals[n];
    int dupkey = n + n/2;
    int i;
    /* n distinct keys ... */
    for (i=0; i<n; i++) {
        keys[i] = htonl(i);
        vals[i] = i;
        r = toku_hash_insert(t, &keys[i], sizeof keys[i], &vals[i], sizeof vals[i], i);
        assert(r == 0);
    }
    /* ... plus n entries under one shared key. */
    for (i=0; i<n; i++) {
        int key = htonl(dupkey);
        int val = i;
        r = toku_hash_insert(t, &key, sizeof key, &val, sizeof val, i);
        assert(r == 0);
    }
    for (i=0; i<n; i++) {
        bytevec found; ITEMLEN foundlen; int foundtype;
        int vv;
        r = toku_hash_find(t, &keys[i], sizeof keys[i], &found, &foundlen, &foundtype);
        assert(r == 0);
        assert(foundlen == sizeof vals[i]);
        assert(foundtype == i);
        memcpy(&vv, found, foundlen);
        assert(vv == vals[i]);
    }
    /* Drain the shared key, counting how many rounds it takes. */
    int rounds = 0;
    for (;;) {
        int key = htonl(dupkey);
        bytevec found; ITEMLEN foundlen; int foundtype;
        int vv;
        r = toku_hash_find(t, &key, sizeof key, &found, &foundlen, &foundtype);
        if (r != 0) break;
        assert(foundlen == sizeof vals[rounds]);
        assert(foundtype == rounds);
        memcpy(&vv, found, foundlen);
        assert(vv == vals[rounds]);
        r = do_delete_all ? toku_hash_delete_all(t, &key, sizeof key)
                          : toku_hash_delete(t, &key, sizeof key);
        assert(r == 0);
        rounds++;
    }
    assert(rounds == (do_delete_all ? 1 : n));
    toku_hashtable_free(&t);
    assert(t == 0);
}
/* Test driver: runs the primes self-test, then each hashtable test in turn
   (the duplicate test once with single deletes and once with delete-all),
   and finally the allocator's cleanup routine.  Command-line arguments are
   ignored. */
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
    const int n = 1000;
    int do_delete_all;
    toku_test_primes();
    test0();
    test1();
    test_insert_nodup(n);
    for (do_delete_all = 0; do_delete_all <= 1; do_delete_all++) {
        test_insert_dup(n, do_delete_all);
    }
    toku_malloc_cleanup();
    return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment