Commit 6667bb28 authored by Rich Prohaska

add dupsort insert, delete, lookup

git-svn-id: file:///svn/tokudb@634 c7de825b-a66e-492c-adef-691d508d4ae1
parent 0da80e56
This diff is collapsed.
...@@ -12,6 +12,46 @@ ...@@ -12,6 +12,46 @@
#include "yerror.h" #include "yerror.h"
#include "hashfun.h" #include "hashfun.h"
/* Consistency check for a hashelt list: walks the chain starting at head and
 * returns nonzero when the recorded tail is the last element reached (or when
 * both head and tail are 0 for an empty list), zero otherwise.
 * The walk visits every element, so the cost grows with the list length. */
static int hashelt_list_verify(struct hashelt_list *helist) {
    HASHELT last = 0;
    HASHELT cur;
    for (cur = helist->head; cur != 0; cur = cur->next)
        last = cur;
    return helist->tail == last;
}
/* Reset a list to the empty state: both head and tail become 0. */
static inline void hashelt_list_init(struct hashelt_list *helist) {
    helist->head = 0;
    helist->tail = 0;
}
/* Append element e at the tail of the list.
 * e->next is overwritten with 0, so e becomes the new last element; when the
 * list was empty e also becomes the head.
 * The list is checked with hashelt_list_verify before and after the update. */
static inline void hashelt_list_append(struct hashelt_list *helist, HASHELT e) {
    assert(hashelt_list_verify(helist));
    HASHELT old_tail = helist->tail;
    e->next = 0;
    if (old_tail == 0)
        helist->head = e;       /* first element of an empty list */
    else
        old_tail->next = e;     /* link behind the previous last element */
    helist->tail = e;
    assert(hashelt_list_verify(helist));
}
/* Detach and return the element at the head of the list, or return 0 when the
 * list is empty. The tail is cleared when the last element is removed.
 * The returned element's next pointer is left untouched.
 * The list is checked with hashelt_list_verify on entry and after a removal. */
static inline HASHELT hashelt_list_pop(struct hashelt_list *helist) {
    assert(hashelt_list_verify(helist));
    HASHELT first = helist->head;
    if (first == 0)
        return first;           /* nothing to pop */
    helist->head = first->next;
    if (helist->head == 0)
        helist->tail = 0;       /* list is now empty */
    assert(hashelt_list_verify(helist));
    return first;
}
/* Return the element at the head of the list without removing it
 * (0 when the list is empty). */
static inline HASHELT hashelt_list_peek(struct hashelt_list *helist) {
    HASHELT first = helist->head;
    return first;
}
int toku_hashtable_create (HASHTABLE *h) { int toku_hashtable_create (HASHTABLE *h) {
HASHTABLE MALLOC(tab); HASHTABLE MALLOC(tab);
unsigned int i; unsigned int i;
...@@ -22,31 +62,39 @@ int toku_hashtable_create (HASHTABLE *h) { ...@@ -22,31 +62,39 @@ int toku_hashtable_create (HASHTABLE *h) {
assert(sizeof(*tab->array)==sizeof(void*)); assert(sizeof(*tab->array)==sizeof(void*));
tab->array = toku_calloc(tab->arraysize, sizeof(*tab->array)); tab->array = toku_calloc(tab->arraysize, sizeof(*tab->array));
for (i=0; i<tab->arraysize; i++) tab->array[i]=0; for (i=0; i<tab->arraysize; i++) tab->array[i]=0;
tab->allow_dups = 1;
*h=tab; *h=tab;
return 0; return 0;
} }
static void hash_find_internal (HASHTABLE tab, unsigned int hash, const unsigned char *key, ITEMLEN keylen, HASHELT *hashelt, HASHELT **prev_ptr) { int toku_hashtable_set_dups (HASHTABLE tab, unsigned int allow_dups) {
tab->allow_dups = allow_dups;
return 0;
}
static void hash_find_internal (HASHTABLE tab, unsigned int hash, const unsigned char *key, ITEMLEN keylen, HASHDUP *dup_ptr, HASHDUP **prev_ptr) {
unsigned int h = hash % tab->arraysize; unsigned int h = hash % tab->arraysize;
HASHELT he; HASHDUP dup;
HASHELT *prev = &tab->array[h]; HASHDUP *prev = &tab->array[h];
for (he=*prev; he; prev=&he->next, he=*prev) { for (dup=*prev; dup; prev=&dup->next, dup=*prev) {
HASHELT he = hashelt_list_peek(&dup->kdlist); assert(he);
if (keylen==he->keylen && memcmp(key, he->keyval, keylen)==0) { if (keylen==he->keylen && memcmp(key, he->keyval, keylen)==0) {
*prev_ptr = prev; *prev_ptr = prev;
*hashelt = he; *dup_ptr = dup;
return; return;
} }
} }
*prev_ptr = prev; *prev_ptr = prev;
*hashelt = 0; *dup_ptr = 0;
} }
int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type) { int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec *data, ITEMLEN *datalen, int *type) {
HASHELT he, *prev_ptr; HASHDUP dup, *prev;
hash_find_internal(tab, hash_key (key, keylen), key, keylen, &he, &prev_ptr); hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev);
if (he==0) { if (dup==0) {
return -1; return -1;
} else { } else {
HASHELT he = dup->kdlist.head;
*data = &he->keyval[he->keylen]; *data = &he->keyval[he->keylen];
*datalen = he->vallen; *datalen = he->vallen;
*type = he->type; *type = he->type;
...@@ -58,21 +106,22 @@ int toku_hash_rehash_everything (HASHTABLE tab, unsigned int primeindexdelta) { ...@@ -58,21 +106,22 @@ int toku_hash_rehash_everything (HASHTABLE tab, unsigned int primeindexdelta) {
int newprimeindex = primeindexdelta+tab->primeidx; int newprimeindex = primeindexdelta+tab->primeidx;
assert(newprimeindex>=0); assert(newprimeindex>=0);
unsigned int newarraysize = get_prime(newprimeindex); unsigned int newarraysize = get_prime(newprimeindex);
HASHELT *newarray = toku_calloc(newarraysize, sizeof(*tab->array)); HASHDUP *newarray = toku_calloc(newarraysize, sizeof(*tab->array));
unsigned int i; unsigned int i;
//printf("%s:%d newarraysize=%d\n", __FILE__, __LINE__, newarraysize); //printf("%s:%d newarraysize=%d\n", __FILE__, __LINE__, newarraysize);
assert(newarray!=0); assert(newarray!=0);
tab->primeidx=newprimeindex; tab->primeidx=newprimeindex;
for (i=0; i<newarraysize; i++) newarray[i]=0; for (i=0; i<newarraysize; i++) newarray[i]=0;
for (i=0; i<tab->arraysize; i++) { for (i=0; i<tab->arraysize; i++) {
HASHELT he; HASHDUP dup;
while ((he=tab->array[i])!=0) { while ((dup=tab->array[i])!=0) {
HASHELT he = hashelt_list_peek(&dup->kdlist); assert(he);
//unsigned int hk = hash_key((unsigned char *)he->key, he->keylen); //unsigned int hk = hash_key((unsigned char *)he->key, he->keylen);
unsigned int h = he->hash%newarraysize; unsigned int h = he->hash%newarraysize;
//assert(he->hash==hk); //assert(he->hash==hk);
tab->array[i] = he->next; tab->array[i] = dup->next;
he->next = newarray[h]; dup->next = newarray[h];
newarray[h] = he; newarray[h] = dup;
} }
} }
toku_free(tab->array); toku_free(tab->array);
...@@ -87,46 +136,82 @@ int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void ...@@ -87,46 +136,82 @@ int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void
{ {
unsigned int hk = hash_key (key,keylen); unsigned int hk = hash_key (key,keylen);
unsigned int h = hk%tab->arraysize; unsigned int h = hk%tab->arraysize;
{ HASHDUP dup,*prev_ptr;
HASHELT he,*prev_ptr; hash_find_internal(tab, hk, key, keylen, &dup, &prev_ptr);
hash_find_internal(tab, hk, key, keylen, &he, &prev_ptr); if (dup == 0) {
if (he!=0) { dup = toku_malloc(sizeof *dup);
return BRT_ALREADY_THERE; assert(dup);
} hashelt_list_init(&dup->kdlist);
dup->next = tab->array[h];
tab->array[h]=dup;
} else if (!tab->allow_dups)
return BRT_ALREADY_THERE;
HASHELT he=toku_malloc(sizeof(*he)+keylen+vallen);
assert(he); // ?????
he->type = type;
he->keylen = keylen;
he->vallen = vallen;
memmove(&he->keyval[0], key, keylen);
memmove(&he->keyval[keylen], val, vallen);
he->hash = hk;
hashelt_list_append(&dup->kdlist, he);
tab->n_keys++;
if (tab->n_keys > tab->arraysize) {
return toku_hash_rehash_everything(tab, +1);
} }
{ return BRT_OK;
/* Otherwise the key is not already present, so we need to add it. */ }
HASHELT he=toku_malloc(sizeof(*he)+keylen+vallen);
assert(he); // ????? int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen) {
he->type = type; HASHDUP dup, *prev_ptr;
he->keylen = keylen; //printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize);
he->vallen = vallen; hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr);
memmove(&he->keyval[0], key, keylen); if (dup==0) return DB_NOTFOUND;
memmove(&he->keyval[keylen], val, vallen); else {
assert(*prev_ptr==dup);
he->hash = hk;
he->next = tab->array[h]; HASHELT he = hashelt_list_pop(&dup->kdlist);
tab->array[h]=he; assert(he);
tab->n_keys++; //printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val);
if (tab->n_keys > tab->arraysize) { toku_free_n(he, sizeof(*he)+he->keylen+he->vallen);
return toku_hash_rehash_everything(tab, +1); tab->n_keys--;
if (!hashelt_list_peek(&dup->kdlist)) {
/* delete the dups from the hash list */
*prev_ptr = dup->next;
toku_free_n(dup, sizeof *dup);
}
if ((tab->n_keys * 4 < tab->arraysize) && tab->primeidx>0) {
return toku_hash_rehash_everything(tab, -1);
} }
return BRT_OK; return BRT_OK;
} }
} }
int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen) { int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen) {
HASHELT he, *prev_ptr; HASHDUP dup, *prev_ptr;
//printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize); //printf("%s:%d deleting %s (bucket %d)\n", __FILE__, __LINE__, key, hash_key(key,keylen)%tab->arraysize);
hash_find_internal(tab, hash_key (key, keylen), key, keylen, &he, &prev_ptr); hash_find_internal(tab, hash_key (key, keylen), key, keylen, &dup, &prev_ptr);
if (he==0) return DB_NOTFOUND; if (dup==0) return DB_NOTFOUND;
else { else {
//printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val); assert(*prev_ptr==dup);
assert(*prev_ptr==he); /* delete the dups from the hash list */
*prev_ptr = he->next; *prev_ptr = dup->next;
//printf("Freeing %s %s\n", he->key, he->val);
toku_free_n(he, sizeof(*he)+he->keylen+he->vallen); /* delete all of the kd pairs in the dup list */
tab->n_keys--; HASHELT he;
while ((he = hashelt_list_pop(&dup->kdlist)) != 0 ) {
//printf("%s:%d deleting %s %s\n", __FILE__, __LINE__, he->key, he->val);
toku_free_n(he, sizeof(*he)+he->keylen+he->vallen);
tab->n_keys--;
}
toku_free_n(dup, sizeof *dup);
if ((tab->n_keys * 4 < tab->arraysize) && tab->primeidx>0) { if ((tab->n_keys * 4 < tab->arraysize) && tab->primeidx>0) {
return toku_hash_rehash_everything(tab, -1); return toku_hash_rehash_everything(tab, -1);
...@@ -141,8 +226,9 @@ int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytev ...@@ -141,8 +226,9 @@ int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytev
unsigned int usei = (*randomnumber)%h->arraysize; unsigned int usei = (*randomnumber)%h->arraysize;
for (i=0; i<h->arraysize; i++, usei++) { for (i=0; i<h->arraysize; i++, usei++) {
if (usei>=h->arraysize) usei=0; if (usei>=h->arraysize) usei=0;
HASHELT he=h->array[usei]; HASHDUP dup=h->array[usei];
if (he) { if (dup) {
HASHELT he = dup->kdlist.head; assert(he);
*key = &he->keyval[0]; *key = &he->keyval[0];
*keylen = he->keylen; *keylen = he->keylen;
*data = &he->keyval[he->keylen]; *data = &he->keyval[he->keylen];
...@@ -220,7 +306,13 @@ void toku_hashtable_free(HASHTABLE *tab) { ...@@ -220,7 +306,13 @@ void toku_hashtable_free(HASHTABLE *tab) {
void toku_hashtable_clear(HASHTABLE tab) { void toku_hashtable_clear(HASHTABLE tab) {
unsigned int i; unsigned int i;
for (i=0; i<tab->arraysize; i++) { for (i=0; i<tab->arraysize; i++) {
hasheltlist_free(tab->array[i]); HASHDUP dup = tab->array[i];
while (dup) {
HASHDUP nextdup = dup->next;
hasheltlist_free(hashelt_list_peek(&dup->kdlist));
toku_free_n(dup, sizeof *dup);
dup = nextdup;
}
tab->array[i]=0; tab->array[i]=0;
} }
tab->n_keys = 0; tab->n_keys = 0;
......
...@@ -5,20 +5,37 @@ ...@@ -5,20 +5,37 @@
/* Hash table with chaining. */ /* Hash table with chaining. */
/* The keys and values are byte sequences. */ /* The keys and values are byte sequences. */
/* The keys and values are malloc'd by the hashtable. */ /* The keys and values are malloc'd by the hashtable. */
/* Duplicate keys are allowed by default and are stored in a FIFO list */
typedef struct hashtable *HASHTABLE; typedef struct hashtable *HASHTABLE;
int toku_hashtable_create (HASHTABLE*); int toku_hashtable_create (HASHTABLE*);
/* Configure the hash table for duplicate keys.
allow_dups != 0 -> duplicates allowed, allow_dups == 0 -> no duplicates */
int toku_hashtable_set_dups (HASHTABLE, unsigned int allow_dups);
/* Return 0 if the key is found in the hashtable, -1 otherwise. */ /* Return 0 if the key is found in the hashtable, -1 otherwise. */
/* Warning: The data returned points to the internals of the hashtable. It is set to "const" to try to prevent you from messing it up. */ /* Warning: The data returned points to the internals of the hashtable. It is set to "const" to try to prevent you from messing it up. */
int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen, int *type); int toku_hash_find (HASHTABLE tab, bytevec key, ITEMLEN keylen, bytevec*data, ITEMLEN *datalen, int *type);
/* Replace the key if it was already there. */ /* Insert the key/data pair into the hash table.
If the key is not in the hash table then insert it.
If the key already exists and duplicates are allowed then append it to the list of duplicates.
If the key already exists and duplicates are not allowed then return an error */
int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type); int toku_hash_insert (HASHTABLE tab, const void *key, ITEMLEN keylen, const void *data, ITEMLEN datalen, int type);
/* It is OK to delete something that isn't there. */ /* Delete the first entry with the given key
It is OK to delete something that isn't there. */
int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen); int toku_hash_delete (HASHTABLE tab, const void *key, ITEMLEN keylen);
/* Delete all entries with the given key */
int toku_hash_delete_all (HASHTABLE tab, const void *key, ITEMLEN keylen);
void toku_hashtable_free(HASHTABLE *tab); void toku_hashtable_free(HASHTABLE *tab);
int toku_hashtable_n_entries(HASHTABLE); int toku_hashtable_n_entries(HASHTABLE);
...@@ -29,35 +46,50 @@ int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytev ...@@ -29,35 +46,50 @@ int toku_hashtable_random_pick(HASHTABLE h, bytevec *key, ITEMLEN *keylen, bytev
typedef struct hashelt *HASHELT; typedef struct hashelt *HASHELT;
struct hashelt { struct hashelt {
unsigned int hash;
HASHELT next; HASHELT next;
unsigned int hash;
int type; int type;
ITEMLEN keylen; ITEMLEN keylen;
ITEMLEN vallen; ITEMLEN vallen;
char keyval[]; /* the first KEYLEN bytes are the key. The next bytes are the value. */ char keyval[]; /* the first KEYLEN bytes are the key. The next bytes are the value. */
}; };
/* Singly linked list of hash elements, chained through hashelt.next.
   Elements are appended at the tail and popped from the head (FIFO order).
   Both pointers are 0 when the list is empty. */
struct hashelt_list {
/* first element of the list; the one returned by peek and pop */
HASHELT head;
/* last element of the list; new elements are linked after it */
HASHELT tail;
};
/* One entry in a hash bucket chain: groups all key/data elements that share
   the same key. */
typedef struct hashdup *HASHDUP;
struct hashdup {
/* next entry in the same bucket chain (0 at the end of the chain) */
HASHDUP next;
/* the key/data elements stored under this key, in insertion order */
struct hashelt_list kdlist;
};
struct hashtable { struct hashtable {
HASHDUP *array;
unsigned int n_keys; unsigned int n_keys;
unsigned int arraysize; unsigned int arraysize;
unsigned int primeidx; unsigned int primeidx;
HASHELT *array; unsigned int allow_dups;
}; };
/* You cannot add or delete elements from the hashtable while iterating. */ /* You cannot add or delete elements from the hashtable while iterating. */
void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, void*), void*); void toku_hashtable_iterate (HASHTABLE tab, void(*f)(bytevec key,ITEMLEN keylen,bytevec data,ITEMLEN datalen,int type, void*), void*);
// If you don't want to use something, do something like use "key __attribute__((__unused__))" for keyvar. // If you don't want to use something, do something like use "key __attribute__((__unused__))" for keyvar.
#define HASHTABLE_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,typevar,body) ({ \ #define HASHTABLE_ITERATE(table,keyvar,keylenvar,datavar,datalenvar,typevar,body) ({ \
unsigned int hi_counter; \ unsigned int hi_counter; \
for (hi_counter=0; hi_counter<table->arraysize; hi_counter++) { \ for (hi_counter=0; hi_counter<table->arraysize; hi_counter++) { \
HASHELT hi_he; \ HASHDUP hi_dup; \
for (hi_he=table->array[hi_counter]; hi_he; hi_he=hi_he->next) { \ for (hi_dup=table->array[hi_counter]; hi_dup; hi_dup=hi_dup->next) { \
const char *keyvar = &hi_he->keyval[0]; \ HASHELT hi_he; \
ITEMLEN keylenvar = hi_he->keylen; \ for (hi_he=hi_dup->kdlist.head; hi_he; hi_he=hi_he->next) { \
const char *datavar = &hi_he->keyval[hi_he->keylen]; \ const char *keyvar = &hi_he->keyval[0]; \
ITEMLEN datalenvar = hi_he->vallen; \ ITEMLEN keylenvar = hi_he->keylen; \
int typevar = hi_he->type; \ const char *datavar = &hi_he->keyval[hi_he->keylen]; \
body; \ ITEMLEN datalenvar = hi_he->vallen; \
}}}) int typevar = hi_he->type; \
body; \
}}}})
#endif #endif
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <string.h> #include <string.h>
#include <arpa/inet.h>
void verify_hash_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl, void verify_hash_instance (bytevec kv_v, ITEMLEN kl, bytevec dv_v, ITEMLEN dl,
int N, int *data, char *saw) { int N, int *data, char *saw) {
...@@ -139,10 +140,122 @@ void test1(void) { ...@@ -139,10 +140,122 @@ void test1(void) {
toku_hashtable_free(&table); toku_hashtable_free(&table);
} }
/* Exercise a table configured to reject duplicate keys.
 *   1. insert n distinct keys (network-order encodings of 0..n-1), each must succeed;
 *   2. look every key up and check the stored value and type;
 *   3. insert the same keys again, each attempt must fail.
 * Finally free the table and check that the handle was cleared. */
void test_insert_nodup(int n) {
    HASHTABLE table;
    int rc = toku_hashtable_create(&table);
    assert(rc == 0);

    /* 0 -> duplicate keys are not allowed */
    toku_hashtable_set_dups(table, 0);

    int keys[n], vals[n];
    int k;

    /* phase 1: populate with distinct keys */
    for (k = 0; k < n; k++) {
        keys[k] = htonl(k);
        vals[k] = k;
        rc = toku_hash_insert(table, &keys[k], sizeof keys[k], &vals[k], sizeof vals[k], k);
        assert(rc == 0);
    }

    /* phase 2: every key must be found with the value and type it was stored with */
    for (k = 0; k < n; k++) {
        bytevec found;
        ITEMLEN foundlen;
        int foundtype;
        rc = toku_hash_find(table, &keys[k], sizeof keys[k], &found, &foundlen, &foundtype);
        assert(rc == 0);
        assert(foundlen == sizeof vals[k]);
        assert(foundtype == k);
        int stored;
        memcpy(&stored, found, foundlen);
        assert(stored == vals[k]);
    }

    /* phase 3: re-inserting an existing key must be rejected */
    for (k = 0; k < n; k++) {
        keys[k] = htonl(k);
        vals[k] = k;
        rc = toku_hash_insert(table, &keys[k], sizeof keys[k], &vals[k], sizeof vals[k], k);
        assert(rc != 0);
    }

    toku_hashtable_free(&table);
    assert(table == 0);
}
/* Exercise a table configured to accept duplicate keys.
 *
 * n distinct keys (network-order encodings of 0..n-1) are inserted, followed by
 * n entries that all share one extra key (dupkey, chosen outside 0..n-1) with
 * values and types 0..n-1. The distinct keys are then looked up and verified.
 * Finally the duplicates are drained: each lookup of dupkey must return the
 * oldest remaining duplicate (value and type equal to the number of deletes
 * performed so far), after which the entry is removed.
 *
 * n             - number of distinct keys and number of duplicates to insert
 * do_delete_all - nonzero: drain with toku_hash_delete_all, so one iteration
 *                 empties the key; zero: drain with toku_hash_delete, one
 *                 duplicate per iteration, n iterations in total
 */
void test_insert_dup(int n, int do_delete_all) {
    HASHTABLE t;
    int r;

    r = toku_hashtable_create(&t);
    assert(r == 0);

    toku_hashtable_set_dups(t, 1);

    int keys[n], vals[n];
    int dupkey = n + n/2;   /* lies outside 0..n-1, so it never collides with keys[] */
    int i;

    /* distinct keys */
    for (i=0; i<n; i++) {
        keys[i] = htonl(i);
        vals[i] = i;
        r = toku_hash_insert(t, &keys[i], sizeof keys[i], &vals[i], sizeof vals[i], i);
        assert(r == 0);
    }

    /* n entries under the same key; all inserts must succeed */
    for (i=0; i<n; i++) {
        int key = htonl(dupkey);
        int val = i;
        r = toku_hash_insert(t, &key, sizeof key, &val, sizeof val, i);
        assert(r == 0);
    }

    /* the distinct keys are unaffected by the duplicates */
    for (i=0; i<n; i++) {
        bytevec data; ITEMLEN datalen; int type;
        r = toku_hash_find(t, &keys[i], sizeof keys[i], &data, &datalen, &type);
        assert(r == 0);
        assert(datalen == sizeof vals[i]);
        assert(type == i);
        int vv;
        memcpy(&vv, data, datalen);
        assert(vv == vals[i]);
    }

    /* drain the duplicates until the key is no longer found */
    for (i=0; ; i++) {
        int key = htonl(dupkey);
        bytevec data; ITEMLEN datalen; int type;
        r = toku_hash_find(t, &key, sizeof key, &data, &datalen, &type);
        if (r != 0) break;
        /* Only n duplicates were inserted. Fail here rather than index
           vals[] out of bounds if the table returns more than that. */
        assert(i < n);
        assert(datalen == sizeof vals[i]);
        assert(type == i);
        int vv;
        memcpy(&vv, data, datalen);
        assert(vv == vals[i]);
        if (do_delete_all)
            r = toku_hash_delete_all(t, &key, sizeof key);
        else
            r = toku_hash_delete(t, &key, sizeof key);
        assert(r == 0);
    }
    if (do_delete_all)
        assert(i == 1);
    else
        assert(i == n);

    toku_hashtable_free(&t);
    assert(t == 0);
}
int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) { int main (int argc __attribute__((__unused__)), char *argv[] __attribute__((__unused__))) {
test_primes(); test_primes();
test0(); test0();
test1(); test1();
test_insert_nodup(1000);
test_insert_dup(1000, 0);
test_insert_dup(1000, 1);
malloc_cleanup(); malloc_cleanup();
return 0; return 0;
} }
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment