Commit f1331409 authored by Bradley C. Kuszmaul, committed by Yoni Fogel

#3316 #3318 #3339

 * #3316 quicklz is now part of the brt layer (but it won't be enabled for compression in 5.0.2, #3339)
 * #3318 (add valgrind suppressions)
{{{
svn merge -r28917:29103 https://svn.tokutek.com/tokudb/toku/tokudb.3316
}}}
.
[t:3316] [t:3318] [t:3339]


git-svn-id: file:///svn/toku/tokudb@29107 c7de825b-a66e-492c-adef-691d508d4ae1
parent d02f12e5
......@@ -108,7 +108,7 @@ else
NEWBRT_O_FILES = newbrt.o
endif
NEWBRT_O_FILES += brtloader.$(OEXT)
NEWBRT_O_FILES += brtloader.$(OEXT) quicklz.$(OEXT) compress.$(OEXT)
brtloader.$(OEXT): $(DEPEND_COMPILE)
ifeq ($(BRTLOADER),cilk)
......
......@@ -29,6 +29,8 @@
#include "leaflock.h"
#include "c_dialects.h"
// Uncomment the following to use quicklz
C_BEGIN
#ifndef BRT_FANOUT
......
......@@ -197,6 +197,9 @@ enum {
extended_node_header_overhead = (4+ // nodesize
4+ // flags
4), // height
node_tail_overhead = 4, // the x1764 checksum on the entire uncompressed node.
};
#include "sub_block.h"
......@@ -212,7 +215,7 @@ addupsize (OMTVALUE lev, u_int32_t UU(idx), void *vp) {
static unsigned int
toku_serialize_brtnode_size_slow (BRTNODE node) {
unsigned int size = node_header_overhead + extended_node_header_overhead;
unsigned int size = node_header_overhead + extended_node_header_overhead + node_tail_overhead;
if (node->height > 0) {
unsigned int hsize=0;
unsigned int csize=0;
......@@ -251,7 +254,7 @@ toku_serialize_brtnode_size_slow (BRTNODE node) {
// This is the size of the uncompressed data, not including the compression headers
unsigned int
toku_serialize_brtnode_size (BRTNODE node) {
unsigned int result = node_header_overhead + extended_node_header_overhead;
unsigned int result = node_header_overhead + extended_node_header_overhead + node_tail_overhead;
invariant(sizeof(toku_off_t)==8);
if (node->height > 0) {
result += 4; /* n_children */
......@@ -413,6 +416,8 @@ serialize_node(BRTNODE node, char *buf, size_t calculated_size, int n_sub_blocks
else
serialize_leaf(node, n_sub_blocks, sub_block, &wb);
u_int32_t end_to_end_checksum = x1764_memory(buf, calculated_size-4);
wbuf_nocrc_int(&wb, end_to_end_checksum);
invariant(wb.ndone == wb.size);
invariant(calculated_size==wb.ndone);
}
......@@ -694,9 +699,17 @@ deserialize_brtnode_nonleaf_from_rbuf (BRTNODE result, bytevec magic, struct rbu
toku_fifo_size_hint(BNC_BUFFER(result,i), child_buffer_map[i].size);
}
// deserialize all child buffers, like the function says
deserialize_all_child_buffers(result, rb, child_buffer_map, num_cores);
// Must compute the checksum now (rather than at the end, while we still have the pointer to the buffer)
if (result->layout_version_read_from_disk >= BRT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
u_int32_t expected_xsum = toku_dtoh32(*(u_int32_t*)(rb->buf+rb->size-4));
u_int32_t actual_xsum = x1764_memory(rb->buf, rb->size-4);
if (expected_xsum!=actual_xsum) {
return toku_db_badformat();
}
}
return 0;
}
......@@ -792,7 +805,14 @@ deserialize_brtnode_leaf_from_rbuf (BRTNODE result, bytevec magic, struct rbuf *
//toku_verify_counts(result);
if (rb->ndone != rb->size) { //Verify we read exactly the entire block.
if (result->layout_version >= BRT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM) {
u_int32_t expected_xsum = rbuf_int(rb);
u_int32_t actual_xsum = x1764_memory(rb->buf, rb->size-4);
if (expected_xsum!=actual_xsum) {
return toku_db_badformat();
}
}
if (rb->ndone != rb->size) { //Verify we read exactly the entire block, except for the final checksum.
r = toku_db_badformat(); goto died_1;
}
......@@ -833,7 +853,6 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b
rbuf_int(rb); // ignore rand4fingerprint
rbuf_int(rb); // ignore localfingerprint
}
// printf("%s:%d read %08x\n", __FILE__, __LINE__, result->local_fingerprint);
result->dirty = 0;
result->fullhash = fullhash;
//printf("height==%d\n", result->height);
......@@ -844,6 +863,7 @@ deserialize_brtnode_from_rbuf (BLOCKNUM blocknum, u_int32_t fullhash, BRTNODE *b
result->u.l.leaflock_pool = toku_cachefile_leaflock_pool(h->cf);
r = deserialize_brtnode_leaf_from_rbuf(result, magic, rb);
}
if (r!=0) goto died0;
//printf("%s:%d Ok got %lld n_children=%d\n", __FILE__, __LINE__, result->thisnodename, result->n_children);
......
......@@ -17,12 +17,14 @@ enum brt_layout_version_e {
BRT_LAYOUT_VERSION_11 = 11, // Diff from 10 to 11: Nested transaction leafentries (completely redesigned). BRT_CMDs on disk now support XIDS (multiple txnids) instead of exactly one.
BRT_LAYOUT_VERSION_12 = 12, // Diff from 11 to 12: Added BRT_CMD 'BRT_INSERT_NO_OVERWRITE', compressed block format, num old blocks
BRT_LAYOUT_VERSION_13 = 13, // Diff from 12 to 13: Fixed loader pivot bug, added build_id to every node, timestamps to brtheader
BRT_LAYOUT_VERSION_14 = 14, // Diff from 13 to 14: Added MVCC, deprecated TOKU_DB_VALCMP_BUILTIN(_13), Remove fingerprints
BRT_LAYOUT_VERSION_14 = 14, // Diff from 13 to 14: Added MVCC; deprecated TOKU_DB_VALCMP_BUILTIN(_13); Remove fingerprints; Support QUICKLZ; add end-to-end checksum on uncompressed data.
BRT_NEXT_VERSION, // the version after the current version
BRT_LAYOUT_VERSION = BRT_NEXT_VERSION-1, // A hack so I don't have to change this line.
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_13 // Minimum version supported
BRT_LAYOUT_MIN_SUPPORTED_VERSION = BRT_LAYOUT_VERSION_13, // Minimum version supported
// Define these symbolically so the knowledge of exactly which layout version got rid of fingerprints isn't spread all over the code.
BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT = BRT_LAYOUT_VERSION_13,
BRT_FIRST_LAYOUT_VERSION_WITH_END_TO_END_CHECKSUM = BRT_LAYOUT_VERSION_14,
};
// Define this symbolically so the knowledge of exactly which layout version got rid of fingerprints isn't spread all over the code.
#define BRT_LAST_LAYOUT_VERSION_WITH_FINGERPRINT BRT_LAYOUT_VERSION_13
#endif
......@@ -14,7 +14,6 @@
#include <errno.h>
#include <toku_assert.h>
#include <string.h>
#include "zlib.h"
#include <fcntl.h>
#include "x1764.h"
#include "brtloader-internal.h"
......@@ -2850,6 +2849,8 @@ static void finish_leafnode (struct dbout *out, struct leaf_buf *lbuf, int progr
putbuf_int32_at(&lbuf->dbuf, lbuf->partitions_p+12, partition_map.size);
putbuf_int32_at(&lbuf->dbuf, lbuf->n_in_buf_p, lbuf->n_in_buf);
u_int32_t xsum = x1764_memory(lbuf->dbuf.buf, lbuf->dbuf.off);
putbuf_int32(&lbuf->dbuf, xsum);
result = lbuf->dbuf.error;
if (result == 0) {
......
/* -*- mode: C; c-basic-offset: 4 -*- */
#ident "Copyright (c) 2011 Tokutek Inc. All rights reserved."
#ident "$Id$"
#include <zlib.h>
#include "compress.h"
#include "memory.h"
#include "quicklz.h"
#include "toku_assert.h"
size_t toku_compress_bound (enum toku_compression_method a, size_t size)
// See compress.h for the specification of this function.
{
switch (a) {
case TOKU_QUICKLZ_METHOD:
return size+400 + 1; // quicklz manual says 400 bytes is enough. We need one more byte for the rfc1950-style header byte: bits 0-3 are 9, bits 4-7 are the QLZ_COMPRESSION_LEVEL.
case TOKU_ZLIB_METHOD:
return compressBound (size);
}
// fall through for bad enum (thus the compiler can warn us if we didn't use all the enums)
assert(0);
}
static const int zlib_compression_level = 5;
void toku_compress (enum toku_compression_method a,
// the following types and naming conventions come from zlib.h
Bytef *dest, uLongf *destLen,
const Bytef *source, uLong sourceLen)
// See compress.h for the specification of this function.
{
assert(sourceLen < (1LL << 32));
switch (a) {
case TOKU_ZLIB_METHOD: {
int r = compress2(dest, destLen, source, sourceLen, zlib_compression_level);
assert(r == Z_OK);
assert((dest[0]&0xF) == TOKU_ZLIB_METHOD);
return;
}
case TOKU_QUICKLZ_METHOD: {
if (sourceLen==0) {
// quicklz requires at least one byte, so we handle this ourselves
assert(1 <= *destLen);
*destLen = 1;
} else {
qlz_state_compress *MALLOC(qsc);
size_t actual_destlen = qlz_compress(source, (char*)(dest+1), sourceLen, qsc);
assert(actual_destlen +1 <= *destLen);
*destLen = actual_destlen+1; // add one for the rfc1950-style header byte.
toku_free(qsc);
}
// Fill in that first byte
dest[0] = TOKU_QUICKLZ_METHOD + (QLZ_COMPRESSION_LEVEL << 4);
return;
}}
// default fall through to error.
assert(0);
}
void toku_decompress (Bytef *dest, uLongf destLen,
const Bytef *source, uLongf sourceLen)
// See compress.h for the specification of this function.
{
assert(sourceLen>=1); // need at least one byte for the RFC header.
switch (source[0] & 0xF) {
case TOKU_ZLIB_METHOD: {
uLongf actual_destlen = destLen;
int r = uncompress(dest, &actual_destlen, source, sourceLen);
assert(r == Z_OK);
assert(actual_destlen == destLen);
return;
}
case TOKU_QUICKLZ_METHOD:
if (sourceLen>1) {
qlz_state_decompress *MALLOC(qsd);
uLongf actual_destlen = qlz_decompress((char*)source+1, dest, qsd);
assert(actual_destlen == destLen);
toku_free(qsd);
} else {
// length 1 means there is no data, so do nothing.
assert(destLen==0);
}
return;
}
// default fall through to error.
assert(0);
}
/* -*- mode: C; c-basic-offset: 4 -*- */
#ifndef TOKU_COMPRESS_H
#define TOKU_COMPRESS_H
#ident "$Id$"
#include <zlib.h>
// The following provides an abstraction of quicklz and zlib.
// We offer two compression methods: ZLIB and QUICKLZ.
// The resulting byte string includes enough information for us to decompress it. That is, we can tell whether it's z-compressed or qz-compressed.
enum toku_compression_method {
TOKU_ZLIB_METHOD = 8, // RFC 1950 says use 8 for zlib. It reserves 15 to allow more bytes.
TOKU_QUICKLZ_METHOD = 9 // We use 9 for QUICKLZ with compression level = 3. I couldn't find any standard for any other numbers, so I just use 9. -Bradley
};
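The method value doubles as the low nibble of the first byte of every compressed buffer (for quicklz, toku_compress also stores QLZ_COMPRESSION_LEVEL in the high nibble), which is how toku_decompress picks the right decoder. A minimal sketch of recovering the method from a buffer produced by toku_compress; the helper name is this sketch's own:
{{{
/* Sketch: the low nibble of byte 0 selects the decoder, exactly as toku_decompress does. */
static enum toku_compression_method method_of (const Bytef *compressed) {
    return (enum toku_compression_method)(compressed[0] & 0xF); /* 8 = zlib, 9 = quicklz */
}
}}}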
size_t toku_compress_bound (enum toku_compression_method a, size_t size);
// Effect: Return the number of bytes needed to compress a buffer of size SIZE using compression method A.
// Typically, the result is a little bit larger than SIZE, since some data cannot be compressed.
// Usage note: It may help to know roughly how much space is involved.
// zlib's bound is something like size + (size>>12) + (size>>14) + (size>>25) + 13.
// quicklz's bound is something like size+400.
void toku_compress (enum toku_compression_method a,
// the following types and naming conventions come from zlib.h
Bytef *dest, uLongf *destLen,
const Bytef *source, uLong sourceLen);
// Effect: Using compression method A, compress SOURCE into DEST. The number of bytes to compress is passed in SOURCELEN.
// On input: *destLen is the size of the buffer.
// On output: *destLen is the size of the actual compressed data.
// Usage note: sourceLen may be zero (unlike for quicklz, which requires sourceLen>0).
// Requires: The buffer must be big enough to hold the compressed data. (That is *destLen >= compressBound(a, sourceLen))
// Requires: sourceLen < 2^32.
// Usage note: Although we *try* to assert if the DESTLEN isn't big enough, it's possible that it's too late by then (in the case of quicklz, which offers
// no way to avoid a buffer overrun). So we require that DESTLEN is big enough.
// Rationale: zlib's argument order is DEST then SOURCE with the size of the buffer passed in *destLen, and the size of the result returned in *destLen.
// quicklz's argument order is SOURCE then DEST with the size returned (and it has no way to verify that an overwrite didn't happen).
// We use zlib's calling conventions partly because it is safer, and partly because it is more established.
// We also use zlib's ugly camel case convention for destLen and sourceLen.
// Unlike zlib, we return no error codes. Instead, we require that the data be OK and the size of the buffers is OK, and assert if there's a problem.
void toku_decompress (Bytef *dest, uLongf destLen,
const Bytef *source, uLongf sourceLen);
// Effect: Decompress source (length sourceLen) into dest (length destLen)
// This function can decompress data compressed with either the zlib or quicklz method by toku_compress(), which puts on a header byte so we can tell which it is.
// Requires: destLen is equal to the actual decompressed size of the data.
// Requires: The source must have been properly compressed.
#endif
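A minimal round-trip sketch of this API, following the specification above (the in-tree compress-test below exercises the same path with MALLOC_N/toku_free; plain malloc/free here is just this sketch's choice):
{{{
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "compress.h"

static void roundtrip (const unsigned char *buf, size_t len, enum toku_compression_method m) {
    size_t bound = toku_compress_bound(m, len);   /* worst-case compressed size */
    Bytef *cbuf  = malloc(bound);
    uLongf clen  = bound;                         /* in: buffer size; out: actual compressed size */
    toku_compress(m, cbuf, &clen, buf, len);
    unsigned char *ubuf = malloc(len ? len : 1);
    toku_decompress(ubuf, len, cbuf, clen);       /* destLen must equal the original length */
    assert(len == 0 || memcmp(ubuf, buf, len) == 0);
    free(ubuf);
    free(cbuf);
}
}}}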
......@@ -11,7 +11,6 @@
#include "toku_os.h"
#if TOKU_WINDOWS
#include "zlib.h"
#include "toku_pthread.h"
#include <dirent.h>
#else
......@@ -22,7 +21,6 @@
#include <sys/resource.h>
#include <sys/time.h>
#include <unistd.h>
#include <zlib.h>
#endif
#include <ctype.h>
......
// Fast data compression library
// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
// lar@quicklz.com
//
// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
// released into public must be open source) or under a commercial license if such
// has been acquired (see http://www.quicklz.com/order.html). The commercial license
// does not cover derived or ported versions created by third parties under GPL.
// 1.5.0 final
#include "quicklz.h"
#if QLZ_VERSION_MAJOR != 1 || QLZ_VERSION_MINOR != 5 || QLZ_VERSION_REVISION != 0
#error quicklz.c and quicklz.h have different versions
#endif
#if (defined(__X86__) || defined(__i386__) || defined(i386) || defined(_M_IX86) || defined(__386__) || defined(__x86_64__) || defined(_M_X64))
#define X86X64
#endif
#define MINOFFSET 2
#define UNCONDITIONAL_MATCHLEN 6
#define UNCOMPRESSED_END 4
#define CWORD_LEN 4
#if QLZ_COMPRESSION_LEVEL == 1 && defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
#define OFFSET_BASE source
#define CAST (ui32)(size_t)
#else
#define OFFSET_BASE 0
#define CAST
#endif
int qlz_get_setting(int setting)
{
switch (setting)
{
case 0: return QLZ_COMPRESSION_LEVEL;
case 1: return sizeof(qlz_state_compress);
case 2: return sizeof(qlz_state_decompress);
case 3: return QLZ_STREAMING_BUFFER;
#ifdef QLZ_MEMORY_SAFE
case 6: return 1;
#else
case 6: return 0;
#endif
case 7: return QLZ_VERSION_MAJOR;
case 8: return QLZ_VERSION_MINOR;
case 9: return QLZ_VERSION_REVISION;
}
return -1;
}
#if QLZ_COMPRESSION_LEVEL == 1
static int same(const unsigned char *src, size_t n)
{
while(n > 0 && *(src + n) == *src)
n--;
return n == 0 ? 1 : 0;
}
#endif
static void reset_table_compress(qlz_state_compress *state)
{
int i;
for(i = 0; i < QLZ_HASH_VALUES; i++)
{
#if QLZ_COMPRESSION_LEVEL == 1
state->hash[i].offset = 0;
#else
state->hash_counter[i] = 0;
#endif
}
}
static void reset_table_decompress(qlz_state_decompress *state)
{
int i;
(void)state;
(void)i;
#if QLZ_COMPRESSION_LEVEL == 2
for(i = 0; i < QLZ_HASH_VALUES; i++)
{
state->hash_counter[i] = 0;
}
#endif
}
static __inline ui32 hash_func(ui32 i)
{
#if QLZ_COMPRESSION_LEVEL == 2
return ((i >> 9) ^ (i >> 13) ^ i) & (QLZ_HASH_VALUES - 1);
#else
return ((i >> 12) ^ i) & (QLZ_HASH_VALUES - 1);
#endif
}
static __inline ui32 fast_read(void const *src, ui32 bytes)
{
#ifndef X86X64
unsigned char *p = (unsigned char*)src;
switch (bytes)
{
case 4:
return(*p | *(p + 1) << 8 | *(p + 2) << 16 | *(p + 3) << 24);
case 3:
return(*p | *(p + 1) << 8 | *(p + 2) << 16);
case 2:
return(*p | *(p + 1) << 8);
case 1:
return(*p);
}
return 0;
#else
if (bytes >= 1 && bytes <= 4)
return *((ui32*)src);
else
return 0;
#endif
}
static __inline ui32 hashat(const unsigned char *src)
{
ui32 fetch, hash;
fetch = fast_read(src, 3);
hash = hash_func(fetch);
return hash;
}
static __inline void fast_write(ui32 f, void *dst, size_t bytes)
{
#ifndef X86X64
unsigned char *p = (unsigned char*)dst;
switch (bytes)
{
case 4:
*p = (unsigned char)f;
*(p + 1) = (unsigned char)(f >> 8);
*(p + 2) = (unsigned char)(f >> 16);
*(p + 3) = (unsigned char)(f >> 24);
return;
case 3:
*p = (unsigned char)f;
*(p + 1) = (unsigned char)(f >> 8);
*(p + 2) = (unsigned char)(f >> 16);
return;
case 2:
*p = (unsigned char)f;
*(p + 1) = (unsigned char)(f >> 8);
return;
case 1:
*p = (unsigned char)f;
return;
}
#else
switch (bytes)
{
case 4:
*((ui32*)dst) = f;
return;
case 3:
*((ui32*)dst) = f;
return;
case 2:
*((ui16 *)dst) = (ui16)f;
return;
case 1:
*((unsigned char*)dst) = (unsigned char)f;
return;
}
#endif
}
size_t qlz_size_decompressed(const char *source)
{
ui32 n, r;
n = (((*source) & 2) == 2) ? 4 : 1;
r = fast_read(source + 1 + n, n);
r = r & (0xffffffff >> ((4 - n)*8));
return r;
}
size_t qlz_size_compressed(const char *source)
{
ui32 n, r;
n = (((*source) & 2) == 2) ? 4 : 1;
r = fast_read(source + 1, n);
r = r & (0xffffffff >> ((4 - n)*8));
return r;
}
static
size_t qlz_size_header(const char *source)
{
size_t n = 2*((((*source) & 2) == 2) ? 4 : 1) + 1;
return n;
}
static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n)
{
// Caution if modifying memcpy_up! Overlap of dst and src must be special handled.
#ifndef X86X64
unsigned char *end = dst + n;
while(dst < end)
{
*dst = *src;
dst++;
src++;
}
#else
ui32 f = 0;
do
{
*(ui32 *)(dst + f) = *(ui32 *)(src + f);
f += MINOFFSET + 1;
}
while (f < n);
#endif
}
static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s)
{
#if QLZ_COMPRESSION_LEVEL == 1
ui32 hash;
hash = hashat(s);
state->hash[hash].offset = s;
state->hash_counter[hash] = 1;
#elif QLZ_COMPRESSION_LEVEL == 2
ui32 hash;
unsigned char c;
hash = hashat(s);
c = state->hash_counter[hash];
state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s;
c++;
state->hash_counter[hash] = c;
#endif
(void)state;
(void)s;
}
#if QLZ_COMPRESSION_LEVEL <= 2
static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max)
{
while(*lh < max)
{
(*lh)++;
update_hash(state, *lh);
}
}
#endif
static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state)
{
const unsigned char *last_byte = source + size - 1;
const unsigned char *src = source;
unsigned char *cword_ptr = destination;
unsigned char *dst = destination + CWORD_LEN;
ui32 cword_val = 1U << 31;
const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
ui32 fetch = 0;
unsigned int lits = 0;
(void) lits;
if(src <= last_matchstart)
fetch = fast_read(src, 3);
while(src <= last_matchstart)
{
if ((cword_val & 1) == 1)
{
// store uncompressed if compression ratio is too low
if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5))
return 0;
fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
cword_ptr = dst;
dst += CWORD_LEN;
cword_val = 1U << 31;
fetch = fast_read(src, 3);
}
#if QLZ_COMPRESSION_LEVEL == 1
{
const unsigned char *o;
ui32 hash, cached;
hash = hash_func(fetch);
cached = fetch ^ state->hash[hash].cache;
state->hash[hash].cache = fetch;
o = state->hash[hash].offset + OFFSET_BASE;
state->hash[hash].offset = CAST(src - OFFSET_BASE);
#ifdef X86X64
if ((cached & 0xffffff) == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
{
if(cached != 0)
{
#else
if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6))))
{
if (*(o + 3) != *(src + 3))
{
#endif
hash <<= 4;
cword_val = (cword_val >> 1) | (1U << 31);
fast_write((3 - 2) | hash, dst, 2);
src += 3;
dst += 2;
}
else
{
const unsigned char *old_src = src;
size_t matchlen;
hash <<= 4;
cword_val = (cword_val >> 1) | (1U << 31);
src += 4;
if(*(o + (src - old_src)) == *src)
{
src++;
if(*(o + (src - old_src)) == *src)
{
size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1;
size_t remaining = q > 255 ? 255 : q;
src++;
while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining)
src++;
}
}
matchlen = src - old_src;
if (matchlen < 18)
{
fast_write((ui32)(matchlen - 2) | hash, dst, 2);
dst += 2;
}
else
{
fast_write((ui32)(matchlen << 16) | hash, dst, 3);
dst += 3;
}
}
fetch = fast_read(src, 3);
lits = 0;
}
else
{
lits++;
*dst = *src;
src++;
dst++;
cword_val = (cword_val >> 1);
#ifdef X86X64
fetch = fast_read(src, 3);
#else
fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16);
#endif
}
}
#elif QLZ_COMPRESSION_LEVEL >= 2
{
const unsigned char *o, *offset2;
ui32 hash, matchlen, k, m, best_k = 0;
unsigned char c;
size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1);
(void)best_k;
//hash = hashat(src);
fetch = fast_read(src, 3);
hash = hash_func(fetch);
c = state->hash_counter[hash];
offset2 = state->hash[hash].offset[0];
if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0)
{
matchlen = 3;
if(*(offset2 + matchlen) == *(src + matchlen))
{
matchlen = 4;
while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining)
matchlen++;
}
}
else
matchlen = 0;
for(k = 1; k < QLZ_POINTERS && c > k; k++)
{
o = state->hash[hash].offset[k];
#if QLZ_COMPRESSION_LEVEL == 3
if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
#elif QLZ_COMPRESSION_LEVEL == 2
if(*(src + matchlen) == *(o + matchlen) && ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET)
#endif
{
m = 3;
while(*(o + m) == *(src + m) && m < remaining)
m++;
#if QLZ_COMPRESSION_LEVEL == 3
if ((m > matchlen) || (m == matchlen && o > offset2))
#elif QLZ_COMPRESSION_LEVEL == 2
if (m > matchlen)
#endif
{
offset2 = o;
matchlen = m;
best_k = k;
}
}
}
o = offset2;
state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
c++;
state->hash_counter[hash] = c;
#if QLZ_COMPRESSION_LEVEL == 3
if(matchlen > 2 && src - o < 131071)
{
ui32 u;
size_t offset = src - o;
for(u = 1; u < matchlen; u++)
{
hash = hashat(src + u);
c = state->hash_counter[hash]++;
state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u;
}
cword_val = (cword_val >> 1) | (1U << 31);
src += matchlen;
if(matchlen == 3 && offset <= 63)
{
*dst = (unsigned char)(offset << 2);
dst++;
}
else if (matchlen == 3 && offset <= 16383)
{
ui32 f = (ui32)((offset << 2) | 1);
fast_write(f, dst, 2);
dst += 2;
}
else if (matchlen <= 18 && offset <= 1023)
{
ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2;
fast_write(f, dst, 2);
dst += 2;
}
else if(matchlen <= 33)
{
ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3;
fast_write(f, dst, 3);
dst += 3;
}
else
{
ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3;
fast_write(f, dst, 4);
dst += 4;
}
}
else
{
*dst = *src;
src++;
dst++;
cword_val = (cword_val >> 1);
}
#elif QLZ_COMPRESSION_LEVEL == 2
if(matchlen > 2)
{
cword_val = (cword_val >> 1) | (1U << 31);
src += matchlen;
if (matchlen < 10)
{
ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5);
fast_write(f, dst, 2);
dst += 2;
}
else
{
ui32 f = best_k | (matchlen << 16) | (hash << 5);
fast_write(f, dst, 3);
dst += 3;
}
}
else
{
*dst = *src;
src++;
dst++;
cword_val = (cword_val >> 1);
}
#endif
}
#endif
}
while (src <= last_byte)
{
if ((cword_val & 1) == 1)
{
fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
cword_ptr = dst;
dst += CWORD_LEN;
cword_val = 1U << 31;
}
#if QLZ_COMPRESSION_LEVEL < 3
if (src <= last_byte - 3)
{
#if QLZ_COMPRESSION_LEVEL == 1
ui32 hash, fetchv;
fetchv = fast_read(src, 3);
hash = hash_func(fetch);
state->hash[hash].offset = CAST(src - OFFSET_BASE);
state->hash[hash].cache = fetchv;
#elif QLZ_COMPRESSION_LEVEL == 2
ui32 hash;
unsigned char c;
hash = hashat(src);
c = state->hash_counter[hash];
state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src;
c++;
state->hash_counter[hash] = c;
#endif
}
#endif
*dst = *src;
src++;
dst++;
cword_val = (cword_val >> 1);
}
while((cword_val & 1) != 1)
cword_val = (cword_val >> 1);
fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN);
// min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument
return dst - destination < 9 ? 9 : dst - destination;
}
static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history)
{
const unsigned char *src = source + qlz_size_header((const char *)source);
unsigned char *dst = destination;
const unsigned char *last_destination_byte = destination + size - 1;
ui32 cword_val = 1;
const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END;
unsigned char *last_hashed = destination - 1;
const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1;
static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
(void) last_source_byte;
(void) last_hashed;
(void) state;
(void) history;
for(;;)
{
ui32 fetch;
if (cword_val == 1)
{
#ifdef QLZ_MEMORY_SAFE
if(src + CWORD_LEN - 1 > last_source_byte)
return 0;
#endif
cword_val = fast_read(src, CWORD_LEN);
src += CWORD_LEN;
}
#ifdef QLZ_MEMORY_SAFE
if(src + 4 - 1 > last_source_byte)
return 0;
#endif
fetch = fast_read(src, 4);
if ((cword_val & 1) == 1)
{
ui32 matchlen;
const unsigned char *offset2;
#if QLZ_COMPRESSION_LEVEL == 1
ui32 hash;
cword_val = cword_val >> 1;
hash = (fetch >> 4) & 0xfff;
offset2 = (const unsigned char *)(size_t)state->hash[hash].offset;
if((fetch & 0xf) != 0)
{
matchlen = (fetch & 0xf) + 2;
src += 2;
}
else
{
matchlen = *(src + 2);
src += 3;
}
#elif QLZ_COMPRESSION_LEVEL == 2
ui32 hash;
unsigned char c;
cword_val = cword_val >> 1;
hash = (fetch >> 5) & 0x7ff;
c = (unsigned char)(fetch & 0x3);
offset2 = state->hash[hash].offset[c];
if((fetch & (28)) != 0)
{
matchlen = ((fetch >> 2) & 0x7) + 2;
src += 2;
}
else
{
matchlen = *(src + 2);
src += 3;
}
#elif QLZ_COMPRESSION_LEVEL == 3
ui32 offset;
cword_val = cword_val >> 1;
if ((fetch & 3) == 0)
{
offset = (fetch & 0xff) >> 2;
matchlen = 3;
src++;
}
else if ((fetch & 2) == 0)
{
offset = (fetch & 0xffff) >> 2;
matchlen = 3;
src += 2;
}
else if ((fetch & 1) == 0)
{
offset = (fetch & 0xffff) >> 6;
matchlen = ((fetch >> 2) & 15) + 3;
src += 2;
}
else if ((fetch & 127) != 3)
{
offset = (fetch >> 7) & 0x1ffff;
matchlen = ((fetch >> 2) & 0x1f) + 2;
src += 3;
}
else
{
offset = (fetch >> 15);
matchlen = ((fetch >> 7) & 255) + 3;
src += 4;
}
offset2 = dst - offset;
#endif
#ifdef QLZ_MEMORY_SAFE
if(offset2 < history || offset2 > dst - MINOFFSET - 1)
return 0;
if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1))
return 0;
#endif
memcpy_up(dst, offset2, matchlen);
dst += matchlen;
#if QLZ_COMPRESSION_LEVEL <= 2
update_hash_upto(state, &last_hashed, dst - matchlen);
last_hashed = dst - 1;
#endif
}
else
{
if (dst < last_matchstart)
{
unsigned int n = bitlut[cword_val & 0xf];
#ifdef X86X64
*(ui32 *)dst = *(ui32 *)src;
#else
memcpy_up(dst, src, 4);
#endif
cword_val = cword_val >> n;
dst += n;
src += n;
#if QLZ_COMPRESSION_LEVEL <= 2
update_hash_upto(state, &last_hashed, dst - 3);
#endif
}
else
{
while(dst <= last_destination_byte)
{
if (cword_val == 1)
{
src += CWORD_LEN;
cword_val = 1U << 31;
}
#ifdef QLZ_MEMORY_SAFE
if(src >= last_source_byte + 1)
return 0;
#endif
*dst = *src;
dst++;
src++;
cword_val = cword_val >> 1;
}
#if QLZ_COMPRESSION_LEVEL <= 2
update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant
#endif
return size;
}
}
}
}
size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state)
{
size_t r;
ui32 compressed;
size_t base;
if(size == 0 || size > 0xffffffff - 400)
return 0;
if(size < 216)
base = 3;
else
base = 9;
#if QLZ_STREAMING_BUFFER > 0
if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER)
#endif
{
reset_table_compress(state);
r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state);
#if QLZ_STREAMING_BUFFER > 0
reset_table_compress(state);
#endif
if(r == base)
{
memcpy(destination + base, source, size);
r = size + base;
compressed = 0;
}
else
{
compressed = 1;
}
state->stream_counter = 0;
}
#if QLZ_STREAMING_BUFFER > 0
else
{
unsigned char *src = state->stream_buffer + state->stream_counter;
memcpy(src, source, size);
r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state);
if(r == base)
{
memcpy(destination + base, src, size);
r = size + base;
compressed = 0;
reset_table_compress(state);
}
else
{
compressed = 1;
}
state->stream_counter += size;
}
#endif
if(base == 3)
{
*destination = (unsigned char)(0 | compressed);
*(destination + 1) = (unsigned char)r;
*(destination + 2) = (unsigned char)size;
}
else
{
*destination = (unsigned char)(2 | compressed);
fast_write((ui32)r, destination + 1, 4);
fast_write((ui32)size, destination + 5, 4);
}
*destination |= (QLZ_COMPRESSION_LEVEL << 2);
*destination |= (1 << 6);
*destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 2 : 3))) << 4);
// 76543210
// 01SSLLHC
return r;
}
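The trailing "01SSLLHC" comment documents the flag byte qlz_compress just assembled: bit 0 (C) says whether the payload is actually compressed, bit 1 (H) selects 1- or 4-byte size fields, bits 2-3 (LL) hold QLZ_COMPRESSION_LEVEL, bits 4-5 (SS) encode the QLZ_STREAMING_BUFFER setting, and bit 6 is always set. A small decoding sketch (the function name is this sketch's own):
{{{
/* Sketch: pick apart the quicklz flag byte, laid out as 01SSLLHC (bit 7 .. bit 0). */
static void decode_qlz_flags (unsigned char flags) {
    int compressed  = flags & 1;         /* C: 1 if the payload is compressed, 0 if stored raw */
    int long_header = (flags >> 1) & 1;  /* H: 1 if sizes are stored as 4 bytes each */
    int level       = (flags >> 2) & 3;  /* LL: QLZ_COMPRESSION_LEVEL used by the writer */
    int streaming   = (flags >> 4) & 3;  /* SS: 0, 1, 2, 3 for buffer size 0, 100000, 1000000, other */
    (void)compressed; (void)long_header; (void)level; (void)streaming;
}
}}}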
size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state)
{
size_t dsiz = qlz_size_decompressed(source);
#if QLZ_STREAMING_BUFFER > 0
if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER)
#endif
{
if((*source & 1) == 1)
{
reset_table_decompress(state);
dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination);
}
else
{
memcpy(destination, source + qlz_size_header(source), dsiz);
}
state->stream_counter = 0;
reset_table_decompress(state);
}
#if QLZ_STREAMING_BUFFER > 0
else
{
unsigned char *dst = state->stream_buffer + state->stream_counter;
if((*source & 1) == 1)
{
dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer);
}
else
{
memcpy(dst, source + qlz_size_header(source), dsiz);
reset_table_decompress(state);
}
memcpy(destination, dst, dsiz);
state->stream_counter += dsiz;
}
#endif
return dsiz;
}
#ifndef QLZ_HEADER
#define QLZ_HEADER
// Fast data compression library
// Copyright (C) 2006-2011 Lasse Mikkel Reinhold
// lar@quicklz.com
//
// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything
// released into public must be open source) or under a commercial license if such
// has been acquired (see http://www.quicklz.com/order.html). The commercial license
// does not cover derived or ported versions created by third parties under GPL.
// You can edit following user settings. Data must be decompressed with the same
// setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed
// (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially
// zeroed out (see manual). First #ifndef makes it possible to define settings from
// the outside like the compiler command line.
// 1.5.0 final
#ifndef QLZ_COMPRESSION_LEVEL
//#define QLZ_COMPRESSION_LEVEL 1
//#define QLZ_COMPRESSION_LEVEL 2
#define QLZ_COMPRESSION_LEVEL 3
#define QLZ_STREAMING_BUFFER 0
//#define QLZ_STREAMING_BUFFER 100000
//#define QLZ_STREAMING_BUFFER 1000000
//#define QLZ_MEMORY_SAFE
#endif
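Per the comment above, the scratch-state structs must start zeroed when QLZ_STREAMING_BUFFER > 0; the in-tree quicklz-test below zeroes them unconditionally before the first call. A minimal initialization sketch (calloc here is just this sketch's choice):
{{{
/* Sketch: allocate and zero the quicklz scratch states before the first compress/decompress call. */
qlz_state_compress   *cstate = calloc(1, sizeof *cstate);
qlz_state_decompress *dstate = calloc(1, sizeof *dstate);
/* ... qlz_compress(src, dst, len, cstate); qlz_decompress(dst, out, dstate); ... */
free(cstate);
free(dstate);
}}}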
#define QLZ_VERSION_MAJOR 1
#define QLZ_VERSION_MINOR 5
#define QLZ_VERSION_REVISION 0
// Using size_t, memset() and memcpy()
#include <string.h>
// Verify compression level
#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3
#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3
#endif
typedef unsigned int ui32;
typedef unsigned short int ui16;
// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values!
#if QLZ_COMPRESSION_LEVEL == 1
#define QLZ_POINTERS 1
#define QLZ_HASH_VALUES 4096
#elif QLZ_COMPRESSION_LEVEL == 2
#define QLZ_POINTERS 4
#define QLZ_HASH_VALUES 2048
#elif QLZ_COMPRESSION_LEVEL == 3
#define QLZ_POINTERS 16
#define QLZ_HASH_VALUES 4096
#endif
// Detect if pointer size is 64-bit. It's not fatal if some 64-bit target is not detected because this is only for adding an optional 64-bit optimization.
#if defined _LP64 || defined __LP64__ || defined __64BIT__ || _ADDR64 || defined _WIN64 || defined __arch64__ || __WORDSIZE == 64 || (defined __sparc && defined __sparcv9) || defined __x86_64 || defined __amd64 || defined __x86_64__ || defined _M_X64 || defined _M_IA64 || defined __ia64 || defined __IA64__
#define QLZ_PTR_64
#endif
// hash entry
typedef struct
{
#if QLZ_COMPRESSION_LEVEL == 1
ui32 cache;
#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0
unsigned int offset;
#else
const unsigned char *offset;
#endif
#else
const unsigned char *offset[QLZ_POINTERS];
#endif
} qlz_hash_compress;
typedef struct
{
#if QLZ_COMPRESSION_LEVEL == 1
const unsigned char *offset;
#else
const unsigned char *offset[QLZ_POINTERS];
#endif
} qlz_hash_decompress;
// states
typedef struct
{
#if QLZ_STREAMING_BUFFER > 0
unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
#endif
size_t stream_counter;
qlz_hash_compress hash[QLZ_HASH_VALUES];
unsigned char hash_counter[QLZ_HASH_VALUES];
} qlz_state_compress;
#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2
typedef struct
{
#if QLZ_STREAMING_BUFFER > 0
unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
#endif
qlz_hash_decompress hash[QLZ_HASH_VALUES];
unsigned char hash_counter[QLZ_HASH_VALUES];
size_t stream_counter;
} qlz_state_decompress;
#elif QLZ_COMPRESSION_LEVEL == 3
typedef struct
{
#if QLZ_STREAMING_BUFFER > 0
unsigned char stream_buffer[QLZ_STREAMING_BUFFER];
#endif
#if QLZ_COMPRESSION_LEVEL <= 2
qlz_hash_decompress hash[QLZ_HASH_VALUES];
#endif
size_t stream_counter;
} qlz_state_decompress;
#endif
#if defined (__cplusplus)
extern "C" {
#endif
// Public functions of QuickLZ
size_t qlz_size_decompressed(const char *source);
size_t qlz_size_compressed(const char *source);
size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state);
size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state);
int qlz_get_setting(int setting);
#if defined (__cplusplus)
}
#endif
#endif
......@@ -6,12 +6,13 @@
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include "quicklz.h"
#include <zlib.h>
#include "toku_assert.h"
#include "x1764.h"
#include "threadpool.h"
#include "sub_block.h"
#include "compress.h"
void
sub_block_init(struct sub_block *sub_block) {
......@@ -120,6 +121,19 @@ compress_work_init(struct compress_work *w, struct sub_block *sub_block) {
w->sub_block = sub_block;
}
static enum toku_compression_method toku_compress_method = TOKU_QUICKLZ_METHOD;
void toku_set_default_compression_method (enum toku_compression_method a) {
switch (a) {
case TOKU_ZLIB_METHOD:
case TOKU_QUICKLZ_METHOD:
toku_compress_method = a;
return;
}
// fall through to error
assert(0);
}
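The default used by compress_sub_block below is now a run-time setting; either enum value is accepted and anything else asserts. A one-line usage sketch (the call site is hypothetical; nothing in this diff calls it yet):
{{{
/* Sketch: switch sub-block compression between the two supported methods. */
toku_set_default_compression_method(TOKU_ZLIB_METHOD);
}}}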
void
compress_sub_block(struct sub_block *sub_block) {
// compress it
......@@ -127,12 +141,9 @@ compress_sub_block(struct sub_block *sub_block) {
Bytef *compressed_ptr = (Bytef *) sub_block->compressed_ptr;
uLongf uncompressed_len = sub_block->uncompressed_size;
uLongf real_compressed_len = sub_block->compressed_size_bound;
int compression_level = 5;
int r = compress2((Bytef*)compressed_ptr, &real_compressed_len,
(Bytef*)uncompressed_ptr, uncompressed_len,
compression_level);
assert(r == Z_OK);
toku_compress(toku_compress_method,
compressed_ptr, &real_compressed_len,
uncompressed_ptr, uncompressed_len);
sub_block->compressed_size = real_compressed_len; // replace the compressed size estimate with the real size
// checksum it
......@@ -234,16 +245,9 @@ decompress_sub_block(void *compress_ptr, u_int32_t compress_size, void *uncompre
if (verbose_decompress_sub_block) fprintf(stderr, "%s:%d xsum %u expected %u\n", __FUNCTION__, __LINE__, xsum, expected_xsum);
result = EINVAL;
} else {
// decompress
uLongf destlen = uncompress_size;
int r = uncompress(uncompress_ptr, &destlen, compress_ptr, compress_size);
if (r != Z_OK || destlen != uncompress_size) {
if (verbose_decompress_sub_block) fprintf(stderr, "%s:%d uncompress %d %lu %u\n", __FUNCTION__, __LINE__, r, destlen, uncompress_size);
result = EINVAL;
toku_decompress(uncompress_ptr, uncompress_size, compress_ptr, compress_size);
}
}
return result;
}
......
......@@ -5,10 +5,15 @@
#ident "Copyright (c) 2007-2010 Tokutek Inc. All rights reserved."
#ident "The technology is licensed by the Massachusetts Institute of Technology, Rutgers State University of New Jersey, and the Research Foundation of State University of New York at Stony Brook under United States of America Serial No. 11/760379 and to the patents and/or patent applications resulting from it."
#include <compress.h>
#if defined(__cplusplus) || defined(__cilkplusplus)
extern "C" {
#endif
void toku_set_default_compression_method (enum toku_compression_method a);
// Effect: for the following functions, set the default compression method.
static const int max_sub_blocks = 8;
static const int target_sub_block_size = 512*1024;
......
/* -*- mode: C; c-basic-offset: 4 -*- */
// Test quicklz.
// Compare to compress-test which tests the toku compression (which is a composite of quicklz and zlib).
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#ident "$Id$"
#include "test.h"
#include "compress.h"
static void test_compress_buf_method (unsigned char *buf, int i, enum toku_compression_method m) {
int bound = toku_compress_bound(m, i);
unsigned char *MALLOC_N(bound, cb);
uLongf actual_clen = bound;
toku_compress(m, cb, &actual_clen, buf, i);
unsigned char *MALLOC_N(i, ubuf);
toku_decompress(ubuf, i, cb, actual_clen);
assert(0==memcmp(ubuf, buf, i));
toku_free(ubuf);
toku_free(cb);
}
static void test_compress_buf (unsigned char *buf, int i) {
test_compress_buf_method(buf, i, TOKU_ZLIB_METHOD);
test_compress_buf_method(buf, i, TOKU_QUICKLZ_METHOD);
}
static void test_compress_i (int i) {
unsigned char *MALLOC_N(i, b);
for (int j=0; j<i; j++) b[j] = random()%256;
test_compress_buf (b, i);
for (int j=0; j<i; j++) b[j] = 0;
test_compress_buf (b, i);
for (int j=0; j<i; j++) b[j] = 0xFF;
test_compress_buf (b, i);
toku_free(b);
}
static void test_compress (void) {
// unlike quicklz, we can handle length 0.
for (int i=0; i<100; i++) {
test_compress_i(i);
}
test_compress_i(1024);
test_compress_i(1024*1024*4);
test_compress_i(1024*1024*4 - 123); // just some random lengths
}
int test_main (int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_compress();
return 0;
}
/* -*- mode: C; c-basic-offset: 4 -*- */
// Test quicklz.
// Compare to compress-test which tests the toku compression (which is a composite of quicklz and zlib).
#ident "Copyright (c) 2010 Tokutek Inc. All rights reserved."
#ident "$Id$"
#include "test.h"
#include "quicklz.h"
static void test_qlz_random_i (int i) {
if (verbose) printf("i=%d\n", i);
qlz_state_compress *MALLOC(compress_state);
qlz_state_decompress *MALLOC(decompress_state);
char *MALLOC_N(i, m);
char *MALLOC_N(i, m2);
for (int j=0; j<i; j++) {
m[j] = (random()%256)-128;
}
int csize_bound = i+400;
char *MALLOC_N(csize_bound, c);
memset(compress_state, 0, sizeof(*compress_state));
memset(decompress_state, 0, sizeof(*decompress_state));
int s = qlz_compress(m, c, i, compress_state);
assert(s <= csize_bound);
int r = qlz_decompress(c, m2, decompress_state);
assert(r==i);
assert(memcmp(m, m2, i)==0);
toku_free(m);
toku_free(c);
toku_free(m2);
toku_free(compress_state);
toku_free(decompress_state);
}
static void test_qlz_random (void) {
// quicklz cannot handle i==0.
for (int i=1; i<100; i++) {
test_qlz_random_i(i);
}
for (int i=64; i<=1024*1024*8; i*=4) {
test_qlz_random_i(i);
test_qlz_random_i(i+random()%i);
}
}
int test_main (int argc, const char *argv[]) {
default_parse_args(argc, argv);
test_qlz_random();
return 0;
}
......@@ -95,4 +95,13 @@
fun:deflate
fun:compress2
}
{
qlz_is_not_valgrind_clean
Memcheck:Cond
fun:qlz_compress_core
}
{
qlz_is_not_valgrind_clean
Memcheck:Cond
fun:qlz_compress
}