Commit 1fb4e4de authored by Nigel Tao's avatar Nigel Tao

compress/flate: use a constant hash table size for Best Speed.

This makes compress/flate's version of Snappy diverge from the upstream
golang/snappy version, but the latter has a goal of matching C++ snappy
output byte-for-byte. Both C++ and the asm version of golang/snappy can
use a smaller N for the O(N) zero-initialization of the hash table when
the input is small, even if the pure Go golang/snappy algorithm cannot:
"var table [tableSize]uint16" zeroes all tableSize elements.

For this package, we don't have the match-C++-snappy goal, so we can use
a different (constant) hash table size.

This is a small win, in terms of throughput and output size, but it also
enables us to re-use the (constant size) hash table between
encodeBestSpeed calls, avoiding the cost of zero-initializing the hash
table altogether. This will be implemented in follow-up commits.

This package's benchmarks:
name                    old speed      new speed      delta
EncodeDigitsSpeed1e4-8  72.8MB/s ± 1%  73.5MB/s ± 1%  +0.86%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e5-8  77.5MB/s ± 1%  78.0MB/s ± 0%  +0.69%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e6-8  82.0MB/s ± 1%  82.7MB/s ± 1%  +0.85%   (p=0.000 n=10+9)
EncodeTwainSpeed1e4-8   65.1MB/s ± 1%  65.6MB/s ± 0%  +0.78%   (p=0.000 n=10+9)
EncodeTwainSpeed1e5-8   80.0MB/s ± 0%  80.6MB/s ± 1%  +0.66%   (p=0.000 n=9+10)
EncodeTwainSpeed1e6-8   81.6MB/s ± 1%  82.1MB/s ± 1%  +0.55%  (p=0.017 n=10+10)

Input size in bytes, output size (and time taken) before and after on
some larger files:
1073741824   57269781 (  3183ms)   57269781 (  3177ms) adresser.001
1000000000  391052000 ( 11071ms)  391051996 ( 11067ms) enwik9
1911399616  378679516 ( 13450ms)  378679514 ( 13079ms) gob-stream
8558382592 3972329193 ( 99962ms) 3972329193 ( 91290ms) rawstudio-mint14.tar
 200000000  200015265 (   776ms)  200015265 (   774ms) sharnd.out

Thanks to Klaus Post for the original suggestion on cl/21021.

Change-Id: Ia4c63a8d1b92c67e1765ec5c3c8c69d289d9a6ce
Reviewed-on: https://go-review.googlesource.com/22604Reviewed-by: default avatarRuss Cox <rsc@golang.org>
parent 5edcff01
...@@ -7,7 +7,14 @@ package flate ...@@ -7,7 +7,14 @@ package flate
// This encoding algorithm, which prioritizes speed over output size, is // This encoding algorithm, which prioritizes speed over output size, is
// based on Snappy's LZ77-style encoder: github.com/golang/snappy // based on Snappy's LZ77-style encoder: github.com/golang/snappy
const maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset. const (
maxOffset = 1 << logMaxOffsetSize // Maximum deflate offset.
tableBits = 14 // Bits used in the table.
tableSize = 1 << tableBits // Size of the table.
tableMask = tableSize - 1 // Mask for table indices. Redundant, but can eliminate bounds checks.
tableShift = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
)
func load32(b []byte, i int) uint32 { func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line. b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
...@@ -20,8 +27,8 @@ func load64(b []byte, i int) uint64 { ...@@ -20,8 +27,8 @@ func load64(b []byte, i int) uint64 {
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
} }
func hash(u, shift uint32) uint32 { func hash(u uint32) uint32 {
return (u * 0x1e35a7bd) >> shift return (u * 0x1e35a7bd) >> tableShift
} }
// These constants are defined by the Snappy implementation so that its // These constants are defined by the Snappy implementation so that its
...@@ -40,24 +47,11 @@ func encodeBestSpeed(dst []token, src []byte) []token { ...@@ -40,24 +47,11 @@ func encodeBestSpeed(dst []token, src []byte) []token {
return emitLiteral(dst, src) return emitLiteral(dst, src)
} }
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive. // Initialize the hash table.
//
// The table element type is uint16, as s < sLimit and sLimit < len(src) // The table element type is uint16, as s < sLimit and sLimit < len(src)
// and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535. // and len(src) <= maxStoreBlockSize and maxStoreBlockSize == 65535.
const ( var table [tableSize]uint16
maxTableSize = 1 << 14
// tableMask is redundant, but helps the compiler eliminate bounds
// checks.
tableMask = maxTableSize - 1
)
shift := uint32(32 - 8)
for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
shift--
}
// In Go, all array elements are zero-initialized, so there is no advantage
// to a smaller tableSize per se. However, it matches the C++ algorithm,
// and in the asm versions of this code, we can get away with zeroing only
// the first tableSize elements.
var table [maxTableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin // sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are // lets us use a fast path for emitLiteral in the main loop, while we are
...@@ -70,7 +64,7 @@ func encodeBestSpeed(dst []token, src []byte) []token { ...@@ -70,7 +64,7 @@ func encodeBestSpeed(dst []token, src []byte) []token {
// The encoded form must start with a literal, as there are no previous // The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1. // bytes to copy, so we start looking for hash matches at s == 1.
s := 1 s := 1
nextHash := hash(load32(src, s), shift) nextHash := hash(load32(src, s))
for { for {
// Copied from the C++ snappy implementation: // Copied from the C++ snappy implementation:
...@@ -102,7 +96,7 @@ func encodeBestSpeed(dst []token, src []byte) []token { ...@@ -102,7 +96,7 @@ func encodeBestSpeed(dst []token, src []byte) []token {
} }
candidate = int(table[nextHash&tableMask]) candidate = int(table[nextHash&tableMask])
table[nextHash&tableMask] = uint16(s) table[nextHash&tableMask] = uint16(s)
nextHash = hash(load32(src, nextS), shift) nextHash = hash(load32(src, nextS))
if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) { if s-candidate < maxOffset && load32(src, s) == load32(src, candidate) {
break break
} }
...@@ -152,13 +146,13 @@ func encodeBestSpeed(dst []token, src []byte) []token { ...@@ -152,13 +146,13 @@ func encodeBestSpeed(dst []token, src []byte) []token {
// are faster as one load64 call (with some shifts) instead of // are faster as one load64 call (with some shifts) instead of
// three load32 calls. // three load32 calls.
x := load64(src, s-1) x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift) prevHash := hash(uint32(x >> 0))
table[prevHash&tableMask] = uint16(s - 1) table[prevHash&tableMask] = uint16(s - 1)
currHash := hash(uint32(x>>8), shift) currHash := hash(uint32(x >> 8))
candidate = int(table[currHash&tableMask]) candidate = int(table[currHash&tableMask])
table[currHash&tableMask] = uint16(s) table[currHash&tableMask] = uint16(s)
if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) { if s-candidate >= maxOffset || uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift) nextHash = hash(uint32(x >> 16))
s++ s++
break break
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment