Commit f20b1809 authored by Klaus Post's avatar Klaus Post Committed by Brad Fitzpatrick

compress/flate: eliminate most common bounds checks

This uses the SSA compiler to eliminate various unneeded bounds
checks in loops and various lookups.

This fixes the low hanging fruit, without any major code changes.

name                       old time/op    new time/op    delta
EncodeDigitsHuffman1e4-8     49.9µs ± 1%    48.1µs ± 1%  -3.74%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8      476µs ± 1%     458µs ± 1%  -3.58%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8     4.80ms ± 2%    4.56ms ± 1%  -5.07%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8        305µs ± 3%     290µs ± 2%  -5.03%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8       3.67ms ± 2%    3.49ms ± 2%  -4.78%   (p=0.000 n=9+10)
EncodeDigitsSpeed1e6-8       38.3ms ± 2%    35.8ms ± 1%  -6.58%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8      361µs ± 2%     346µs ± 3%  -4.12%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8     5.24ms ± 2%    4.96ms ± 3%  -5.38%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8     56.5ms ± 3%    52.2ms ± 2%  -7.68%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8     362µs ± 2%     343µs ± 1%  -5.20%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8    5.26ms ± 3%    4.98ms ± 2%  -5.48%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8    56.0ms ± 4%    52.1ms ± 1%  -7.01%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8      70.9µs ± 3%    64.7µs ± 1%  -8.68%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8       556µs ± 2%     524µs ± 2%  -5.84%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8      5.54ms ± 3%    5.22ms ± 2%  -5.70%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8         294µs ± 3%     284µs ± 1%  -3.71%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8        2.59ms ± 2%    2.48ms ± 1%  -4.14%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8        25.6ms ± 1%    24.3ms ± 1%  -5.28%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8       419µs ± 2%     396µs ± 1%  -5.59%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8      6.23ms ± 4%    5.75ms ± 1%  -7.83%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8      66.2ms ± 2%    61.4ms ± 1%  -7.22%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8      426µs ± 1%     405µs ± 1%  -4.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8     6.80ms ± 1%    6.32ms ± 1%  -6.97%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8     74.6ms ± 3%    68.7ms ± 1%  -7.90%   (p=0.000 n=10+9)

name                       old speed      new speed      delta
EncodeDigitsHuffman1e4-8    200MB/s ± 1%   208MB/s ± 1%  +3.88%   (p=0.000 n=10+9)
EncodeDigitsHuffman1e5-8    210MB/s ± 1%   218MB/s ± 1%  +3.71%  (p=0.000 n=10+10)
EncodeDigitsHuffman1e6-8    208MB/s ± 2%   219MB/s ± 1%  +5.32%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e4-8     32.8MB/s ± 3%  34.5MB/s ± 2%  +5.29%   (p=0.000 n=10+9)
EncodeDigitsSpeed1e5-8     27.2MB/s ± 2%  28.6MB/s ± 2%  +5.29%  (p=0.000 n=10+10)
EncodeDigitsSpeed1e6-8     26.1MB/s ± 2%  27.9MB/s ± 1%  +7.02%   (p=0.000 n=9+10)
EncodeDigitsDefault1e4-8   27.7MB/s ± 2%  28.9MB/s ± 3%  +4.30%   (p=0.000 n=10+9)
EncodeDigitsDefault1e5-8   19.1MB/s ± 2%  20.2MB/s ± 3%  +5.69%  (p=0.000 n=10+10)
EncodeDigitsDefault1e6-8   17.7MB/s ± 3%  19.2MB/s ± 2%  +8.31%  (p=0.000 n=10+10)
EncodeDigitsCompress1e4-8  27.6MB/s ± 2%  29.1MB/s ± 1%  +5.47%   (p=0.000 n=10+9)
EncodeDigitsCompress1e5-8  19.0MB/s ± 3%  20.1MB/s ± 2%  +5.78%  (p=0.000 n=10+10)
EncodeDigitsCompress1e6-8  17.9MB/s ± 4%  19.2MB/s ± 1%  +7.50%  (p=0.000 n=10+10)
EncodeTwainHuffman1e4-8     141MB/s ± 3%   154MB/s ± 1%  +9.46%   (p=0.000 n=10+9)
EncodeTwainHuffman1e5-8     180MB/s ± 2%   191MB/s ± 2%  +6.19%  (p=0.000 n=10+10)
EncodeTwainHuffman1e6-8     181MB/s ± 3%   192MB/s ± 2%  +6.02%  (p=0.000 n=10+10)
EncodeTwainSpeed1e4-8      34.0MB/s ± 3%  35.3MB/s ± 1%  +3.84%  (p=0.000 n=10+10)
EncodeTwainSpeed1e5-8      38.7MB/s ± 2%  40.3MB/s ± 1%  +4.30%   (p=0.000 n=10+9)
EncodeTwainSpeed1e6-8      39.1MB/s ± 1%  41.2MB/s ± 1%  +5.57%   (p=0.000 n=9+10)
EncodeTwainDefault1e4-8    23.9MB/s ± 2%  25.3MB/s ± 1%  +5.91%   (p=0.000 n=10+9)
EncodeTwainDefault1e5-8    16.0MB/s ± 4%  17.4MB/s ± 1%  +8.47%   (p=0.000 n=10+9)
EncodeTwainDefault1e6-8    15.1MB/s ± 2%  16.3MB/s ± 1%  +7.76%  (p=0.000 n=10+10)
EncodeTwainCompress1e4-8   23.5MB/s ± 1%  24.7MB/s ± 1%  +5.24%   (p=0.000 n=9+10)
EncodeTwainCompress1e5-8   14.7MB/s ± 1%  15.8MB/s ± 1%  +7.50%   (p=0.000 n=9+10)
EncodeTwainCompress1e6-8   13.4MB/s ± 3%  14.6MB/s ± 1%  +8.57%   (p=0.000 n=10+9)

Change-Id: I5c7e84c2f9ea4d38a2115995705eebb93387e22f
Reviewed-on: https://go-review.googlesource.com/21759Reviewed-by: default avatarBrad Fitzpatrick <bradfitz@golang.org>
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 012557b3
......@@ -73,8 +73,8 @@ type compressor struct {
// hashPrev[hashHead[hashValue] & windowMask] contains the previous index
// with the same hash value.
chainHead int
hashHead []uint32
hashPrev []uint32
hashHead [hashSize]uint32
hashPrev [windowSize]uint32
hashOffset int
// input window: unprocessed data is window[index:windowEnd]
......@@ -188,12 +188,13 @@ func (d *compressor) fillWindow(b []byte) {
var newH uint32
for i, val := range dst {
di := i + index
newH = val & hashMask
newH = val
hh := &d.hashHead[newH&hashMask]
// Get previous value with the same hash.
// Our chain should point to the previous value.
d.hashPrev[di&windowMask] = d.hashHead[newH]
d.hashPrev[di&windowMask] = *hh
// Set the head of the hash chain to us.
d.hashHead[newH] = uint32(di + d.hashOffset)
*hh = uint32(di + d.hashOffset)
}
d.hash = newH
}
......@@ -293,6 +294,7 @@ func bulkHash4(b []byte, dst []uint32) {
// bytes in size.
func matchLen(a, b []byte, max int) int {
a = a[:max]
b = b[:len(a)]
for i, av := range a {
if b[i] != av {
return i
......@@ -302,8 +304,6 @@ func matchLen(a, b []byte, max int) int {
}
func (d *compressor) initDeflate() {
d.hashHead = make([]uint32, hashSize)
d.hashPrev = make([]uint32, windowSize)
d.window = make([]byte, 2*windowSize)
d.hashOffset = 1
d.tokens = make([]token, 0, maxFlateBlockTokens+1)
......@@ -358,9 +358,10 @@ Loop:
if d.index < d.maxInsertIndex {
// Update the hash
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
d.chainHead = int(d.hashHead[d.hash])
hh := &d.hashHead[d.hash&hashMask]
d.chainHead = int(*hh)
d.hashPrev[d.index&windowMask] = uint32(d.chainHead)
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
*hh = uint32(d.index + d.hashOffset)
}
prevLength := d.length
prevOffset := d.offset
......@@ -404,9 +405,10 @@ Loop:
d.hash = hash4(d.window[d.index : d.index+minMatchLength])
// Get previous value with the same hash.
// Our chain should point to the previous value.
d.hashPrev[d.index&windowMask] = d.hashHead[d.hash]
hh := &d.hashHead[d.hash&hashMask]
d.hashPrev[d.index&windowMask] = *hh
// Set the head of the hash chain to us.
d.hashHead[d.hash] = uint32(d.index + d.hashOffset)
*hh = uint32(d.index + d.hashOffset)
}
}
if d.fastSkipHashing == skipNever {
......@@ -531,9 +533,6 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
return nil
}
// hzeroes is used for zeroing the hash slice.
var hzeroes [256]uint32
func (d *compressor) reset(w io.Writer) {
d.w.reset(w)
d.sync = false
......@@ -543,15 +542,13 @@ func (d *compressor) reset(w io.Writer) {
d.windowEnd = 0
default:
d.chainHead = -1
for s := d.hashHead; len(s) > 0; {
n := copy(s, hzeroes[:])
s = s[n:]
for i := range d.hashHead {
d.hashHead[i] = 0
}
for s := d.hashPrev; len(s) > 0; s = s[len(hzeroes):] {
copy(s, hzeroes[:])
for i := range d.hashPrev {
d.hashPrev[i] = 0
}
d.hashOffset = 1
d.index, d.windowEnd = 0, 0
d.blockStart, d.byteAvailable = 0, false
d.tokens = d.tokens[:0]
......
......@@ -84,11 +84,11 @@ type huffmanBitWriter struct {
bits uint64
nbits uint
bytes [bufferSize]byte
codegenFreq [codegenCodeCount]int32
nbytes int
literalFreq []int32
offsetFreq []int32
codegen []uint8
codegenFreq []int32
literalEncoding *huffmanEncoder
offsetEncoding *huffmanEncoder
codegenEncoding *huffmanEncoder
......@@ -101,7 +101,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
literalFreq: make([]int32, maxNumLit),
offsetFreq: make([]int32, offsetCodeCount),
codegen: make([]uint8, maxNumLit+offsetCodeCount+1),
codegenFreq: make([]int32, codegenCodeCount),
literalEncoding: newHuffmanEncoder(maxNumLit),
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
......@@ -143,12 +142,13 @@ func (w *huffmanBitWriter) writeBits(b int32, nb uint) {
w.bits >>= 48
w.nbits -= 48
n := w.nbytes
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n >= bufferFlushSize {
_, w.err = w.w.Write(w.bytes[:n])
......@@ -293,12 +293,13 @@ func (w *huffmanBitWriter) writeCode(c hcode) {
w.bits >>= 48
w.nbits -= 48
n := w.nbytes
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n >= bufferFlushSize {
_, w.err = w.w.Write(w.bytes[:n])
......@@ -428,13 +429,13 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
}
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
w.codegenEncoding.bitLength(w.codegenFreq) +
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
int64(extraBits) +
int64(w.codegenFreq[16]*2) +
int64(w.codegenFreq[17]*3) +
......@@ -482,7 +483,7 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens []token, eof bool, input []b
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, w.offsetEncoding)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens := len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
......@@ -609,13 +610,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets, w.literalEncoding, huffOffset)
w.codegenEncoding.generate(w.codegenFreq, 7)
w.codegenEncoding.generate(w.codegenFreq[:], 7)
numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
}
headerSize := int64(3+5+5+4+(3*numCodegens)) +
w.codegenEncoding.bitLength(w.codegenFreq) +
w.codegenEncoding.bitLength(w.codegenFreq[:]) +
int64(w.codegenFreq[16]*2) +
int64(w.codegenFreq[17]*3) +
int64(w.codegenFreq[18]*7)
......@@ -639,7 +640,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
// Huffman.
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
encoding := w.literalEncoding.codes
encoding := w.literalEncoding.codes[:257]
n := w.nbytes
for _, t := range input {
// Bitwriting inlined, ~30% speedup
......@@ -653,12 +654,13 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
bits := w.bits
w.bits >>= 48
w.nbits -= 48
w.bytes[n+0] = byte(bits)
w.bytes[n+1] = byte(bits >> 8)
w.bytes[n+2] = byte(bits >> 16)
w.bytes[n+3] = byte(bits >> 24)
w.bytes[n+4] = byte(bits >> 32)
w.bytes[n+5] = byte(bits >> 40)
bytes := w.bytes[n : n+6]
bytes[0] = byte(bits)
bytes[1] = byte(bits >> 8)
bytes[2] = byte(bits >> 16)
bytes[3] = byte(bits >> 24)
bytes[4] = byte(bits >> 32)
bytes[5] = byte(bits >> 40)
n += 6
if n < bufferFlushSize {
continue
......@@ -677,6 +679,7 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte) {
//
// len(h) must be >= 256, and h's elements must be all zeroes.
func histogram(b []byte, h []int32) {
h = h[:256]
for _, t := range b {
h[t]++
}
......
......@@ -96,8 +96,8 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
func generateFixedOffsetEncoding() *huffmanEncoder {
h := newHuffmanEncoder(30)
codes := h.codes
for ch := uint16(0); ch < 30; ch++ {
codes[ch] = hcode{code: reverseBits(ch, 5), len: 5}
for ch := range codes {
codes[ch] = hcode{code: reverseBits(uint16(ch), 5), len: 5}
}
return h
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment