Commit 1b5d04c5, authored by Ivan Krasin, committed by Russ Cox

compress/flate: fix Huffman tree bug

Incorporate refactoring and a regression test from https://golang.org/cl/4538090/

R=rsc, go.peter.90, imkrasin
CC=golang-dev, mirtchovski
https://golang.org/cl/4524070
parent e8c87a7d
...@@ -15,9 +15,6 @@ const ( ...@@ -15,9 +15,6 @@ const (
// The largest offset code. // The largest offset code.
offsetCodeCount = 30 offsetCodeCount = 30
// The largest offset code in the extensions.
extendedOffsetCodeCount = 42
// The special code used to mark the end of a block. // The special code used to mark the end of a block.
endBlockMarker = 256 endBlockMarker = 256
...@@ -100,11 +97,11 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter { ...@@ -100,11 +97,11 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
return &huffmanBitWriter{ return &huffmanBitWriter{
w: w, w: w,
literalFreq: make([]int32, maxLit), literalFreq: make([]int32, maxLit),
offsetFreq: make([]int32, extendedOffsetCodeCount), offsetFreq: make([]int32, offsetCodeCount),
codegen: make([]uint8, maxLit+extendedOffsetCodeCount+1), codegen: make([]uint8, maxLit+offsetCodeCount+1),
codegenFreq: make([]int32, codegenCodeCount), codegenFreq: make([]int32, codegenCodeCount),
literalEncoding: newHuffmanEncoder(maxLit), literalEncoding: newHuffmanEncoder(maxLit),
offsetEncoding: newHuffmanEncoder(extendedOffsetCodeCount), offsetEncoding: newHuffmanEncoder(offsetCodeCount),
codegenEncoding: newHuffmanEncoder(codegenCodeCount), codegenEncoding: newHuffmanEncoder(codegenCodeCount),
} }
} }
...@@ -290,13 +287,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n ...@@ -290,13 +287,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n
} }
w.writeBits(firstBits, 3) w.writeBits(firstBits, 3)
w.writeBits(int32(numLiterals-257), 5) w.writeBits(int32(numLiterals-257), 5)
if numOffsets > offsetCodeCount { w.writeBits(int32(numOffsets-1), 5)
// Extended version of decompressor
w.writeBits(int32(offsetCodeCount+((numOffsets-(1+offsetCodeCount))>>3)), 5)
w.writeBits(int32((numOffsets-(1+offsetCodeCount))&0x7), 3)
} else {
w.writeBits(int32(numOffsets-1), 5)
}
w.writeBits(int32(numCodegens-4), 4) w.writeBits(int32(numCodegens-4), 4)
for i := 0; i < numCodegens; i++ { for i := 0; i < numCodegens; i++ {
...@@ -368,24 +359,17 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { ...@@ -368,24 +359,17 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
tokens = tokens[0 : n+1] tokens = tokens[0 : n+1]
tokens[n] = endBlockMarker tokens[n] = endBlockMarker
totalLength := -1 // Subtract 1 for endBlock.
for _, t := range tokens { for _, t := range tokens {
switch t.typ() { switch t.typ() {
case literalType: case literalType:
w.literalFreq[t.literal()]++ w.literalFreq[t.literal()]++
totalLength++
break
case matchType: case matchType:
length := t.length() length := t.length()
offset := t.offset() offset := t.offset()
totalLength += int(length + 3)
w.literalFreq[lengthCodesStart+lengthCode(length)]++ w.literalFreq[lengthCodesStart+lengthCode(length)]++
w.offsetFreq[offsetCode(offset)]++ w.offsetFreq[offsetCode(offset)]++
break
} }
} }
w.literalEncoding.generate(w.literalFreq, 15)
w.offsetEncoding.generate(w.offsetFreq, 15)
// get the number of literals // get the number of literals
numLiterals := len(w.literalFreq) numLiterals := len(w.literalFreq)
...@@ -394,15 +378,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { ...@@ -394,15 +378,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
} }
// get the number of offsets // get the number of offsets
numOffsets := len(w.offsetFreq) numOffsets := len(w.offsetFreq)
for numOffsets > 1 && w.offsetFreq[numOffsets-1] == 0 { for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
numOffsets-- numOffsets--
} }
if numOffsets == 0 {
// We haven't found a single match. If we want to go with the dynamic encoding,
// we should count at least one offset to be sure that the offset huffman tree could be encoded.
w.offsetFreq[0] = 1
numOffsets = 1
}
w.literalEncoding.generate(w.literalFreq, 15)
w.offsetEncoding.generate(w.offsetFreq, 15)
storedBytes := 0 storedBytes := 0
if input != nil { if input != nil {
storedBytes = len(input) storedBytes = len(input)
} }
var extraBits int64 var extraBits int64
var storedSize int64 var storedSize int64 = math.MaxInt64
if storedBytes <= maxStoreBlockSize && input != nil { if storedBytes <= maxStoreBlockSize && input != nil {
storedSize = int64((storedBytes + 5) * 8) storedSize = int64((storedBytes + 5) * 8)
// We only bother calculating the costs of the extra bits required by // We only bother calculating the costs of the extra bits required by
...@@ -417,34 +411,29 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { ...@@ -417,34 +411,29 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
// First four offset codes have extra size = 0. // First four offset codes have extra size = 0.
extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode]) extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
} }
} else {
storedSize = math.MaxInt32
} }
// Figure out which generates smaller code, fixed Huffman, dynamic // Figure out smallest code.
// Huffman, or just storing the data. // Fixed Huffman baseline.
var fixedSize int64 = math.MaxInt64 var size = int64(3) +
if numOffsets <= offsetCodeCount { fixedLiteralEncoding.bitLength(w.literalFreq) +
fixedSize = int64(3) + fixedOffsetEncoding.bitLength(w.offsetFreq) +
fixedLiteralEncoding.bitLength(w.literalFreq) + extraBits
fixedOffsetEncoding.bitLength(w.offsetFreq) + var literalEncoding = fixedLiteralEncoding
extraBits var offsetEncoding = fixedOffsetEncoding
}
// Dynamic Huffman?
var numCodegens int
// Generate codegen and codegenFrequencies, which indicates how to encode // Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding. // the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets) w.generateCodegen(numLiterals, numOffsets)
w.codegenEncoding.generate(w.codegenFreq, 7) w.codegenEncoding.generate(w.codegenFreq, 7)
numCodegens := len(w.codegenFreq) numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 { for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens-- numCodegens--
} }
extensionSummand := 0
if numOffsets > offsetCodeCount {
extensionSummand = 3
}
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) + dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
// Following line is an extension.
int64(extensionSummand) +
w.codegenEncoding.bitLength(w.codegenFreq) + w.codegenEncoding.bitLength(w.codegenFreq) +
int64(extraBits) + int64(extraBits) +
int64(w.codegenFreq[16]*2) + int64(w.codegenFreq[16]*2) +
...@@ -454,26 +443,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) { ...@@ -454,26 +443,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
w.literalEncoding.bitLength(w.literalFreq) + w.literalEncoding.bitLength(w.literalFreq) +
w.offsetEncoding.bitLength(w.offsetFreq) w.offsetEncoding.bitLength(w.offsetFreq)
if storedSize < fixedSize && storedSize < dynamicSize { if dynamicSize < size {
size = dynamicSize
literalEncoding = w.literalEncoding
offsetEncoding = w.offsetEncoding
}
// Stored bytes?
if storedSize < size {
w.writeStoredHeader(storedBytes, eof) w.writeStoredHeader(storedBytes, eof)
w.writeBytes(input[0:storedBytes]) w.writeBytes(input[0:storedBytes])
return return
} }
var literalEncoding *huffmanEncoder
var offsetEncoding *huffmanEncoder
if fixedSize <= dynamicSize { // Huffman.
if literalEncoding == fixedLiteralEncoding {
w.writeFixedHeader(eof) w.writeFixedHeader(eof)
literalEncoding = fixedLiteralEncoding
offsetEncoding = fixedOffsetEncoding
} else { } else {
// Write the header.
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof) w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
literalEncoding = w.literalEncoding
offsetEncoding = w.offsetEncoding
} }
// Write the tokens.
for _, t := range tokens { for _, t := range tokens {
switch t.typ() { switch t.typ() {
case literalType: case literalType:
......
...@@ -363,7 +363,12 @@ func (s literalNodeSorter) Less(i, j int) bool { ...@@ -363,7 +363,12 @@ func (s literalNodeSorter) Less(i, j int) bool {
func (s literalNodeSorter) Swap(i, j int) { s.a[i], s.a[j] = s.a[j], s.a[i] } func (s literalNodeSorter) Swap(i, j int) { s.a[i], s.a[j] = s.a[j], s.a[i] }
func sortByFreq(a []literalNode) { func sortByFreq(a []literalNode) {
s := &literalNodeSorter{a, func(i, j int) bool { return a[i].freq < a[j].freq }} s := &literalNodeSorter{a, func(i, j int) bool {
if a[i].freq == a[j].freq {
return a[i].literal < a[j].literal
}
return a[i].freq < a[j].freq
}}
sort.Sort(s) sort.Sort(s)
} }
......
...@@ -77,8 +77,6 @@ type huffmanDecoder struct { ...@@ -77,8 +77,6 @@ type huffmanDecoder struct {
// Initialize Huffman decoding tables from array of code lengths. // Initialize Huffman decoding tables from array of code lengths.
func (h *huffmanDecoder) init(bits []int) bool { func (h *huffmanDecoder) init(bits []int) bool {
// TODO(rsc): Return false sometimes.
// Count number of codes of each length, // Count number of codes of each length,
// compute min and max length. // compute min and max length.
var count [maxCodeLen + 1]int var count [maxCodeLen + 1]int
......
...@@ -6,6 +6,7 @@ package zlib ...@@ -6,6 +6,7 @@ package zlib
import ( import (
"bytes" "bytes"
"fmt"
"io" "io"
"io/ioutil" "io/ioutil"
"os" "os"
...@@ -17,15 +18,13 @@ var filenames = []string{ ...@@ -17,15 +18,13 @@ var filenames = []string{
"../testdata/pi.txt", "../testdata/pi.txt",
} }
var data = []string{
"test a reasonable sized string that can be compressed",
}
// Tests that compressing and then decompressing the given file at the given compression level and dictionary // Tests that compressing and then decompressing the given file at the given compression level and dictionary
// yields equivalent bytes to the original file. // yields equivalent bytes to the original file.
func testFileLevelDict(t *testing.T, fn string, level int, d string) { func testFileLevelDict(t *testing.T, fn string, level int, d string) {
// Read dictionary, if given.
var dict []byte
if d != "" {
dict = []byte(d)
}
// Read the file, as golden output. // Read the file, as golden output.
golden, err := os.Open(fn) golden, err := os.Open(fn)
if err != nil { if err != nil {
...@@ -33,17 +32,25 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { ...@@ -33,17 +32,25 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
return return
} }
defer golden.Close() defer golden.Close()
b0, err0 := ioutil.ReadAll(golden)
// Read the file again, and push it through a pipe that compresses at the write end, and decompresses at the read end. if err0 != nil {
raw, err := os.Open(fn) t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
if err != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
return return
} }
testLevelDict(t, fn, b0, level, d)
}
func testLevelDict(t *testing.T, fn string, b0 []byte, level int, d string) {
// Make dictionary, if given.
var dict []byte
if d != "" {
dict = []byte(d)
}
// Push data through a pipe that compresses at the write end, and decompresses at the read end.
piper, pipew := io.Pipe() piper, pipew := io.Pipe()
defer piper.Close() defer piper.Close()
go func() { go func() {
defer raw.Close()
defer pipew.Close() defer pipew.Close()
zlibw, err := NewWriterDict(pipew, level, dict) zlibw, err := NewWriterDict(pipew, level, dict)
if err != nil { if err != nil {
...@@ -51,25 +58,14 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { ...@@ -51,25 +58,14 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
return return
} }
defer zlibw.Close() defer zlibw.Close()
var b [1024]byte _, err = zlibw.Write(b0)
for { if err == os.EPIPE {
n, err0 := raw.Read(b[0:]) // Fail, but do not report the error, as some other (presumably reported) error broke the pipe.
if err0 != nil && err0 != os.EOF { return
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0) }
return if err != nil {
} t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
_, err1 := zlibw.Write(b[0:n]) return
if err1 == os.EPIPE {
// Fail, but do not report the error, as some other (presumably reportable) error broke the pipe.
return
}
if err1 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
return
}
if err0 == os.EOF {
break
}
} }
}() }()
zlibr, err := NewReaderDict(piper, dict) zlibr, err := NewReaderDict(piper, dict)
...@@ -79,13 +75,8 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { ...@@ -79,13 +75,8 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
} }
defer zlibr.Close() defer zlibr.Close()
// Compare the two. // Compare the decompressed data.
b0, err0 := ioutil.ReadAll(golden)
b1, err1 := ioutil.ReadAll(zlibr) b1, err1 := ioutil.ReadAll(zlibr)
if err0 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
return
}
if err1 != nil { if err1 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1) t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
return return
...@@ -103,6 +94,18 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) { ...@@ -103,6 +94,18 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
} }
func TestWriter(t *testing.T) { func TestWriter(t *testing.T) {
for i, s := range data {
b := []byte(s)
tag := fmt.Sprintf("#%d", i)
testLevelDict(t, tag, b, DefaultCompression, "")
testLevelDict(t, tag, b, NoCompression, "")
for level := BestSpeed; level <= BestCompression; level++ {
testLevelDict(t, tag, b, level, "")
}
}
}
func TestWriterBig(t *testing.T) {
for _, fn := range filenames { for _, fn := range filenames {
testFileLevelDict(t, fn, DefaultCompression, "") testFileLevelDict(t, fn, DefaultCompression, "")
testFileLevelDict(t, fn, NoCompression, "") testFileLevelDict(t, fn, NoCompression, "")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment