Commit 1b5d04c5 authored by Ivan Krasin's avatar Ivan Krasin Committed by Russ Cox

compress/flate: fix Huffman tree bug

Incorporate refactoring and a regression test from https://golang.org/cl/4538090/

R=rsc, go.peter.90, imkrasin
CC=golang-dev, mirtchovski
https://golang.org/cl/4524070
parent e8c87a7d
......@@ -15,9 +15,6 @@ const (
// The largest offset code.
offsetCodeCount = 30
// The largest offset code in the extensions.
extendedOffsetCodeCount = 42
// The special code used to mark the end of a block.
endBlockMarker = 256
......@@ -100,11 +97,11 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
return &huffmanBitWriter{
w: w,
literalFreq: make([]int32, maxLit),
offsetFreq: make([]int32, extendedOffsetCodeCount),
codegen: make([]uint8, maxLit+extendedOffsetCodeCount+1),
offsetFreq: make([]int32, offsetCodeCount),
codegen: make([]uint8, maxLit+offsetCodeCount+1),
codegenFreq: make([]int32, codegenCodeCount),
literalEncoding: newHuffmanEncoder(maxLit),
offsetEncoding: newHuffmanEncoder(extendedOffsetCodeCount),
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
}
}
......@@ -290,13 +287,7 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n
}
w.writeBits(firstBits, 3)
w.writeBits(int32(numLiterals-257), 5)
if numOffsets > offsetCodeCount {
// Extended version of decompressor
w.writeBits(int32(offsetCodeCount+((numOffsets-(1+offsetCodeCount))>>3)), 5)
w.writeBits(int32((numOffsets-(1+offsetCodeCount))&0x7), 3)
} else {
w.writeBits(int32(numOffsets-1), 5)
}
w.writeBits(int32(numOffsets-1), 5)
w.writeBits(int32(numCodegens-4), 4)
for i := 0; i < numCodegens; i++ {
......@@ -368,24 +359,17 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
tokens = tokens[0 : n+1]
tokens[n] = endBlockMarker
totalLength := -1 // Subtract 1 for endBlock.
for _, t := range tokens {
switch t.typ() {
case literalType:
w.literalFreq[t.literal()]++
totalLength++
break
case matchType:
length := t.length()
offset := t.offset()
totalLength += int(length + 3)
w.literalFreq[lengthCodesStart+lengthCode(length)]++
w.offsetFreq[offsetCode(offset)]++
break
}
}
w.literalEncoding.generate(w.literalFreq, 15)
w.offsetEncoding.generate(w.offsetFreq, 15)
// get the number of literals
numLiterals := len(w.literalFreq)
......@@ -394,15 +378,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
}
// get the number of offsets
numOffsets := len(w.offsetFreq)
for numOffsets > 1 && w.offsetFreq[numOffsets-1] == 0 {
for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
numOffsets--
}
if numOffsets == 0 {
// We haven't found a single match. If we want to go with the dynamic encoding,
// we should count at least one offset to be sure that the offset huffman tree could be encoded.
w.offsetFreq[0] = 1
numOffsets = 1
}
w.literalEncoding.generate(w.literalFreq, 15)
w.offsetEncoding.generate(w.offsetFreq, 15)
storedBytes := 0
if input != nil {
storedBytes = len(input)
}
var extraBits int64
var storedSize int64
var storedSize int64 = math.MaxInt64
if storedBytes <= maxStoreBlockSize && input != nil {
storedSize = int64((storedBytes + 5) * 8)
// We only bother calculating the costs of the extra bits required by
......@@ -417,34 +411,29 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
// First four offset codes have extra size = 0.
extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
}
} else {
storedSize = math.MaxInt32
}
// Figure out which generates smaller code, fixed Huffman, dynamic
// Huffman, or just storing the data.
var fixedSize int64 = math.MaxInt64
if numOffsets <= offsetCodeCount {
fixedSize = int64(3) +
fixedLiteralEncoding.bitLength(w.literalFreq) +
fixedOffsetEncoding.bitLength(w.offsetFreq) +
extraBits
}
// Figure out smallest code.
// Fixed Huffman baseline.
var size = int64(3) +
fixedLiteralEncoding.bitLength(w.literalFreq) +
fixedOffsetEncoding.bitLength(w.offsetFreq) +
extraBits
var literalEncoding = fixedLiteralEncoding
var offsetEncoding = fixedOffsetEncoding
// Dynamic Huffman?
var numCodegens int
// Generate codegen and codegenFrequencies, which indicates how to encode
// the literalEncoding and the offsetEncoding.
w.generateCodegen(numLiterals, numOffsets)
w.codegenEncoding.generate(w.codegenFreq, 7)
numCodegens := len(w.codegenFreq)
numCodegens = len(w.codegenFreq)
for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
numCodegens--
}
extensionSummand := 0
if numOffsets > offsetCodeCount {
extensionSummand = 3
}
dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
// Following line is an extension.
int64(extensionSummand) +
w.codegenEncoding.bitLength(w.codegenFreq) +
int64(extraBits) +
int64(w.codegenFreq[16]*2) +
......@@ -454,26 +443,25 @@ func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
w.literalEncoding.bitLength(w.literalFreq) +
w.offsetEncoding.bitLength(w.offsetFreq)
if storedSize < fixedSize && storedSize < dynamicSize {
if dynamicSize < size {
size = dynamicSize
literalEncoding = w.literalEncoding
offsetEncoding = w.offsetEncoding
}
// Stored bytes?
if storedSize < size {
w.writeStoredHeader(storedBytes, eof)
w.writeBytes(input[0:storedBytes])
return
}
var literalEncoding *huffmanEncoder
var offsetEncoding *huffmanEncoder
if fixedSize <= dynamicSize {
// Huffman.
if literalEncoding == fixedLiteralEncoding {
w.writeFixedHeader(eof)
literalEncoding = fixedLiteralEncoding
offsetEncoding = fixedOffsetEncoding
} else {
// Write the header.
w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
literalEncoding = w.literalEncoding
offsetEncoding = w.offsetEncoding
}
// Write the tokens.
for _, t := range tokens {
switch t.typ() {
case literalType:
......
......@@ -363,7 +363,12 @@ func (s literalNodeSorter) Less(i, j int) bool {
func (s literalNodeSorter) Swap(i, j int) { s.a[i], s.a[j] = s.a[j], s.a[i] }
func sortByFreq(a []literalNode) {
s := &literalNodeSorter{a, func(i, j int) bool { return a[i].freq < a[j].freq }}
s := &literalNodeSorter{a, func(i, j int) bool {
if a[i].freq == a[j].freq {
return a[i].literal < a[j].literal
}
return a[i].freq < a[j].freq
}}
sort.Sort(s)
}
......
......@@ -77,8 +77,6 @@ type huffmanDecoder struct {
// Initialize Huffman decoding tables from array of code lengths.
func (h *huffmanDecoder) init(bits []int) bool {
// TODO(rsc): Return false sometimes.
// Count number of codes of each length,
// compute min and max length.
var count [maxCodeLen + 1]int
......
......@@ -6,6 +6,7 @@ package zlib
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
......@@ -17,15 +18,13 @@ var filenames = []string{
"../testdata/pi.txt",
}
var data = []string{
"test a reasonable sized string that can be compressed",
}
// Tests that compressing and then decompressing the given file at the given compression level and dictionary
// yields equivalent bytes to the original file.
func testFileLevelDict(t *testing.T, fn string, level int, d string) {
// Read dictionary, if given.
var dict []byte
if d != "" {
dict = []byte(d)
}
// Read the file, as golden output.
golden, err := os.Open(fn)
if err != nil {
......@@ -33,17 +32,25 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
return
}
defer golden.Close()
// Read the file again, and push it through a pipe that compresses at the write end, and decompresses at the read end.
raw, err := os.Open(fn)
if err != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
b0, err0 := ioutil.ReadAll(golden)
if err0 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
return
}
testLevelDict(t, fn, b0, level, d)
}
func testLevelDict(t *testing.T, fn string, b0 []byte, level int, d string) {
// Make dictionary, if given.
var dict []byte
if d != "" {
dict = []byte(d)
}
// Push data through a pipe that compresses at the write end, and decompresses at the read end.
piper, pipew := io.Pipe()
defer piper.Close()
go func() {
defer raw.Close()
defer pipew.Close()
zlibw, err := NewWriterDict(pipew, level, dict)
if err != nil {
......@@ -51,25 +58,14 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
return
}
defer zlibw.Close()
var b [1024]byte
for {
n, err0 := raw.Read(b[0:])
if err0 != nil && err0 != os.EOF {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
return
}
_, err1 := zlibw.Write(b[0:n])
if err1 == os.EPIPE {
// Fail, but do not report the error, as some other (presumably reportable) error broke the pipe.
return
}
if err1 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
return
}
if err0 == os.EOF {
break
}
_, err = zlibw.Write(b0)
if err == os.EPIPE {
// Fail, but do not report the error, as some other (presumably reported) error broke the pipe.
return
}
if err != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err)
return
}
}()
zlibr, err := NewReaderDict(piper, dict)
......@@ -79,13 +75,8 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
}
defer zlibr.Close()
// Compare the two.
b0, err0 := ioutil.ReadAll(golden)
// Compare the decompressed data.
b1, err1 := ioutil.ReadAll(zlibr)
if err0 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err0)
return
}
if err1 != nil {
t.Errorf("%s (level=%d, dict=%q): %v", fn, level, d, err1)
return
......@@ -103,6 +94,18 @@ func testFileLevelDict(t *testing.T, fn string, level int, d string) {
}
func TestWriter(t *testing.T) {
for i, s := range data {
b := []byte(s)
tag := fmt.Sprintf("#%d", i)
testLevelDict(t, tag, b, DefaultCompression, "")
testLevelDict(t, tag, b, NoCompression, "")
for level := BestSpeed; level <= BestCompression; level++ {
testLevelDict(t, tag, b, level, "")
}
}
}
func TestWriterBig(t *testing.T) {
for _, fn := range filenames {
testFileLevelDict(t, fn, DefaultCompression, "")
testFileLevelDict(t, fn, NoCompression, "")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment