Commit b03129aa authored by Joe Tsai's avatar Joe Tsai Committed by Nigel Tao

compress/flate: make library RFC1951 compliant

Corrected several issues:
 * RFC1951 section 3.2.7 dictates that it is okay for the HDist tree to have a
single code of zero bits. Furthermore, the behavior of the C zlib library
permits empty trees even when there are more than one codes.
 * RFC1951 section 3.2.5 shows that HLit codes 286 and 287 are invalid. Thus,
Go's implementation should choke on inputs using these codes.
 * RFC1951 section 3.2.5 and 3.2.7 are ambiguous about whether the number of
HDist codes can be greater than 30. The C zlib library (which is the canonical
reference implementation) performs this check here:
https://github.com/madler/zlib/blob/62d6112a7981ad7c34f3b43cffdf00d4662a4f25/inflate.c#L906

In addition, a number of test cases were added to the unit tests that exercises
these edge cases. The test cases listed in TestStreams will either fail or
succeed in a manner matching the behaviour of the C zlib version. Given that the
C zlib implementation is the reference for the world, Go's implementation should
match C zlib behaviour.

Fixes #11030

Change-Id: Ic24e4e40ce5832c7e1930249246e86d34bfedaa6
Reviewed-on: https://go-review.googlesource.com/11000Reviewed-by: default avatarNigel Tao <nigeltao@golang.org>
parent 10076b55
......@@ -10,23 +10,11 @@ package flate
import (
"bytes"
"encoding/hex"
"io/ioutil"
"strings"
"testing"
)
func TestUncompressedSource(t *testing.T) {
decoder := NewReader(bytes.NewReader([]byte{0x01, 0x01, 0x00, 0xfe, 0xff, 0x11}))
output := make([]byte, 1)
n, error := decoder.Read(output)
if n != 1 || error != nil {
t.Fatalf("decoder.Read() = %d, %v, want 1, nil", n, error)
}
if output[0] != 0x11 {
t.Errorf("output[0] = %x, want 0x11", output[0])
}
}
// The following test should not panic.
func TestIssue5915(t *testing.T) {
bits := []int{4, 0, 0, 6, 4, 3, 2, 3, 3, 4, 4, 5, 0, 0, 0, 0, 5, 5, 6,
......@@ -90,29 +78,183 @@ func TestInvalidBits(t *testing.T) {
}
}
func TestDegenerateHuffmanCoding(t *testing.T) {
const (
want = "abcabc"
// This compressed form has a dynamic Huffman block, even though a
// sensible encoder would use a literal data block, as the latter is
// shorter. Still, it is a valid flate compression of "abcabc". It has
// a degenerate Huffman table with only one coded value: the one
// non-literal back-ref copy of the first "abc" to the second "abc".
//
// To verify that this is decompressible with zlib (the C library),
// it's easy to use the Python wrapper library:
// >>> import zlib
// >>> compressed = "\x0c\xc2...etc...\xff\xff"
// >>> zlib.decompress(compressed, -15) # negative means no GZIP header.
// 'abcabc'
compressed = "\x0c\xc2\x01\x0d\x00\x00\x00\x82\xb0\xac\x4a\xff\x0e\xb0\x7d\x27" +
"\x06\x00\x00\xff\xff"
)
b, err := ioutil.ReadAll(NewReader(strings.NewReader(compressed)))
if err != nil {
t.Fatal(err)
}
if got := string(b); got != want {
t.Fatalf("got %q, want %q", got, want)
func TestStreams(t *testing.T) {
// To verify any of these hexstrings as valid or invalid flate streams
// according to the C zlib library, you can use the Python wrapper library:
// >>> hex_string = "010100feff11"
// >>> import zlib
// >>> zlib.decompress(hex_string.decode("hex"), -15) # Negative means raw DEFLATE
// '\x11'
testCases := []struct {
desc string // Description of the stream
stream string // Hexstring of the input DEFLATE stream
want string // Expected result. Use "fail" to expect failure
}{{
"degenerate HCLenTree",
"05e0010000000000100000000000000000000000000000000000000000000000" +
"00000000000000000004",
"fail",
}, {
"complete HCLenTree, empty HLitTree, empty HDistTree",
"05e0010400000000000000000000000000000000000000000000000000000000" +
"00000000000000000010",
"fail",
}, {
"empty HCLenTree",
"05e0010000000000000000000000000000000000000000000000000000000000" +
"00000000000000000010",
"fail",
}, {
"complete HCLenTree, complete HLitTree, empty HDistTree, use missing HDist symbol",
"000100feff000de0010400000000100000000000000000000000000000000000" +
"0000000000000000000000000000002c",
"fail",
}, {
"complete HCLenTree, complete HLitTree, degenerate HDistTree, use missing HDist symbol",
"000100feff000de0010000000000000000000000000000000000000000000000" +
"00000000000000000610000000004070",
"fail",
}, {
"complete HCLenTree, empty HLitTree, empty HDistTree",
"05e0010400000000100400000000000000000000000000000000000000000000" +
"0000000000000000000000000008",
"fail",
}, {
"complete HCLenTree, empty HLitTree, degenerate HDistTree",
"05e0010400000000100400000000000000000000000000000000000000000000" +
"0000000000000000000800000008",
"fail",
}, {
"complete HCLenTree, degenerate HLitTree, degenerate HDistTree, use missing HLit symbol",
"05e0010400000000100000000000000000000000000000000000000000000000" +
"0000000000000000001c",
"fail",
}, {
"complete HCLenTree, complete HLitTree, too large HDistTree",
"edff870500000000200400000000000000000000000000000000000000000000" +
"000000000000000000080000000000000004",
"fail",
}, {
"complete HCLenTree, complete HLitTree, empty HDistTree, excessive repeater code",
"edfd870500000000200400000000000000000000000000000000000000000000" +
"000000000000000000e8b100",
"fail",
}, {
"complete HCLenTree, complete HLitTree, empty HDistTree of normal length 30",
"05fd01240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
"ffffffffffffffffff07000000fe01",
"",
}, {
"complete HCLenTree, complete HLitTree, empty HDistTree of excessive length 31",
"05fe01240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
"ffffffffffffffffff07000000fc03",
"fail",
}, {
"complete HCLenTree, over-subscribed HLitTree, empty HDistTree",
"05e001240000000000fcffffffffffffffffffffffffffffffffffffffffffff" +
"ffffffffffffffffff07f00f",
"fail",
}, {
"complete HCLenTree, under-subscribed HLitTree, empty HDistTree",
"05e001240000000000fcffffffffffffffffffffffffffffffffffffffffffff" +
"fffffffffcffffffff07f00f",
"fail",
}, {
"complete HCLenTree, complete HLitTree with single code, empty HDistTree",
"05e001240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
"ffffffffffffffffff07f00f",
"01",
}, {
"complete HCLenTree, complete HLitTree with multiple codes, empty HDistTree",
"05e301240000000000f8ffffffffffffffffffffffffffffffffffffffffffff" +
"ffffffffffffffffff07807f",
"01",
}, {
"complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HDist symbol",
"000100feff000de0010400000000100000000000000000000000000000000000" +
"0000000000000000000000000000003c",
"00000000",
}, {
"complete HCLenTree, degenerate HLitTree, degenerate HDistTree",
"05e0010400000000100000000000000000000000000000000000000000000000" +
"0000000000000000000c",
"",
}, {
"complete HCLenTree, degenerate HLitTree, empty HDistTree",
"05e0010400000000100000000000000000000000000000000000000000000000" +
"00000000000000000004",
"",
}, {
"complete HCLenTree, complete HLitTree, empty HDistTree, spanning repeater code",
"edfd870500000000200400000000000000000000000000000000000000000000" +
"000000000000000000e8b000",
"",
}, {
"complete HCLenTree with length codes, complete HLitTree, empty HDistTree",
"ede0010400000000100000000000000000000000000000000000000000000000" +
"0000000000000000000400004000",
"",
}, {
"complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HLit symbol 284 with count 31",
"000100feff00ede0010400000000100000000000000000000000000000000000" +
"000000000000000000000000000000040000407f00",
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"0000000000000000000000000000000000000000000000000000000000000000" +
"000000",
}, {
"complete HCLenTree, complete HLitTree, degenerate HDistTree, use valid HLit and HDist symbols",
"0cc2010d00000082b0ac4aff0eb07d27060000ffff",
"616263616263",
}, {
"fixed block, use reserved symbol 287",
"33180700",
"fail",
}, {
"raw block",
"010100feff11",
"11",
}, {
"issue 10426 - over-subscribed HCLenTree causes a hang",
"344c4a4e494d4b070000ff2e2eff2e2e2e2e2eff",
"fail",
}, {
"issue 11030 - empty HDistTree unexpectedly leads to error",
"05c0070600000080400fff37a0ca",
"",
}, {
"issue 11033 - empty HDistTree unexpectedly leads to error",
"050fb109c020cca5d017dcbca044881ee1034ec149c8980bbc413c2ab35be9dc" +
"b1473449922449922411202306ee97b0383a521b4ffdcf3217f9f7d3adb701",
"3130303634342068652e706870005d05355f7ed957ff084a90925d19e3ebc6d0" +
"c6d7",
}}
for i, tc := range testCases {
data, err := hex.DecodeString(tc.stream)
if err != nil {
t.Fatal(err)
}
data, err = ioutil.ReadAll(NewReader(bytes.NewReader(data)))
if tc.want == "fail" {
if err == nil {
t.Errorf("#%d (%s): got nil error, want non-nil", i, tc.desc)
}
} else {
if err != nil {
t.Errorf("#%d (%s): %v", i, tc.desc, err)
continue
}
if got := hex.EncodeToString(data); got != tc.want {
t.Errorf("#%d (%s):\ngot %q\nwant %q", i, tc.desc, got, tc.want)
}
}
}
}
......@@ -45,6 +45,10 @@ type huffmanDecoder struct {
}
// Initialize Huffman decoding tables from array of code lengths.
// Following this function, h is guaranteed to be initialized into a complete
// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
// degenerate case where the tree has only a single symbol with length 1. Empty
// trees are permitted.
func (h *huffmanDecoder) init(bits []int) bool {
// Sanity enables additional runtime tests during Huffman
// table construction. It's intended to be used during
......@@ -71,8 +75,16 @@ func (h *huffmanDecoder) init(bits []int) bool {
}
count[n]++
}
// Empty tree. The decompressor.huffSym function will fail later if the tree
// is used. Technically, an empty tree is only valid for the HDIST tree and
// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
// is guaranteed to fail since it will attempt to use the tree to decode the
// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
// guaranteed to fail later since the compressed data section must be
// composed of at least one symbol (the end-of-block marker).
if max == 0 {
return false
return true
}
code := 0
......
......@@ -87,11 +87,11 @@ type huffmanBitWriter struct {
func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
return &huffmanBitWriter{
w: w,
literalFreq: make([]int32, maxLit),
literalFreq: make([]int32, maxNumLit),
offsetFreq: make([]int32, offsetCodeCount),
codegen: make([]uint8, maxLit+offsetCodeCount+1),
codegen: make([]uint8, maxNumLit+offsetCodeCount+1),
codegenFreq: make([]int32, codegenCodeCount),
literalEncoding: newHuffmanEncoder(maxLit),
literalEncoding: newHuffmanEncoder(maxNumLit),
offsetEncoding: newHuffmanEncoder(offsetCodeCount),
codegenEncoding: newHuffmanEncoder(codegenCodeCount),
}
......
......@@ -47,11 +47,11 @@ func newHuffmanEncoder(size int) *huffmanEncoder {
// Generates a HuffmanCode corresponding to the fixed literal table
func generateFixedLiteralEncoding() *huffmanEncoder {
h := newHuffmanEncoder(maxLit)
h := newHuffmanEncoder(maxNumLit)
codeBits := h.codeBits
code := h.code
var ch uint16
for ch = 0; ch < maxLit; ch++ {
for ch = 0; ch < maxNumLit; ch++ {
var bits uint16
var size uint8
switch {
......
......@@ -18,10 +18,12 @@ import (
const (
maxCodeLen = 16 // max length of Huffman code
maxHist = 32768 // max history required
// The next three numbers come from the RFC, section 3.2.7.
maxLit = 286
maxDist = 32
numCodes = 19 // number of codes in Huffman meta-code
// The next three numbers come from the RFC section 3.2.7, with the
// additional proviso in section 3.2.5 which implies that distance codes
// 30 and 31 should never occur in compressed data.
maxNumLit = 286
maxNumDist = 30
numCodes = 19 // number of codes in Huffman meta-code
)
// A CorruptInputError reports the presence of corrupt input at a given offset.
......@@ -101,6 +103,10 @@ type huffmanDecoder struct {
}
// Initialize Huffman decoding tables from array of code lengths.
// Following this function, h is guaranteed to be initialized into a complete
// tree (i.e., neither over-subscribed nor under-subscribed). The exception is a
// degenerate case where the tree has only a single symbol with length 1. Empty
// trees are permitted.
func (h *huffmanDecoder) init(bits []int) bool {
// Sanity enables additional runtime tests during Huffman
// table construction. It's intended to be used during
......@@ -127,8 +133,16 @@ func (h *huffmanDecoder) init(bits []int) bool {
}
count[n]++
}
// Empty tree. The decompressor.huffSym function will fail later if the tree
// is used. Technically, an empty tree is only valid for the HDIST tree and
// not the HCLEN and HLIT tree. However, a stream with an empty HCLEN tree
// is guaranteed to fail since it will attempt to use the tree to decode the
// codes for the HLIT and HDIST trees. Similarly, an empty HLIT tree is
// guaranteed to fail later since the compressed data section must be
// composed of at least one symbol (the end-of-block marker).
if max == 0 {
return false
return true
}
code := 0
......@@ -258,7 +272,7 @@ type decompressor struct {
h1, h2 huffmanDecoder
// Length arrays used to define Huffman codes.
bits *[maxLit + maxDist]int
bits *[maxNumLit + maxNumDist]int
codebits *[numCodes]int
// Output history, buffer.
......@@ -356,12 +370,14 @@ func (f *decompressor) readHuffman() error {
}
}
nlit := int(f.b&0x1F) + 257
if nlit > maxLit {
if nlit > maxNumLit {
return CorruptInputError(f.roffset)
}
f.b >>= 5
ndist := int(f.b&0x1F) + 1
// maxDist is 32, so ndist is always valid.
if ndist > maxNumDist {
return CorruptInputError(f.roffset)
}
f.b >>= 5
nclen := int(f.b&0xF) + 4
// numCodes is 19, so nclen is always valid.
......@@ -492,9 +508,12 @@ func (f *decompressor) huffmanBlock() {
case v < 285:
length = v*32 - (281*32 - 131)
n = 5
default:
case v < maxNumLit:
length = 258
n = 0
default:
f.err = CorruptInputError(f.roffset)
return
}
if n > 0 {
for f.nb < n {
......@@ -529,10 +548,7 @@ func (f *decompressor) huffmanBlock() {
switch {
case dist < 4:
dist++
case dist >= 30:
f.err = CorruptInputError(f.roffset)
return
default:
case dist < maxNumDist:
nb := uint(dist-2) >> 1
// have 1 bit in bottom of dist, need nb more.
extra := (dist & 1) << nb
......@@ -546,6 +562,9 @@ func (f *decompressor) huffmanBlock() {
f.b >>= nb
f.nb -= nb
dist = 1<<(nb+1) + 1 + extra
default:
f.err = CorruptInputError(f.roffset)
return
}
// Copy history[-dist:-dist+length] into output.
......@@ -692,6 +711,10 @@ func (f *decompressor) moreBits() error {
// Read the next Huffman-encoded symbol from f according to h.
func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
// Since a huffmanDecoder can be empty or be composed of a degenerate tree
// with single element, huffSym must error on these two edge cases. In both
// cases, the chunks slice will be 0 for the invalid sequence, leading it
// satisfy the n == 0 check below.
n := uint(h.min)
for {
for f.nb < n {
......@@ -761,7 +784,7 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
// The ReadCloser returned by NewReader also implements Resetter.
func NewReader(r io.Reader) io.ReadCloser {
var f decompressor
f.bits = new([maxLit + maxDist]int)
f.bits = new([maxNumLit + maxNumDist]int)
f.codebits = new([numCodes]int)
f.r = makeReader(r)
f.hist = new([maxHist]byte)
......@@ -780,7 +803,7 @@ func NewReaderDict(r io.Reader, dict []byte) io.ReadCloser {
var f decompressor
f.r = makeReader(r)
f.hist = new([maxHist]byte)
f.bits = new([maxLit + maxDist]int)
f.bits = new([maxNumLit + maxNumDist]int)
f.codebits = new([numCodes]int)
f.step = (*decompressor).nextBlock
f.setDict(dict)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment