Commit bf693982 authored by Adam Langley's avatar Adam Langley

compress/bzip2: add package.

This code implements bzip2 decompression only.

R=bradfitzgo, r2, nigeltao, rsc
CC=golang-dev
https://golang.org/cl/4176051
parent 658447ab
......@@ -21,6 +21,7 @@ DIRS=\
bufio\
bytes\
cmath\
compress/bzip2\
compress/flate\
compress/gzip\
compress/lzw \
......
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
include ../../../Make.inc
TARG=compress/bzip2
GOFILES=\
bit_reader.go\
bzip2.go\
huffman.go\
move_to_front.go\
include ../../../Make.pkg
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
import (
"bufio"
"io"
"os"
)
// bitReader wraps an io.Reader and provides the ability to read values,
// bit-by-bit, from it. Its Read* methods don't return the usual os.Error
// because the error handling was verbose. Instead, any error is kept and can
// be checked afterwards.
type bitReader struct {
r byteReader
n uint64
bits uint
err os.Error
}
// bitReader needs to read bytes from an io.Reader. We attempt to cast the
// given io.Reader to this interface and, if it doesn't already fit, we wrap in
// a bufio.Reader.
type byteReader interface {
ReadByte() (byte, os.Error)
}
func newBitReader(r io.Reader) bitReader {
byter, ok := r.(byteReader)
if !ok {
byter = bufio.NewReader(r)
}
return bitReader{r: byter}
}
// ReadBits64 reads the given number of bits and returns them in the
// least-significant part of a uint64. In the event of an error, it returns 0
// and the error can be obtained by calling Error().
func (br *bitReader) ReadBits64(bits uint) (n uint64) {
for bits > br.bits {
b, err := br.r.ReadByte()
if err == os.EOF {
err = io.ErrUnexpectedEOF
}
if err != nil {
br.err = err
return 0
}
br.n <<= 8
br.n |= uint64(b)
br.bits += 8
}
// br.n looks like this (assuming that br.bits = 14 and bits = 6):
// Bit: 111111
// 5432109876543210
//
// (6 bits, the desired output)
// |-----|
// V V
// 0101101101001110
// ^ ^
// |------------|
// br.bits (num valid bits)
//
// This the next line right shifts the desired bits into the
// least-significant places and masks off anything above.
n = (br.n >> (br.bits - bits)) & ((1 << bits) - 1)
br.bits -= bits
return
}
func (br *bitReader) ReadBits(bits uint) (n int) {
n64 := br.ReadBits64(bits)
return int(n64)
}
func (br *bitReader) ReadBit() bool {
n := br.ReadBits(1)
return n != 0
}
func (br *bitReader) Error() os.Error {
return br.err
}
This diff is collapsed.
This diff is collapsed.
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
import (
"os"
"sort"
)
// A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
// symbol.
type huffmanTree struct {
// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
// root of the tree and nextNode contains the index of the next element
// of nodes to use when the tree is being constructed.
nodes []huffmanNode
nextNode int
}
// A huffmanNode is a node in the tree. left and right contain indexes into the
// nodes slice of the tree. If left or right is invalidNodeValue then the child
// is a left node and its value is in leftValue/rightValue.
//
// The symbols are uint16s because bzip2 encodes not only MTF indexes in the
// tree, but also two magic values for run-length encoding and an EOF symbol.
// Thus there are more than 256 possible symbols.
type huffmanNode struct {
left, right uint16
leftValue, rightValue uint16
}
// invalidNodeValue is an invalid index which marks a leaf node in the tree.
const invalidNodeValue = 0xffff
// Decode reads bits from the given bitReader and navigates the tree until a
// symbol is found.
func (t huffmanTree) Decode(br *bitReader) (v uint16) {
nodeIndex := uint16(0) // node 0 is the root of the tree.
for {
node := &t.nodes[nodeIndex]
bit := br.ReadBit()
// bzip2 encodes left as a true bit.
if bit {
// left
if node.left == invalidNodeValue {
return node.leftValue
}
nodeIndex = node.left
} else {
// right
if node.right == invalidNodeValue {
return node.rightValue
}
nodeIndex = node.right
}
}
panic("unreachable")
}
// newHuffmanTree builds a Huffman tree from a slice containing the code
// lengths of each symbol. The maximum code length is 32 bits.
func newHuffmanTree(lengths []uint8) (huffmanTree, os.Error) {
// There are many possible trees that assign the same code length to
// each symbol (consider reflecting a tree down the middle, for
// example). Since the code length assignments determine the
// efficiency of the tree, each of these trees is equally good. In
// order to minimise the amount of information needed to build a tree
// bzip2 uses a canonical tree so that it can be reconstructed given
// only the code length assignments.
if len(lengths) < 2 {
panic("newHuffmanTree: too few symbols")
}
var t huffmanTree
// First we sort the code length assignments by ascending code length,
// using the symbol value to break ties.
pairs := huffmanSymbolLengthPairs(make([]huffmanSymbolLengthPair, len(lengths)))
for i, length := range lengths {
pairs[i].value = uint16(i)
pairs[i].length = length
}
sort.Sort(pairs)
// Now we assign codes to the symbols, starting with the longest code.
// We keep the codes packed into a uint32, at the most-significant end.
// So branches are taken from the MSB downwards. This makes it easy to
// sort them later.
code := uint32(0)
length := uint8(32)
codes := huffmanCodes(make([]huffmanCode, len(lengths)))
for i := len(pairs) - 1; i >= 0; i-- {
if length > pairs[i].length {
// If the code length decreases we shift in order to
// zero any bits beyond the end of the code.
length >>= 32 - pairs[i].length
length <<= 32 - pairs[i].length
length = pairs[i].length
}
codes[i].code = code
codes[i].codeLen = length
codes[i].value = pairs[i].value
// We need to 'increment' the code, which means treating |code|
// like a |length| bit number.
code += 1 << (32 - length)
}
// Now we can sort by the code so that the left half of each branch are
// grouped together, recursively.
sort.Sort(codes)
t.nodes = make([]huffmanNode, len(codes))
_, err := buildHuffmanNode(&t, codes, 0)
return t, err
}
// huffmanSymbolLengthPair contains a symbol and its code length.
type huffmanSymbolLengthPair struct {
value uint16
length uint8
}
// huffmanSymbolLengthPair is used to provide an interface for sorting.
type huffmanSymbolLengthPairs []huffmanSymbolLengthPair
func (h huffmanSymbolLengthPairs) Len() int {
return len(h)
}
func (h huffmanSymbolLengthPairs) Less(i, j int) bool {
if h[i].length < h[j].length {
return true
}
if h[i].length > h[j].length {
return false
}
if h[i].value < h[j].value {
return true
}
return false
}
func (h huffmanSymbolLengthPairs) Swap(i, j int) {
h[i], h[j] = h[j], h[i]
}
// huffmanCode contains a symbol, its code and code length.
type huffmanCode struct {
code uint32
codeLen uint8
value uint16
}
// huffmanCodes is used to provide an interface for sorting.
type huffmanCodes []huffmanCode
func (n huffmanCodes) Len() int {
return len(n)
}
func (n huffmanCodes) Less(i, j int) bool {
return n[i].code < n[j].code
}
func (n huffmanCodes) Swap(i, j int) {
n[i], n[j] = n[j], n[i]
}
// buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
// the Huffman tree at the given level. It returns the index of the newly
// constructed node.
func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err os.Error) {
test := uint32(1) << (31 - level)
// We have to search the list of codes to find the divide between the left and right sides.
firstRightIndex := len(codes)
for i, code := range codes {
if code.code&test != 0 {
firstRightIndex = i
break
}
}
left := codes[:firstRightIndex]
right := codes[firstRightIndex:]
if len(left) == 0 || len(right) == 0 {
return 0, StructuralError("superfluous level in Huffman tree")
}
nodeIndex = uint16(t.nextNode)
node := &t.nodes[t.nextNode]
t.nextNode++
if len(left) == 1 {
// leaf node
node.left = invalidNodeValue
node.leftValue = left[0].value
} else {
node.left, err = buildHuffmanNode(t, left, level+1)
}
if err != nil {
return
}
if len(right) == 1 {
// leaf node
node.right = invalidNodeValue
node.rightValue = right[0].value
} else {
node.right, err = buildHuffmanNode(t, right, level+1)
}
return
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bzip2
// moveToFrontDecoder implements a move-to-front list. Such a list is an
// efficient way to transform a string with repeating elements into one with
// many small valued numbers, which is suitable for entropy encoding. It works
// by starting with an initial list of symbols and references symbols by their
// index into that list. When a symbol is referenced, it's moved to the front
// of the list. Thus, a repeated symbol ends up being encoded with many zeros,
// as the symbol will be at the front of the list after the first access.
type moveToFrontDecoder struct {
// Rather than actually keep the list in memory, the symbols are stored
// as a circular, double linked list which the symbol indexed by head
// at the front of the list.
symbols []byte
next []uint8
prev []uint8
head uint8
}
// newMTFDecoder creates a move-to-front decoder with an explicit initial list
// of symbols.
func newMTFDecoder(symbols []byte) *moveToFrontDecoder {
m := &moveToFrontDecoder{
symbols: symbols,
next: make([]uint8, len(symbols)),
prev: make([]uint8, len(symbols)),
}
m.threadLinkedList()
return m
}
// newMTFDecoderWithRange creates a move-to-front decoder with an initial
// symbol list of 0...n-1.
func newMTFDecoderWithRange(n int) *moveToFrontDecoder {
if n > 256 {
panic("newMTFDecoderWithRange: cannot have > 256 symbols")
}
m := &moveToFrontDecoder{
symbols: make([]uint8, n),
next: make([]uint8, n),
prev: make([]uint8, n),
}
for i := 0; i < n; i++ {
m.symbols[i] = byte(i)
}
m.threadLinkedList()
return m
}
// threadLinkedList creates the initial linked-list pointers.
func (m *moveToFrontDecoder) threadLinkedList() {
if len(m.symbols) == 0 {
return
}
m.prev[0] = uint8(len(m.symbols) - 1)
for i := 0; i < len(m.symbols)-1; i++ {
m.next[i] = uint8(i + 1)
m.prev[i+1] = uint8(i)
}
m.next[len(m.symbols)-1] = 0
}
func (m *moveToFrontDecoder) Decode(n int) (b byte) {
// Most of the time, n will be zero so it's worth dealing with this
// simple case.
if n == 0 {
return m.symbols[m.head]
}
i := m.head
for j := 0; j < n; j++ {
i = m.next[i]
}
b = m.symbols[i]
m.next[m.prev[i]] = m.next[i]
m.prev[m.next[i]] = m.prev[i]
m.next[i] = m.head
m.prev[i] = m.prev[m.head]
m.next[m.prev[m.head]] = i
m.prev[m.head] = i
m.head = i
return
}
// First returns the symbol at the front of the list.
func (m *moveToFrontDecoder) First() byte {
return m.symbols[m.head]
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment