Commit 1e076035 authored by Davies Liu's avatar Davies Liu Committed by Russ Cox

hash/crc32: speedup crc32 of IEEE using slicingBy8

The Slicing-By-8 [1] algorithm has much performance improvements than
current approach. This patch only uses it for IEEE, which is the most
common case in practice.

There is the benchmark on Mac OS X 10.9:

benchmark                     old MB/s     new MB/s     speedup
BenchmarkIEEECrc1KB           349.40       353.03       1.01x
BenchmarkIEEECrc4KB           351.55       934.35       2.66x
BenchmarkCastagnoliCrc1KB     7037.58      7392.63      1.05x

This algorithm need 8K lookup table, so it's enabled only for block
larger than 4K.

We can see about 2.6x improvement for IEEE.

Change-Id: I7f786d20f0949245e4aa101d7921669f496ed0f7
Reviewed-on: https://go-review.googlesource.com/1863Reviewed-by: default avatarRuss Cox <rsc@golang.org>
parent d1e7980d
......@@ -54,6 +54,13 @@ func castagnoliInit() {
// IEEETable is the table for the IEEE polynomial.
var IEEETable = makeTable(IEEE)
// slicing8Table is array of 8 Tables
type slicing8Table [8]Table
// iEEETable8 is the slicing8Table for IEEE
var iEEETable8 *slicing8Table
var iEEETable8Once sync.Once
// MakeTable returns the Table constructed from the specified polynomial.
func MakeTable(poly uint32) *Table {
switch poly {
......@@ -83,6 +90,20 @@ func makeTable(poly uint32) *Table {
return t
}
// makeTable8 returns slicing8Table constructed from the specified polynomial.
func makeTable8(poly uint32) *slicing8Table {
t := new(slicing8Table)
t[0] = *makeTable(poly)
for i := 0; i < 256; i++ {
crc := t[0][i]
for j := 1; j < 8; j++ {
crc = t[0][crc&0xFF] ^ (crc >> 8)
t[j][i] = crc
}
}
return t
}
// digest represents the partial evaluation of a checksum.
type digest struct {
crc uint32
......@@ -111,11 +132,32 @@ func update(crc uint32, tab *Table, p []byte) uint32 {
return ^crc
}
// updateSlicingBy8 updates CRC using Slicing-by-8
func updateSlicingBy8(crc uint32, tab *slicing8Table, p []byte) uint32 {
crc = ^crc
for len(p) > 8 {
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
p = p[8:]
}
crc = ^crc
return update(crc, &tab[0], p)
}
// Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 {
if tab == castagnoliTable {
return updateCastagnoli(crc, p)
}
// only use slicing-by-8 when input is larger than 4KB
if tab == IEEETable && len(p) >= 4096 {
iEEETable8Once.Do(func() {
iEEETable8 = makeTable8(IEEE)
})
return updateSlicingBy8(crc, iEEETable8, p)
}
return update(crc, tab, p)
}
......@@ -137,4 +179,4 @@ func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
// ChecksumIEEE returns the CRC-32 checksum of data
// using the IEEE polynomial.
func ChecksumIEEE(data []byte) uint32 { return update(0, IEEETable, data) }
func ChecksumIEEE(data []byte) uint32 { return Update(0, IEEETable, data) }
......@@ -81,7 +81,7 @@ func TestGolden(t *testing.T) {
}
}
func BenchmarkCrc32KB(b *testing.B) {
func BenchmarkIEEECrc1KB(b *testing.B) {
b.SetBytes(1024)
data := make([]byte, 1024)
for i := range data {
......@@ -97,3 +97,37 @@ func BenchmarkCrc32KB(b *testing.B) {
h.Sum(in)
}
}
func BenchmarkIEEECrc4KB(b *testing.B) {
b.SetBytes(4096)
data := make([]byte, 4096)
for i := range data {
data[i] = byte(i)
}
h := NewIEEE()
in := make([]byte, 0, h.Size())
b.ResetTimer()
for i := 0; i < b.N; i++ {
h.Reset()
h.Write(data)
h.Sum(in)
}
}
func BenchmarkCastagnoliCrc1KB(b *testing.B) {
b.SetBytes(1024)
data := make([]byte, 1024)
for i := range data {
data[i] = byte(i)
}
h := New(MakeTable(Castagnoli))
in := make([]byte, 0, h.Size())
b.ResetTimer()
for i := 0; i < b.N; i++ {
h.Reset()
h.Write(data)
h.Sum(in)
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment