Commit 9f4c288c authored by Adam Langley

hash/crc32: add SSE4.2 support

Using the CRC32 instruction speeds up the Castagnoli computation by
about 20x on a modern Intel CPU.

R=rsc
CC=golang-dev
https://golang.org/cl/4650072
parent 0f8678a7
...@@ -169,6 +169,13 @@ assemble(char *file) ...@@ -169,6 +169,13 @@ assemble(char *file)
struct struct
{ {
char *name; char *name;
/*
* type is the lexical type to return. It dictates what kind of
* operands 6a allows to follow it (in a.y) as the possible operand
* types are handled by a grammar. How do you know which LTYPE?
* Either read a.y or think of an instruction that has the same
* possible operands and look up what it takes.
*/
ushort type; ushort type;
ushort value; ushort value;
} itab[] = } itab[] =
...@@ -985,6 +992,8 @@ struct ...@@ -985,6 +992,8 @@ struct
"UNPCKLPS", LTYPE3, AUNPCKLPS, "UNPCKLPS", LTYPE3, AUNPCKLPS,
"XORPD", LTYPE3, AXORPD, "XORPD", LTYPE3, AXORPD,
"XORPS", LTYPE3, AXORPS, "XORPS", LTYPE3, AXORPS,
"CRC32B", LTYPE4, ACRC32B,
"CRC32Q", LTYPE4, ACRC32Q,
0 0
}; };
......
...@@ -730,6 +730,8 @@ enum as ...@@ -730,6 +730,8 @@ enum as
ASWAPGS, ASWAPGS,
AMODE, AMODE,
ACRC32B,
ACRC32Q,
ALAST ALAST
}; };
......
...@@ -222,6 +222,7 @@ enum ...@@ -222,6 +222,7 @@ enum
Zxxx = 0, Zxxx = 0,
Zlit, Zlit,
Zlitm_r,
Z_rp, Z_rp,
Zbr, Zbr,
Zcall, Zcall,
......
...@@ -529,7 +529,69 @@ uchar ymskb[] = ...@@ -529,7 +529,69 @@ uchar ymskb[] =
Ymr, Yrl, Zm_r_xm, 1, Ymr, Yrl, Zm_r_xm, 1,
0 0
}; };
/*
 * ytab for the CRC32 instructions (ACRC32B, ACRC32Q).  Its single line
 * matches a Yml "from" operand with a Yrl "to" operand and selects the
 * Zlitm_r encoding; the 4th entry is 0, so z is not advanced past it.
 *
 * The trailing 0 terminates the table in the same way as the other ytabs
 * in this file (see ymskb and the yaddl example), so that a scan which
 * finds no matching line stops at the end of the array instead of reading
 * beyond it.
 */
uchar ycrc32l[] =
{
	Yml,	Yrl,	Zlitm_r,	0,
	0
};
/*
* You are doasm, holding in your hand a Prog* with p->as set to, say, ACRC32,
* and p->from and p->to as operands (Adr*). The linker scans optab to find
* the entry with the given p->as and then looks through the ytable for that
* instruction (the second field in the optab struct) for a line whose first
* two values match the Ytypes of the p->from and p->to operands. The function
* oclass in span.c computes the specific Ytype of an operand and then the set
* of more general Ytypes that it satisfies is implied by the ycover table, set
* up in instinit. For example, oclass distinguishes the constants 0 and 1
* from the more general 8-bit constants, but instinit says
*
* ycover[Yi0*Ymax + Ys32] = 1;
* ycover[Yi1*Ymax + Ys32] = 1;
* ycover[Yi8*Ymax + Ys32] = 1;
*
* which means that Yi0, Yi1, and Yi8 all count as Ys32 (signed 32)
* if that's what an instruction can handle.
*
* In parallel with the scan through the ytable for the appropriate line, there
* is a z pointer that starts out pointing at the strange magic byte list in
* the Optab struct. With each step past a non-matching ytable line, z
* advances by the 4th entry in the line. When a matching line is found, that
* z pointer has the extra data to use in laying down the instruction bytes.
* The actual bytes laid down are a function of the 3rd entry in the line (that
* is, the Ztype) and the z bytes.
*
* For example, let's look at AADDL. The optab line says:
* { AADDL, yaddl, Px, 0x83,(00),0x05,0x81,(00),0x01,0x03 },
*
* and yaddl says
* uchar yaddl[] =
* {
* Yi8, Yml, Zibo_m, 2,
* Yi32, Yax, Zil_, 1,
* Yi32, Yml, Zilo_m, 2,
* Yrl, Yml, Zr_m, 1,
* Yml, Yrl, Zm_r, 1,
* 0
* };
*
* so there are 5 possible types of ADDL instruction that can be laid down, and
* possible states used to lay them down (Ztype and z pointer, assuming z
* points at {0x83,(00),0x05,0x81,(00),0x01,0x03}) are:
*
* Yi8, Yml -> Zibo_m, z (0x83, 00)
* Yi32, Yax -> Zil_, z+2 (0x05)
* Yi32, Yml -> Zilo_m, z+2+1 (0x81, 0x00)
* Yrl, Yml -> Zr_m, z+2+1+2 (0x01)
* Yml, Yrl -> Zm_r, z+2+1+2+1 (0x03)
*
* The Pconstant in the optab line controls the prefix bytes to emit. That's
* relatively straightforward as this program goes.
*
* The switch on t[2] in doasm implements the various Z cases. Zibo_m, for
* example, is an opcode byte (z[0]) then an asmando (which is some kind of
* encoded addressing mode for the Yml arg), and then a single immediate byte.
* Zilo_m is the same but a long (32-bit) immediate.
*/
Optab optab[] = Optab optab[] =
/* as, ytab, andproto, opcode */ /* as, ytab, andproto, opcode */
{ {
...@@ -1199,6 +1261,9 @@ Optab optab[] = ...@@ -1199,6 +1261,9 @@ Optab optab[] =
{ AXADDQ, yrl_ml, Pw, 0x0f,0xc1 }, { AXADDQ, yrl_ml, Pw, 0x0f,0xc1 },
{ AXADDW, yrl_ml, Pe, 0x0f,0xc1 }, { AXADDW, yrl_ml, Pe, 0x0f,0xc1 },
{ ACRC32B, ycrc32l,Px, 0xf2,0x0f,0x38,0xf0,0},
{ ACRC32Q, ycrc32l,Pw, 0xf2,0x0f,0x38,0xf1,0},
{ AEND }, { AEND },
0 0
}; };
......
...@@ -1166,6 +1166,12 @@ found: ...@@ -1166,6 +1166,12 @@ found:
*andptr++ = op; *andptr++ = op;
break; break;
case Zlitm_r:
/*
 * Literal opcode bytes followed by the operands: copy bytes from
 * o->op, starting at index z, into the output until a zero byte is
 * reached (the optab entries using this Ztype end their opcode list
 * with an explicit 0), then hand the from/to operands to asmand
 * (presumably to emit the addressing-mode bytes -- confirm in asmand).
 */
for(; op = o->op[z]; z++)
*andptr++ = op;
asmand(&p->from, &p->to);
break;
case Zmb_r: case Zmb_r:
bytereg(&p->from, &p->ft); bytereg(&p->from, &p->ft);
/* fall through */ /* fall through */
......
...@@ -5,7 +5,16 @@ ...@@ -5,7 +5,16 @@
include ../../../Make.inc include ../../../Make.inc
TARG=hash/crc32 TARG=hash/crc32
ifeq ($(GOARCH), amd64)
ARCH_GOFILES=crc32_amd64.go
OFILES=crc32_amd64.6
else
ARCH_GOFILES=crc32_generic.go
endif
GOFILES=\ GOFILES=\
crc32.go\ crc32.go\
$(ARCH_GOFILES)
include ../../../Make.pkg include ../../../Make.pkg
...@@ -10,6 +10,7 @@ package crc32 ...@@ -10,6 +10,7 @@ package crc32
import ( import (
"hash" "hash"
"os" "os"
"sync"
) )
// The size of a CRC-32 checksum in bytes. // The size of a CRC-32 checksum in bytes.
...@@ -35,8 +36,34 @@ const ( ...@@ -35,8 +36,34 @@ const (
// Table is a 256-word table representing the polynomial for efficient processing. // Table is a 256-word table representing the polynomial for efficient processing.
type Table [256]uint32 type Table [256]uint32
// castagnoliTable points to a lazily initialized Table for the Castagnoli
// polynomial. MakeTable will always return this value when asked to make a
// Castagnoli table so we can compare against it to find when the caller is
// using this polynomial.
var castagnoliTable *Table
var castagnoliOnce sync.Once
// castagnoliInit builds the table for the Castagnoli polynomial and stores
// it in castagnoliTable. MakeTable runs it through castagnoliOnce.
func castagnoliInit() {
	tab := makeTable(Castagnoli)
	castagnoliTable = tab
}
// IEEETable is the table for the IEEE polynomial.
var IEEETable = makeTable(IEEE)
// MakeTable returns the Table constructed from the specified polynomial. // MakeTable returns the Table constructed from the specified polynomial.
func MakeTable(poly uint32) *Table { func MakeTable(poly uint32) *Table {
switch poly {
case IEEE:
return IEEETable
case Castagnoli:
castagnoliOnce.Do(castagnoliInit)
return castagnoliTable
}
return makeTable(poly)
}
// makeTable returns the Table constructed from the specified polynomial.
func makeTable(poly uint32) *Table {
t := new(Table) t := new(Table)
for i := 0; i < 256; i++ { for i := 0; i < 256; i++ {
crc := uint32(i) crc := uint32(i)
...@@ -52,9 +79,6 @@ func MakeTable(poly uint32) *Table { ...@@ -52,9 +79,6 @@ func MakeTable(poly uint32) *Table {
return t return t
} }
// IEEETable is the table for the IEEE polynomial.
var IEEETable = MakeTable(IEEE)
// digest represents the partial evaluation of a checksum. // digest represents the partial evaluation of a checksum.
type digest struct { type digest struct {
crc uint32 crc uint32
...@@ -83,11 +107,14 @@ func update(crc uint32, tab *Table, p []byte) uint32 { ...@@ -83,11 +107,14 @@ func update(crc uint32, tab *Table, p []byte) uint32 {
// Update returns the result of adding the bytes in p to the crc. // Update returns the result of adding the bytes in p to the crc.
func Update(crc uint32, tab *Table, p []byte) uint32 { func Update(crc uint32, tab *Table, p []byte) uint32 {
if tab == castagnoliTable {
return updateCastagnoli(crc, p)
}
return update(crc, tab, p) return update(crc, tab, p)
} }
func (d *digest) Write(p []byte) (n int, err os.Error) { func (d *digest) Write(p []byte) (n int, err os.Error) {
d.crc = update(d.crc, d.tab, p) d.crc = Update(d.crc, d.tab, p)
return len(p), nil return len(p), nil
} }
...@@ -105,7 +132,7 @@ func (d *digest) Sum() []byte { ...@@ -105,7 +132,7 @@ func (d *digest) Sum() []byte {
// Checksum returns the CRC-32 checksum of data // Checksum returns the CRC-32 checksum of data
// using the polynomial represented by the Table. // using the polynomial represented by the Table.
func Checksum(data []byte, tab *Table) uint32 { return update(0, tab, data) } func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
// ChecksumIEEE returns the CRC-32 checksum of data // ChecksumIEEE returns the CRC-32 checksum of data
// using the IEEE polynomial. // using the IEEE polynomial.
......
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
// This file contains the code to call the SSE 4.2 version of the Castagnoli
// CRC.
// haveSSE42 is defined in crc32_amd64.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
// instruction. It takes the current crc and the bytes to add, and returns
// the updated crc.
func castagnoliSSE42(uint32, []byte) uint32
// sse42 holds the result of haveSSE42, evaluated once when the package's
// variables are initialized. updateCastagnoli consults it on every call.
var sse42 = haveSSE42()
// updateCastagnoli returns the result of adding the bytes in p to crc for
// the Castagnoli polynomial. When sse42 is set the work is handed to
// castagnoliSSE42; otherwise it falls back to the table-driven update using
// castagnoliTable.
func updateCastagnoli(crc uint32, p []byte) uint32 {
	if !sse42 {
		return update(crc, castagnoliTable, p)
	}
	return castagnoliSSE42(crc, p)
}
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// func castagnoliSSE42(crc uint32, p []byte) uint32
//
// castagnoliSSE42 folds the bytes of p into crc with the CRC32B and CRC32Q
// instructions and stores the updated value in the result slot.
//
// Register use throughout the routine:
//   AX  running CRC value
//   SI  pointer to the next unread byte of p
//   CX  number of bytes of p still to be processed
//   BX  scratch, used only for the alignment test
TEXT ·castagnoliSSE42(SB),7,$0
MOVL crc+0(FP), AX // CRC value
MOVQ p+8(FP), SI // data pointer
MOVL p+16(FP), CX // len(p)
// The value is inverted on entry and inverted again at done, so the loops
// below operate on the complemented CRC.
NOTL AX
/* If there are fewer than 8 bytes to process, we do it byte-by-byte. */
CMPL CX, $8
JL cleanup
/* Process individual bytes until the input is 8-byte aligned. */
// This loop is only entered with CX >= 8 and consumes at most 7 bytes (it
// stops as soon as the low three bits of SI are zero), so CX cannot drop
// below 1 here.
startup:
MOVQ SI, BX
ANDQ $7, BX // BX = SI mod 8; zero means SI is 8-byte aligned
JZ aligned
CRC32B (SI), AX
DECL CX
INCQ SI
JMP startup
aligned:
/* The input is now 8-byte aligned and we can process 8-byte chunks. */
CMPL CX, $8
JL cleanup // fewer than 8 bytes left: finish one byte at a time
CRC32Q (SI), AX
ADDQ $8, SI
SUBQ $8, CX
JMP aligned
cleanup:
/* We may have some bytes left over that we process one at a time. */
CMPL CX, $0
JE done
CRC32B (SI), AX
INCQ SI
DECQ CX
JMP cleanup
done:
NOTL AX // undo the inversion applied on entry
MOVL AX, ret+24(FP)
RET
// func haveSSE42() bool
//
// haveSSE42 executes CPUID with AX set to 1 and returns bit 20 of the value
// left in CX as a bool. Per the Intel manuals, that bit of CPUID leaf 1 is
// the SSE4.2 feature flag.
TEXT ·haveSSE42(SB),7,$0
XORQ AX, AX
INCL AX // AX = 1 selects the CPUID leaf to query
CPUID
SHRQ $20, CX // move bit 20 down to bit 0
ANDQ $1, CX // keep only that bit, leaving 0 or 1
MOVB CX, ret+0(FP)
RET
// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package crc32
// This file contains the generic version of updateCastagnoli, which just
// calls the software implementation.
func updateCastagnoli(crc uint32, p []byte) uint32 {
return update(crc, castagnoliTable, p)
}
...@@ -10,53 +10,73 @@ import ( ...@@ -10,53 +10,73 @@ import (
) )
type test struct { type test struct {
out uint32 ieee, castagnoli uint32
in string in string
} }
var golden = []test{ var golden = []test{
{0x0, ""}, {0x0, 0x0, ""},
{0xe8b7be43, "a"}, {0xe8b7be43, 0xc1d04330, "a"},
{0x9e83486d, "ab"}, {0x9e83486d, 0xe2a22936, "ab"},
{0x352441c2, "abc"}, {0x352441c2, 0x364b3fb7, "abc"},
{0xed82cd11, "abcd"}, {0xed82cd11, 0x92c80a31, "abcd"},
{0x8587d865, "abcde"}, {0x8587d865, 0xc450d697, "abcde"},
{0x4b8e39ef, "abcdef"}, {0x4b8e39ef, 0x53bceff1, "abcdef"},
{0x312a6aa6, "abcdefg"}, {0x312a6aa6, 0xe627f441, "abcdefg"},
{0xaeef2a50, "abcdefgh"}, {0xaeef2a50, 0xa9421b7, "abcdefgh"},
{0x8da988af, "abcdefghi"}, {0x8da988af, 0x2ddc99fc, "abcdefghi"},
{0x3981703a, "abcdefghij"}, {0x3981703a, 0xe6599437, "abcdefghij"},
{0x6b9cdfe7, "Discard medicine more than two years old."}, {0x6b9cdfe7, 0xb2cc01fe, "Discard medicine more than two years old."},
{0xc90ef73f, "He who has a shady past knows that nice guys finish last."}, {0xc90ef73f, 0xe28207f, "He who has a shady past knows that nice guys finish last."},
{0xb902341f, "I wouldn't marry him with a ten foot pole."}, {0xb902341f, 0xbe93f964, "I wouldn't marry him with a ten foot pole."},
{0x42080e8, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"}, {0x42080e8, 0x9e3be0c3, "Free! Free!/A trip/to Mars/for 900/empty jars/Burma Shave"},
{0x154c6d11, "The days of the digital watch are numbered. -Tom Stoppard"}, {0x154c6d11, 0xf505ef04, "The days of the digital watch are numbered. -Tom Stoppard"},
{0x4c418325, "Nepal premier won't resign."}, {0x4c418325, 0x85d3dc82, "Nepal premier won't resign."},
{0x33955150, "For every action there is an equal and opposite government program."}, {0x33955150, 0xc5142380, "For every action there is an equal and opposite government program."},
{0x26216a4b, "His money is twice tainted: 'taint yours and 'taint mine."}, {0x26216a4b, 0x75eb77dd, "His money is twice tainted: 'taint yours and 'taint mine."},
{0x1abbe45e, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"}, {0x1abbe45e, 0x91ebe9f7, "There is no reason for any individual to have a computer in their home. -Ken Olsen, 1977"},
{0xc89a94f7, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"}, {0xc89a94f7, 0xf0b1168e, "It's a tiny change to the code and not completely disgusting. - Bob Manchek"},
{0xab3abe14, "size: a.out: bad magic"}, {0xab3abe14, 0x572b74e2, "size: a.out: bad magic"},
{0xbab102b6, "The major problem is with sendmail. -Mark Horton"}, {0xbab102b6, 0x8a58a6d5, "The major problem is with sendmail. -Mark Horton"},
{0x999149d7, "Give me a rock, paper and scissors and I will move the world. CCFestoon"}, {0x999149d7, 0x9c426c50, "Give me a rock, paper and scissors and I will move the world. CCFestoon"},
{0x6d52a33c, "If the enemy is within range, then so are you."}, {0x6d52a33c, 0x735400a4, "If the enemy is within range, then so are you."},
{0x90631e8d, "It's well we cannot hear the screams/That we create in others' dreams."}, {0x90631e8d, 0xbec49c95, "It's well we cannot hear the screams/That we create in others' dreams."},
{0x78309130, "You remind me of a TV show, but that's all right: I watch it anyway."}, {0x78309130, 0xa95a2079, "You remind me of a TV show, but that's all right: I watch it anyway."},
{0x7d0a377f, "C is as portable as Stonehedge!!"}, {0x7d0a377f, 0xde2e65c5, "C is as portable as Stonehedge!!"},
{0x8c79fd79, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"}, {0x8c79fd79, 0x297a88ed, "Even if I could be Shakespeare, I think I should still choose to be Faraday. - A. Huxley"},
{0xa20b7167, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"}, {0xa20b7167, 0x66ed1d8b, "The fugacity of a constituent in a mixture of gases at a given temperature is proportional to its mole fraction. Lewis-Randall Rule"},
{0x8e0bb443, "How can you write a big system without C++? -Paul Glick"}, {0x8e0bb443, 0xdcded527, "How can you write a big system without C++? -Paul Glick"},
} }
func TestGolden(t *testing.T) { func TestGolden(t *testing.T) {
for i := 0; i < len(golden); i++ { castagnoliTab := MakeTable(Castagnoli)
g := golden[i]
c := NewIEEE() for _, g := range golden {
io.WriteString(c, g.in) ieee := NewIEEE()
s := c.Sum32() io.WriteString(ieee, g.in)
if s != g.out { s := ieee.Sum32()
t.Errorf("crc32(%s) = 0x%x want 0x%x", g.in, s, g.out) if s != g.ieee {
t.FailNow() t.Errorf("IEEE(%s) = 0x%x want 0x%x", g.in, s, g.ieee)
}
castagnoli := New(castagnoliTab)
io.WriteString(castagnoli, g.in)
s = castagnoli.Sum32()
if s != g.castagnoli {
t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
}
if len(g.in) > 0 {
// The SSE4.2 implementation of this has code to deal
// with misaligned data so we ensure that we test that
// too.
castagnoli = New(castagnoliTab)
io.WriteString(castagnoli, g.in[:1])
io.WriteString(castagnoli, g.in[1:])
s = castagnoli.Sum32()
if s != g.castagnoli {
t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli)
}
} }
} }
} }
...@@ -69,6 +89,7 @@ func BenchmarkCrc32KB(b *testing.B) { ...@@ -69,6 +89,7 @@ func BenchmarkCrc32KB(b *testing.B) {
} }
c := NewIEEE() c := NewIEEE()
b.StartTimer() b.StartTimer()
b.SetBytes(int64(len(data)))
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {
c.Write(data) c.Write(data)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment