Commit 60214492 authored by Keith Randall's avatar Keith Randall

runtime: faster x86 memmove (a.k.a. built-in copy())

REP instructions have a high startup cost, so we handle small
sizes with some straightline code.  The REP MOVSx instructions
are really fast for large sizes.  The cutover is approximately
1K.  We implement up to 128/256 because that is the maximum
SSE register load (loading all data into registers before any
stores lets us ignore copy direction).

(on a Sandy Bridge E5-1650 @ 3.20GHz)
benchmark               old ns/op    new ns/op    delta
BenchmarkMemmove0               3            3   +0.86%
BenchmarkMemmove1               5            5   +5.40%
BenchmarkMemmove2              18            8  -56.84%
BenchmarkMemmove3              18            7  -58.45%
BenchmarkMemmove4              36            7  -78.63%
BenchmarkMemmove5              36            8  -77.91%
BenchmarkMemmove6              36            8  -77.76%
BenchmarkMemmove7              36            8  -77.82%
BenchmarkMemmove8              18            8  -56.33%
BenchmarkMemmove9              18            7  -58.34%
BenchmarkMemmove10             18            7  -58.34%
BenchmarkMemmove11             18            7  -58.45%
BenchmarkMemmove12             36            7  -78.51%
BenchmarkMemmove13             36            7  -78.48%
BenchmarkMemmove14             36            7  -78.56%
BenchmarkMemmove15             36            7  -78.56%
BenchmarkMemmove16             18            7  -58.24%
BenchmarkMemmove32             18            8  -54.33%
BenchmarkMemmove64             18            8  -53.37%
BenchmarkMemmove128            20            9  -55.93%
BenchmarkMemmove256            25           11  -55.16%
BenchmarkMemmove512            33           33   -1.19%
BenchmarkMemmove1024           43           44   +2.06%
BenchmarkMemmove2048           61           61   +0.16%
BenchmarkMemmove4096           95           95   +0.00%

R=golang-dev, bradfitz, remyoudompheng, khr, iant, dominik.honnef
CC=golang-dev
https://golang.org/cl/9038048
parent af1dd56d
......@@ -27,6 +27,34 @@ TEXT runtime·memmove(SB), 7, $0
MOVL to+0(FP), DI
MOVL fr+4(FP), SI
MOVL n+8(FP), BX
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSL instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 128 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
tail:
TESTL BX, BX
JEQ move_0
CMPL BX, $2
JBE move_1or2
CMPL BX, $4
JBE move_3or4
CMPL BX, $8
JBE move_5through8
CMPL BX, $16
JBE move_9through16
TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
JEQ nosse2
CMPL BX, $32
JBE move_17through32
CMPL BX, $64
JBE move_33through64
CMPL BX, $128
JBE move_65through128
// TODO: use branch table and BSR to make this just a single dispatch
nosse2:
/*
* check and set for backwards
*/
......@@ -42,11 +70,7 @@ forward:
ANDL $3, BX
REP; MOVSL
MOVL BX, CX
REP; MOVSB
MOVL to+0(FP),AX
RET
JMP tail
/*
* check overlap
*/
......@@ -75,12 +99,73 @@ back:
SUBL $4, SI
REP; MOVSL
ADDL $3, DI
ADDL $3, SI
MOVL BX, CX
REP; MOVSB
CLD
MOVL to+0(FP),AX
RET
ADDL $4, DI
ADDL $4, SI
SUBL BX, DI
SUBL BX, SI
JMP tail
move_1or2:
MOVB (SI), AX
MOVB -1(SI)(BX*1), CX
MOVB AX, (DI)
MOVB CX, -1(DI)(BX*1)
move_0:
RET
move_3or4:
MOVW (SI), AX
MOVW -2(SI)(BX*1), CX
MOVW AX, (DI)
MOVW CX, -2(DI)(BX*1)
RET
move_5through8:
MOVL (SI), AX
MOVL -4(SI)(BX*1), CX
MOVL AX, (DI)
MOVL CX, -4(DI)(BX*1)
RET
move_9through16:
MOVL (SI), AX
MOVL 4(SI), CX
MOVL -8(SI)(BX*1), DX
MOVL -4(SI)(BX*1), BP
MOVL AX, (DI)
MOVL CX, 4(DI)
MOVL DX, -8(DI)(BX*1)
MOVL BP, -4(DI)(BX*1)
RET
move_17through32:
MOVOU (SI), X0
MOVOU -16(SI)(BX*1), X1
MOVOU X0, (DI)
MOVOU X1, -16(DI)(BX*1)
RET
move_33through64:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU -32(SI)(BX*1), X2
MOVOU -16(SI)(BX*1), X3
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, -32(DI)(BX*1)
MOVOU X3, -16(DI)(BX*1)
RET
move_65through128:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU -64(SI)(BX*1), X4
MOVOU -48(SI)(BX*1), X5
MOVOU -32(SI)(BX*1), X6
MOVOU -16(SI)(BX*1), X7
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, -64(DI)(BX*1)
MOVOU X5, -48(DI)(BX*1)
MOVOU X6, -32(DI)(BX*1)
MOVOU X7, -16(DI)(BX*1)
RET
......@@ -30,6 +30,32 @@ TEXT runtime·memmove(SB), 7, $0
MOVQ fr+8(FP), SI
MOVQ n+16(FP), BX
// REP instructions have a high startup cost, so we handle small sizes
// with some straightline code. The REP MOVSQ instruction is really fast
// for large sizes. The cutover is approximately 1K. We implement up to
// 256 because that is the maximum SSE register load (loading all data
// into registers lets us ignore copy direction).
tail:
TESTQ BX, BX
JEQ move_0
CMPQ BX, $2
JBE move_1or2
CMPQ BX, $4
JBE move_3or4
CMPQ BX, $8
JBE move_5through8
CMPQ BX, $16
JBE move_9through16
CMPQ BX, $32
JBE move_17through32
CMPQ BX, $64
JBE move_33through64
CMPQ BX, $128
JBE move_65through128
CMPQ BX, $256
JBE move_129through256
// TODO: use branch table and BSR to make this just a single dispatch
/*
* check and set for backwards
*/
......@@ -45,11 +71,8 @@ forward:
ANDQ $7, BX
REP; MOVSQ
MOVQ BX, CX
REP; MOVSB
JMP tail
MOVQ to+0(FP),AX
RET
back:
/*
* check overlap
......@@ -78,12 +101,103 @@ back:
SUBQ $8, SI
REP; MOVSQ
ADDQ $7, DI
ADDQ $7, SI
MOVQ BX, CX
REP; MOVSB
CLD
MOVQ to+0(FP),AX
RET
ADDQ $8, DI
ADDQ $8, SI
SUBQ BX, DI
SUBQ BX, SI
JMP tail
move_1or2:
MOVB (SI), AX
MOVB -1(SI)(BX*1), CX
MOVB AX, (DI)
MOVB CX, -1(DI)(BX*1)
move_0:
RET
move_3or4:
MOVW (SI), AX
MOVW -2(SI)(BX*1), CX
MOVW AX, (DI)
MOVW CX, -2(DI)(BX*1)
RET
move_5through8:
MOVL (SI), AX
MOVL -4(SI)(BX*1), CX
MOVL AX, (DI)
MOVL CX, -4(DI)(BX*1)
RET
move_9through16:
MOVQ (SI), AX
MOVQ -8(SI)(BX*1), CX
MOVQ AX, (DI)
MOVQ CX, -8(DI)(BX*1)
RET
move_17through32:
MOVOU (SI), X0
MOVOU -16(SI)(BX*1), X1
MOVOU X0, (DI)
MOVOU X1, -16(DI)(BX*1)
RET
move_33through64:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU -32(SI)(BX*1), X2
MOVOU -16(SI)(BX*1), X3
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, -32(DI)(BX*1)
MOVOU X3, -16(DI)(BX*1)
RET
move_65through128:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU -64(SI)(BX*1), X4
MOVOU -48(SI)(BX*1), X5
MOVOU -32(SI)(BX*1), X6
MOVOU -16(SI)(BX*1), X7
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, -64(DI)(BX*1)
MOVOU X5, -48(DI)(BX*1)
MOVOU X6, -32(DI)(BX*1)
MOVOU X7, -16(DI)(BX*1)
RET
move_129through256:
MOVOU (SI), X0
MOVOU 16(SI), X1
MOVOU 32(SI), X2
MOVOU 48(SI), X3
MOVOU 64(SI), X4
MOVOU 80(SI), X5
MOVOU 96(SI), X6
MOVOU 112(SI), X7
MOVOU -128(SI)(BX*1), X8
MOVOU -112(SI)(BX*1), X9
MOVOU -96(SI)(BX*1), X10
MOVOU -80(SI)(BX*1), X11
MOVOU -64(SI)(BX*1), X12
MOVOU -48(SI)(BX*1), X13
MOVOU -32(SI)(BX*1), X14
MOVOU -16(SI)(BX*1), X15
MOVOU X0, (DI)
MOVOU X1, 16(DI)
MOVOU X2, 32(DI)
MOVOU X3, 48(DI)
MOVOU X4, 64(DI)
MOVOU X5, 80(DI)
MOVOU X6, 96(DI)
MOVOU X7, 112(DI)
MOVOU X8, -128(DI)(BX*1)
MOVOU X9, -112(DI)(BX*1)
MOVOU X10, -96(DI)(BX*1)
MOVOU X11, -80(DI)(BX*1)
MOVOU X12, -64(DI)(BX*1)
MOVOU X13, -48(DI)(BX*1)
MOVOU X14, -32(DI)(BX*1)
MOVOU X15, -16(DI)(BX*1)
RET
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runtime_test
import (
"testing"
)
func TestMemmove(t *testing.T) {
size := 256
if testing.Short() {
size = 128 + 16
}
src := make([]byte, size)
dst := make([]byte, size)
for i := 0; i < size; i++ {
src[i] = byte(128 + (i & 127))
}
for i := 0; i < size; i++ {
dst[i] = byte(i & 127)
}
for n := 0; n <= size; n++ {
for x := 0; x <= size-n; x++ { // offset in src
for y := 0; y <= size-n; y++ { // offset in dst
copy(dst[y:y+n], src[x:x+n])
for i := 0; i < y; i++ {
if dst[i] != byte(i&127) {
t.Fatalf("prefix dst[%d] = %d", i, dst[i])
}
}
for i := y; i < y+n; i++ {
if dst[i] != byte(128+((i-y+x)&127)) {
t.Fatalf("copied dst[%d] = %d", i, dst[i])
}
dst[i] = byte(i & 127) // reset dst
}
for i := y + n; i < size; i++ {
if dst[i] != byte(i&127) {
t.Fatalf("suffix dst[%d] = %d", i, dst[i])
}
}
}
}
}
}
func TestMemmoveAlias(t *testing.T) {
size := 256
if testing.Short() {
size = 128 + 16
}
buf := make([]byte, size)
for i := 0; i < size; i++ {
buf[i] = byte(i)
}
for n := 0; n <= size; n++ {
for x := 0; x <= size-n; x++ { // src offset
for y := 0; y <= size-n; y++ { // dst offset
copy(buf[y:y+n], buf[x:x+n])
for i := 0; i < y; i++ {
if buf[i] != byte(i) {
t.Fatalf("prefix buf[%d] = %d", i, buf[i])
}
}
for i := y; i < y+n; i++ {
if buf[i] != byte(i-y+x) {
t.Fatalf("copied buf[%d] = %d", i, buf[i])
}
buf[i] = byte(i) // reset buf
}
for i := y + n; i < size; i++ {
if buf[i] != byte(i) {
t.Fatalf("suffix buf[%d] = %d", i, buf[i])
}
}
}
}
}
}
func bmMemmove(n int, b *testing.B) {
x := make([]byte, n)
y := make([]byte, n)
b.SetBytes(int64(n))
for i := 0; i < b.N; i++ {
copy(x, y)
}
}
func BenchmarkMemmove0(b *testing.B) { bmMemmove(0, b) }
func BenchmarkMemmove1(b *testing.B) { bmMemmove(1, b) }
func BenchmarkMemmove2(b *testing.B) { bmMemmove(2, b) }
func BenchmarkMemmove3(b *testing.B) { bmMemmove(3, b) }
func BenchmarkMemmove4(b *testing.B) { bmMemmove(4, b) }
func BenchmarkMemmove5(b *testing.B) { bmMemmove(5, b) }
func BenchmarkMemmove6(b *testing.B) { bmMemmove(6, b) }
func BenchmarkMemmove7(b *testing.B) { bmMemmove(7, b) }
func BenchmarkMemmove8(b *testing.B) { bmMemmove(8, b) }
func BenchmarkMemmove9(b *testing.B) { bmMemmove(9, b) }
func BenchmarkMemmove10(b *testing.B) { bmMemmove(10, b) }
func BenchmarkMemmove11(b *testing.B) { bmMemmove(11, b) }
func BenchmarkMemmove12(b *testing.B) { bmMemmove(12, b) }
func BenchmarkMemmove13(b *testing.B) { bmMemmove(13, b) }
func BenchmarkMemmove14(b *testing.B) { bmMemmove(14, b) }
func BenchmarkMemmove15(b *testing.B) { bmMemmove(15, b) }
func BenchmarkMemmove16(b *testing.B) { bmMemmove(16, b) }
func BenchmarkMemmove32(b *testing.B) { bmMemmove(32, b) }
func BenchmarkMemmove64(b *testing.B) { bmMemmove(64, b) }
func BenchmarkMemmove128(b *testing.B) { bmMemmove(128, b) }
func BenchmarkMemmove256(b *testing.B) { bmMemmove(256, b) }
func BenchmarkMemmove512(b *testing.B) { bmMemmove(512, b) }
func BenchmarkMemmove1024(b *testing.B) { bmMemmove(1024, b) }
func BenchmarkMemmove2048(b *testing.B) { bmMemmove(2048, b) }
func BenchmarkMemmove4096(b *testing.B) { bmMemmove(4096, b) }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment