Commit 4d10aba3 authored by Josh Bleecher Snyder's avatar Josh Bleecher Snyder

math/big: add fast path for amd64 addVW for large z

This matches the pure Go fast path added in the previous commit.

I will leave other architectures to those with ready access to hardware.

name            old time/op    new time/op    delta
AddVW/1-8         3.60ns ± 3%    3.59ns ± 1%      ~     (p=0.147 n=91+86)
AddVW/2-8         3.92ns ± 1%    3.91ns ± 2%    -0.36%  (p=0.000 n=86+92)
AddVW/3-8         4.33ns ± 5%    4.46ns ± 5%    +2.94%  (p=0.000 n=96+97)
AddVW/4-8         4.76ns ± 5%    4.82ns ± 5%    +1.28%  (p=0.000 n=95+92)
AddVW/5-8         5.40ns ± 1%    5.42ns ± 0%    +0.47%  (p=0.000 n=76+71)
AddVW/10-8        8.03ns ± 1%    7.80ns ± 5%    -2.90%  (p=0.000 n=73+96)
AddVW/100-8       43.8ns ± 5%    17.9ns ± 1%   -59.12%  (p=0.000 n=94+81)
AddVW/1000-8       428ns ± 4%      85ns ± 6%   -80.20%  (p=0.000 n=96+99)
AddVW/10000-8     4.22µs ± 2%    1.80µs ± 3%   -57.32%  (p=0.000 n=69+92)
AddVW/100000-8    44.8µs ± 8%    31.5µs ± 3%   -29.76%  (p=0.000 n=99+90)

name            old time/op    new time/op    delta
SubVW/1-8         3.53ns ± 2%    3.63ns ± 5%    +2.97%  (p=0.000 n=94+93)
SubVW/2-8         4.33ns ± 5%    4.01ns ± 2%    -7.36%  (p=0.000 n=90+85)
SubVW/3-8         4.32ns ± 2%    4.32ns ± 5%      ~     (p=0.084 n=87+97)
SubVW/4-8         4.70ns ± 2%    4.83ns ± 6%    +2.77%  (p=0.000 n=85+96)
SubVW/5-8         5.84ns ± 1%    5.35ns ± 1%    -8.35%  (p=0.000 n=87+87)
SubVW/10-8        8.01ns ± 4%    7.54ns ± 4%    -5.84%  (p=0.000 n=98+97)
SubVW/100-8       43.9ns ± 5%    17.9ns ± 1%   -59.20%  (p=0.000 n=98+76)
SubVW/1000-8       426ns ± 2%      85ns ± 3%   -80.13%  (p=0.000 n=90+98)
SubVW/10000-8     4.24µs ± 2%    1.81µs ± 3%   -57.28%  (p=0.000 n=74+91)
SubVW/100000-8    44.5µs ± 4%    31.5µs ± 2%   -29.33%  (p=0.000 n=84+91)

Change-Id: I10dd361cbaca22197c27e7734c0f50065292afbb
Reviewed-on: https://go-review.googlesource.com/c/go/+/164969
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarRobert Griesemer <gri@golang.org>
parent fe24837c
...@@ -143,6 +143,8 @@ E2: NEGQ CX ...@@ -143,6 +143,8 @@ E2: NEGQ CX
// func addVW(z, x []Word, y Word) (c Word) // func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),NOSPLIT,$0 TEXT ·addVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI MOVQ z_len+8(FP), DI
CMPQ DI, $32
JG large
MOVQ x+24(FP), R8 MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10 MOVQ z+0(FP), R10
...@@ -189,12 +191,16 @@ L3: // n > 0 ...@@ -189,12 +191,16 @@ L3: // n > 0
E3: MOVQ CX, c+56(FP) // return c E3: MOVQ CX, c+56(FP) // return c
RET RET
large:
JMP ·addVWlarge(SB)
// func subVW(z, x []Word, y Word) (c Word) // func subVW(z, x []Word, y Word) (c Word)
// (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names) // (same as addVW except for SUBQ/SBBQ instead of ADDQ/ADCQ and label names)
TEXT ·subVW(SB),NOSPLIT,$0 TEXT ·subVW(SB),NOSPLIT,$0
MOVQ z_len+8(FP), DI MOVQ z_len+8(FP), DI
CMPQ DI, $32
JG large
MOVQ x+24(FP), R8 MOVQ x+24(FP), R8
MOVQ y+48(FP), CX // c = y MOVQ y+48(FP), CX // c = y
MOVQ z+0(FP), R10 MOVQ z+0(FP), R10
...@@ -242,6 +248,8 @@ L4: // n > 0 ...@@ -242,6 +248,8 @@ L4: // n > 0
E4: MOVQ CX, c+56(FP) // return c E4: MOVQ CX, c+56(FP) // return c
RET RET
large:
JMP ·subVWlarge(SB)
// func shlVU(z, x []Word, s uint) (c Word) // func shlVU(z, x []Word, s uint) (c Word)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment