Commit b6a15683 authored by Bill O'Farrell, committed by Michael Munday

math: use SIMD to accelerate some scalar math functions on s390x

Note, most math functions are structured to use stubs, so that they can
be accelerated with assembly on any platform.
Sinh, cosh, and tanh were not structured with stubs, so this CL does
that. This set of routines was chosen as likely to produce good speedups
with assembly on any platform.

Technique used was minimax polynomial approximation using tables of
polynomial coefficients, with argument range reduction.
A table of scaling factors was also used for cosh and log10.

                     before       after      speedup
BenchmarkCos         22.1 ns/op   6.79 ns/op  3.25x
BenchmarkCosh       125   ns/op  11.7  ns/op 10.68x
BenchmarkLog10       48.4 ns/op  12.5  ns/op  3.87x
BenchmarkSin         22.2 ns/op   6.55 ns/op  3.39x
BenchmarkSinh       125   ns/op  14.2  ns/op  8.80x
BenchmarkTanh        65.0 ns/op  15.1  ns/op  4.30x

Accuracy was tested against a high precision
reference function to determine maximum error.
Approximately 4,000,000 points were tested for each function,
producing the following result.
Note: ulperr is error in "units in the last place"

       max
      ulperr
sin    1.43 (returns NaN beyond +-2^50)
cos    1.79 (returns NaN beyond +-2^50)
cosh   1.05
sinh   3.02
tanh   3.69
log10  1.75

Also includes a set of tests to test non-vector functions even
when SIMD is enabled

Change-Id: Icb45f14d00864ee19ed973d209c3af21e4df4edc
Reviewed-on: https://go-review.googlesource.com/32352
Run-TryBot: Michael Munday <munday@ca.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
parent 9f9d8340
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package math
// The functions below are declared without bodies, so their implementations
// live outside this Go file.
//
// NOTE(review): each xTrampolineSetup presumably selects between xAsm and the
// pure-Go fallback depending on hasVX; its body is not visible here — confirm
// against the assembly that defines it.
func log10TrampolineSetup(x float64) float64
func log10Asm(x float64) float64
func cosTrampolineSetup(x float64) float64
func cosAsm(x float64) float64
func coshTrampolineSetup(x float64) float64
func coshAsm(x float64) float64
func sinTrampolineSetup(x float64) float64
func sinAsm(x float64) float64
func sinhTrampolineSetup(x float64) float64
func sinhAsm(x float64) float64
func tanhTrampolineSetup(x float64) float64
func tanhAsm(x float64) float64
// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool
// hasVX holds the result of hasVectorFacility, evaluated once when the
// package-level variables are initialized.
var hasVX = hasVectorFacility()
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Tests whether the non vector routines are working, even when the tests are run on a
// vector-capable machine.
package math_test
import (
. "math"
"testing"
)
// TestCosNovec runs CosNoVec over the general inputs in vf and the
// special-case inputs in vfcosSC, comparing against the cos and cosSC
// reference tables. It is skipped when HasVX is false.
func TestCosNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, x := range vf {
		got, want := CosNoVec(x), cos[i]
		if veryclose(want, got) {
			continue
		}
		t.Errorf("Cos(%g) = %g, want %g", x, got, want)
	}
	for i, x := range vfcosSC {
		got, want := CosNoVec(x), cosSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Cos(%g) = %g, want %g", x, got, want)
	}
}
// TestCoshNovec runs CoshNoVec over the general inputs in vf and the
// special-case inputs in vfcoshSC, comparing against the cosh and coshSC
// reference tables. It is skipped when HasVX is false.
func TestCoshNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, x := range vf {
		got, want := CoshNoVec(x), cosh[i]
		if close(want, got) {
			continue
		}
		t.Errorf("Cosh(%g) = %g, want %g", x, got, want)
	}
	for i, x := range vfcoshSC {
		got, want := CoshNoVec(x), coshSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Cosh(%g) = %g, want %g", x, got, want)
	}
}
// TestSinNovec runs SinNoVec over the general inputs in vf and the
// special-case inputs in vfsinSC, comparing against the sin and sinSC
// reference tables. It is skipped when HasVX is false.
func TestSinNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, x := range vf {
		got, want := SinNoVec(x), sin[i]
		if veryclose(want, got) {
			continue
		}
		t.Errorf("Sin(%g) = %g, want %g", x, got, want)
	}
	for i, x := range vfsinSC {
		got, want := SinNoVec(x), sinSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Sin(%g) = %g, want %g", x, got, want)
	}
}
// TestSinhNovec runs SinhNoVec over the general inputs in vf and the
// special-case inputs in vfsinhSC, comparing against the sinh and sinhSC
// reference tables. It is skipped when HasVX is false.
func TestSinhNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, x := range vf {
		got, want := SinhNoVec(x), sinh[i]
		if close(want, got) {
			continue
		}
		t.Errorf("Sinh(%g) = %g, want %g", x, got, want)
	}
	for i, x := range vfsinhSC {
		got, want := SinhNoVec(x), sinhSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Sinh(%g) = %g, want %g", x, got, want)
	}
}
// TestLargeCosNovec verifies that CosNoVec stays accurate for large angles.
// Because (vf[i] + large) - large != vf[i], comparing Trig(vf[i] + large)
// with Trig(vf[i]) would be misleading even though large is a multiple of
// 2*Pi, so the expected values come from the precomputed cosLarge table.
func TestLargeCosNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	offset := float64(100000 * Pi)
	for i, v := range vf {
		want := cosLarge[i]
		got := CosNoVec(v + offset)
		if close(want, got) {
			continue
		}
		t.Errorf("Cos(%g) = %g, want %g", v+offset, got, want)
	}
}
// TestLargeSinNovec verifies that SinNoVec stays accurate for large angles
// by shifting every vf input by 100000*Pi and comparing against the
// precomputed sinLarge table. It is skipped when HasVX is false.
func TestLargeSinNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	offset := float64(100000 * Pi)
	for i, v := range vf {
		want := sinLarge[i]
		got := SinNoVec(v + offset)
		if close(want, got) {
			continue
		}
		t.Errorf("Sin(%g) = %g, want %g", v+offset, got, want)
	}
}
// TestTanhNovec runs TanhNoVec over the general inputs in vf and the
// special-case inputs in vftanhSC, comparing against the tanh and tanhSC
// reference tables. It is skipped when HasVX is false.
func TestTanhNovec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, x := range vf {
		got, want := TanhNoVec(x), tanh[i]
		if veryclose(want, got) {
			continue
		}
		t.Errorf("Tanh(%g) = %g, want %g", x, got, want)
	}
	for i, x := range vftanhSC {
		got, want := TanhNoVec(x), tanhSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Tanh(%g) = %g, want %g", x, got, want)
	}
}
// TestLog10Novec checks Log10NoVec three ways: against the log10 table for
// the absolute values of the vf inputs, for exact equality with Log10E at
// the input E, and against logSC for the special-case inputs in vflogSC.
// It is skipped when HasVX is false.
func TestLog10Novec(t *testing.T) {
	if !HasVX {
		t.Skipf("no vector support")
	}
	for i, v := range vf {
		arg := Abs(v)
		got, want := Log10NoVec(arg), log10[i]
		if veryclose(want, got) {
			continue
		}
		t.Errorf("Log10(%g) = %g, want %g", arg, got, want)
	}
	if got := Log10NoVec(E); got != Log10E {
		t.Errorf("Log10(%g) = %g, want %g", E, got, Log10E)
	}
	for i, x := range vflogSC {
		got, want := Log10NoVec(x), logSC[i]
		if alike(want, got) {
			continue
		}
		t.Errorf("Log10(%g) = %g, want %g", x, got, want)
	}
}
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Constants used by coshAsm.
//
// coshrodataL23 is a block of four 8-byte entries:
//	+0:  0.2319...E-16; added to the +8 entry it gives ln 2 to extra precision
//	+8:  0.693147... (approximately ln 2)
//	+16: 1.442695... (approximately 1/ln 2)
//	+24: 704.0, the threshold coshAsm compares |x| against at label L2
DATA coshrodataL23<>+0(SB)/8, $0.231904681384629956E-16
DATA coshrodataL23<>+8(SB)/8, $0.693147180559945286E+00
DATA coshrodataL23<>+16(SB)/8, $0.144269504088896339E+01
DATA coshrodataL23<>+24(SB)/8, $704.E0
GLOBL coshrodataL23<>+0(SB), RODATA, $32
// coshxinf is the IEEE 754 bit pattern of +Inf; coshAsm returns it at label L21.
DATA coshxinf<>+0(SB)/8, $0x7FF0000000000000
GLOBL coshxinf<>+0(SB), RODATA, $8
// coshxlim1 (800.0) is the second threshold compared against |x| (label L14).
DATA coshxlim1<>+0(SB)/8, $800.E0
GLOBL coshxlim1<>+0(SB), RODATA, $8
// coshxaddhy is an additive constant used in the argument reduction.
// NOTE(review): presumably a round-to-integer "magic" constant — confirm.
DATA coshxaddhy<>+0(SB)/8, $0xc2f0000100003fdf
GLOBL coshxaddhy<>+0(SB), RODATA, $8
// coshx4ff is the bit pattern of 2**256; it is the final multiplier at label L17.
DATA coshx4ff<>+0(SB)/8, $0x4ff0000000000000
GLOBL coshx4ff<>+0(SB), RODATA, $8
// coshe1 is the bit pattern of a value slightly above 1.0 (1.0 plus 10 ulps).
DATA coshe1<>+0(SB)/8, $0x3ff000000000000a
GLOBL coshe1<>+0(SB), RODATA, $8
// Log multiplier table: 16 eight-byte entries, indexed in coshAsm by a
// 4-bit field extracted with risbg (bits 57..60 after rotation).
DATA coshtab<>+0(SB)/8, $0.442737824274138381E-01
DATA coshtab<>+8(SB)/8, $0.263602189790660309E-01
DATA coshtab<>+16(SB)/8, $0.122565642281703586E-01
DATA coshtab<>+24(SB)/8, $0.143757052860721398E-02
DATA coshtab<>+32(SB)/8, $-.651375034121276075E-02
DATA coshtab<>+40(SB)/8, $-.119317678849450159E-01
DATA coshtab<>+48(SB)/8, $-.150868749549871069E-01
DATA coshtab<>+56(SB)/8, $-.161992609578469234E-01
DATA coshtab<>+64(SB)/8, $-.154492360403337917E-01
DATA coshtab<>+72(SB)/8, $-.129850717389178721E-01
DATA coshtab<>+80(SB)/8, $-.892902649276657891E-02
DATA coshtab<>+88(SB)/8, $-.338202636596794887E-02
DATA coshtab<>+96(SB)/8, $0.357266307045684762E-02
DATA coshtab<>+104(SB)/8, $0.118665304327406698E-01
DATA coshtab<>+112(SB)/8, $0.214434994118118914E-01
DATA coshtab<>+120(SB)/8, $0.322580645161290314E-01
GLOBL coshtab<>+0(SB), RODATA, $128
// Minimax polynomial approximations.
// coshe2..coshe6 are close to 1/2!, 1/3!, 1/4!, 1/5! and 1/6! (the series
// coefficients of exp), adjusted by the minimax fit.
DATA coshe2<>+0(SB)/8, $0.500000000000004237e+00
GLOBL coshe2<>+0(SB), RODATA, $8
DATA coshe3<>+0(SB)/8, $0.166666666630345592e+00
GLOBL coshe3<>+0(SB), RODATA, $8
DATA coshe4<>+0(SB)/8, $0.416666664838056960e-01
GLOBL coshe4<>+0(SB), RODATA, $8
DATA coshe5<>+0(SB)/8, $0.833349307718286047e-02
GLOBL coshe5<>+0(SB), RODATA, $8
DATA coshe6<>+0(SB)/8, $0.138926439368309441e-02
GLOBL coshe6<>+0(SB), RODATA, $8
// coshAsm returns the hyperbolic cosine of x (the assembly counterpart of Cosh).
//
// Special cases are:
//	Cosh(±0) = 1
//	Cosh(±Inf) = +Inf
//	Cosh(NaN) = NaN
//
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
//
// Several instructions are emitted as raw WORD/BYTE encodings; the trailing
// comment on each gives the mnemonic being encoded.
TEXT ·coshAsm(SB),NOSPLIT,$0-16
// F0 = x; R9 = base address of coshrodataL23 for the cdb at L2.
FMOVD x+0(FP), F0
MOVD $coshrodataL23<>+0(SB), R9
// Test x. Less-than/unordered inputs go to L19, which negates x into F4;
// otherwise F4 is a plain copy of x.
WORD $0xB3120000 //ltdbr %f0,%f0
MOVD $0x4086000000000000, R2
MOVD $0x4086000000000000, R3
BLTU L19
FMOVD F0, F4
L2:
// Compare F4 with the constant at coshrodataL23+24 (704.0); values that are
// not below it (or compare unordered) take the L14 path.
WORD $0xED409018 //cdb %f4,.L24-.L23(%r9)
BYTE $0x00
BYTE $0x19
BGE L14 //jnl .L14
BVS L14
// F4 compared with itself; on equality use the main path at L20.
WFCEDBS V4, V4, V2
BEQ L20
L1:
// Return F0, which still holds the unmodified argument x.
FMOVD F0, ret+8(FP)
RET
L14:
// F4 compared with itself; if the compare fails, return x via L1
// (presumably the NaN case — TODO confirm).
WFCEDBS V4, V4, V2
BVS L1
// Compare F4 against coshxlim1 (800.0); on success return +Inf via L21.
MOVD $coshxlim1<>+0(SB), R1
FMOVD 0(R1), F2
WFCHEDBS V4, V2, V2
BEQ L21
// Large-argument path (F4 at or above 704.0 but not caught by the 800.0 test).
MOVD $coshxaddhy<>+0(SB), R1
FMOVD coshrodataL23<>+16(SB), F5
FMOVD 0(R1), F2
WFMSDB V0, V5, V2, V5
FMOVD coshrodataL23<>+8(SB), F3
FADD F5, F2
MOVD $coshe6<>+0(SB), R1
WFMSDB V2, V3, V0, V3
FMOVD 0(R1), F6
WFMDB V3, V3, V1
MOVD $coshe4<>+0(SB), R1
FMOVD coshrodataL23<>+0(SB), F7
WFMADB V2, V7, V3, V2
FMOVD 0(R1), F3
MOVD $coshe5<>+0(SB), R1
WFMADB V1, V6, V3, V6
FMOVD 0(R1), F7
MOVD $coshe3<>+0(SB), R1
FMOVD 0(R1), F3
WFMADB V1, V7, V3, V7
FNEG F2, F3
WORD $0xB3CD0015 //lgdr %r1,%f5
MOVD $coshe2<>+0(SB), R3
WFCEDBS V4, V0, V0
FMOVD 0(R3), F5
MOVD $coshe1<>+0(SB), R3
WFMADB V1, V6, V5, V6
FMOVD 0(R3), F5
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WFMADB V1, V7, V5, V1
// Branch on the result of the F4-versus-x compare above.
// NOTE(review): presumably L22 handles negative x (F4 != x) — confirm.
BVS L22
WORD $0xEC4139BC //risbg %r4,%r1,57,128+60,3
BYTE $0x03
BYTE $0x55
MOVD $coshtab<>+0(SB), R3
WFMADB V3, V6, V1, V6
// Load the coshtab entry at byte offset R4.
WORD $0x68043000 //ld %f0,0(%r4,%r3)
FMSUB F0, F3, F2, F2
WORD $0xA71AF000 //ahi %r1,-4096
WFMADB V2, V6, V0, V6
L17:
// Shared tail of the large-argument path: insert bits of R1 into R2, move R2
// into F2, combine with F6, then multiply by coshx4ff and return.
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WORD $0xB3C10022 //ldgr %f2,%r2
FMADD F2, F6, F2, F2
MOVD $coshx4ff<>+0(SB), R1
FMOVD 0(R1), F0
FMUL F2, F0
FMOVD F0, ret+8(FP)
RET
L19:
// F4 = -x, then rejoin the common code.
FNEG F0, F4
BR L2
L20:
// Main path, taken when F4 is below 704.0: argument reduction with
// coshxaddhy and the coshrodataL23 constants, polynomial evaluation with
// coshe1..coshe6, and two coshtab lookups whose scaled results are summed.
MOVD $coshxaddhy<>+0(SB), R1
FMOVD coshrodataL23<>+16(SB), F3
FMOVD 0(R1), F2
WFMSDB V0, V3, V2, V3
FMOVD coshrodataL23<>+8(SB), F4
FADD F3, F2
MOVD $coshe6<>+0(SB), R1
FMSUB F4, F2, F0, F0
FMOVD 0(R1), F6
WFMDB V0, V0, V1
MOVD $coshe4<>+0(SB), R1
FMOVD 0(R1), F4
MOVD $coshe5<>+0(SB), R1
FMOVD coshrodataL23<>+0(SB), F5
WFMADB V1, V6, V4, V6
FMADD F5, F2, F0, F0
FMOVD 0(R1), F2
MOVD $coshe3<>+0(SB), R1
FMOVD 0(R1), F4
WFMADB V1, V2, V4, V2
MOVD $coshe2<>+0(SB), R1
FMOVD 0(R1), F5
FNEG F0, F4
WFMADB V1, V6, V5, V6
MOVD $coshe1<>+0(SB), R1
FMOVD 0(R1), F5
WFMADB V1, V2, V5, V1
WORD $0xB3CD0013 //lgdr %r1,%f3
MOVD $coshtab<>+0(SB), R5
WFMADB V4, V6, V1, V3
WORD $0xEC4139BC //risbg %r4,%r1,57,128+60,3
BYTE $0x03
BYTE $0x55
WFMSDB V4, V6, V1, V6
WORD $0x68145000 //ld %f1,0(%r4,%r5)
WFMSDB V4, V1, V0, V2
WORD $0xA7487FBE //lhi %r4,32702
FMADD F3, F2, F1, F1
SUBW R1, R4
WORD $0xECC439BC //risbg %r12,%r4,57,128+60,3
BYTE $0x03
BYTE $0x55
WORD $0x682C5000 //ld %f2,0(%r12,%r5)
FMSUB F2, F4, F0, F0
WORD $0xEC21000F //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WFMADB V0, V6, V2, V6
WORD $0xEC34000F //risbgn %r3,%r4,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WORD $0xB3C10022 //ldgr %f2,%r2
WORD $0xB3C10003 //ldgr %f0,%r3
FMADD F2, F1, F2, F2
FMADD F0, F6, F0, F0
FADD F2, F0
FMOVD F0, ret+8(FP)
RET
L22:
// Alternate branch of the large-argument path; rejoins the shared tail at L17.
WORD $0xA7387FBE //lhi %r3,32702
MOVD $coshtab<>+0(SB), R4
SUBW R1, R3
WFMSDB V3, V6, V1, V6
WORD $0xEC3339BC //risbg %r3,%r3,57,128+60,3
BYTE $0x03
BYTE $0x55
WORD $0x68034000 //ld %f0,0(%r3,%r4)
FMSUB F0, F3, F2, F2
WORD $0xA7386FBE //lhi %r3,28606
WFMADB V2, V6, V0, V6
SUBW R1, R3, R1
BR L17
L21:
// Return +Inf (coshxinf).
MOVD $coshxinf<>+0(SB), R1
FMOVD 0(R1), F0
FMOVD F0, ret+8(FP)
RET
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package math
// Exported aliases of internal functions and of hasVX, so that the external
// test package can exercise the non-vector implementations directly.
var (
	Log10NoVec = log10
	CosNoVec   = cos
	CoshNoVec  = cosh
	SinNoVec   = sin
	SinhNoVec  = sinh
	TanhNoVec  = tanh
	HasVX      = hasVX
)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Minimax polynomial coefficients and other constants used by log10Asm.
// Entries referenced by fixed offset in log10Asm:
//	+0   0.0   divisor used at label L18
//	+8   -1.0  dividend used at label L18 (-1.0/0.0 gives -Inf)
//	+16  NaN   returned for negative arguments (label L13)
//	+24..+120  coefficients and offsets for the polynomial evaluation at L4
//	+128 2**64 multiplier applied to the argument at label L2
DATA log10rodataL19<>+0(SB)/8, $0.000000000000000000E+00
DATA log10rodataL19<>+8(SB)/8, $-1.0
DATA log10rodataL19<>+16(SB)/8, $0x7FF8000000000000 //+NaN
DATA log10rodataL19<>+24(SB)/8, $.15375570329280596749
DATA log10rodataL19<>+32(SB)/8, $.60171950900703668594E+04
DATA log10rodataL19<>+40(SB)/8, $-1.9578460454940795898
DATA log10rodataL19<>+48(SB)/8, $0.78962633073318517310E-01
DATA log10rodataL19<>+56(SB)/8, $-.71784211884836937993E-02
DATA log10rodataL19<>+64(SB)/8, $0.87011165920689940661E-03
DATA log10rodataL19<>+72(SB)/8, $-.11865158981621437541E-03
DATA log10rodataL19<>+80(SB)/8, $0.17258413403018680410E-04
DATA log10rodataL19<>+88(SB)/8, $0.40752932047883484315E-06
DATA log10rodataL19<>+96(SB)/8, $-.26149194688832680410E-05
DATA log10rodataL19<>+104(SB)/8, $0.92453396963875026759E-08
DATA log10rodataL19<>+112(SB)/8, $-.64572084905921579630E-07
DATA log10rodataL19<>+120(SB)/8, $-5.5
DATA log10rodataL19<>+128(SB)/8, $18446744073709551616.
GLOBL log10rodataL19<>+0(SB), RODATA, $136
// Table of log10 correction terms: 16 eight-byte entries, indexed in
// log10Asm by a 4-bit field extracted with risbg (bits 57..60).
DATA log10tab2074<>+0(SB)/8, $0.254164497922885069E-01
DATA log10tab2074<>+8(SB)/8, $0.179018857989381839E-01
DATA log10tab2074<>+16(SB)/8, $0.118926768029048674E-01
DATA log10tab2074<>+24(SB)/8, $0.722595568238080033E-02
DATA log10tab2074<>+32(SB)/8, $0.376393570022739135E-02
DATA log10tab2074<>+40(SB)/8, $0.138901135928814326E-02
DATA log10tab2074<>+48(SB)/8, $0
DATA log10tab2074<>+56(SB)/8, $-0.490780466387818203E-03
DATA log10tab2074<>+64(SB)/8, $-0.159811431402137571E-03
DATA log10tab2074<>+72(SB)/8, $0.925796337165100494E-03
DATA log10tab2074<>+80(SB)/8, $0.270683176738357035E-02
DATA log10tab2074<>+88(SB)/8, $0.513079030821304758E-02
DATA log10tab2074<>+96(SB)/8, $0.815089785397996303E-02
DATA log10tab2074<>+104(SB)/8, $0.117253060262419215E-01
DATA log10tab2074<>+112(SB)/8, $0.158164239345343963E-01
DATA log10tab2074<>+120(SB)/8, $0.203903595489229786E-01
GLOBL log10tab2074<>+0(SB), RODATA, $128
// log10Asm returns the decimal logarithm of the argument (the assembly
// counterpart of Log10).
//
// Special cases are:
//	Log10(+Inf) = +Inf
//	Log10(0) = -Inf
//	Log10(x < 0) = NaN
//	Log10(NaN) = NaN
//
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
//
// Several instructions are emitted as raw WORD/BYTE encodings; the trailing
// comment on each gives the mnemonic being encoded.
TEXT ·log10Asm(SB),NOSPLIT,$8-16
// F0 = x; R9 = base of log10rodataL19 for the encoded mdb/ddb below.
FMOVD x+0(FP), F0
MOVD $log10rodataL19<>+0(SB), R9
// Spill x to the 8-byte frame so a 32-bit word of it can be loaded into R4.
FMOVD F0, x-8(SP)
WORD $0xC0298006 //iilf %r2,2147909631
BYTE $0x7F
BYTE $0xFF
WORD $0x5840F008 //l %r4, 8(%r15)
SUBW R4, R2, R3
WORD $0xEC5320AF //risbg %r5,%r3,32,128+47,0
BYTE $0x00
BYTE $0x55
MOVH $0x0, R1
WORD $0xEC15001F //risbgn %r1,%r5,64-64+0,64-64+0+32-1,64-0-32
BYTE $0x20
BYTE $0x59
WORD $0xC0590016 //iilf %r5,1507327
BYTE $0xFF
BYTE $0xFF
// If the word in R4 is <= 1507327 (signed), handle the argument at L2.
MOVW R4, R10
MOVW R5, R11
CMPBLE R10, R11, L2
WORD $0xC0297FEF //iilf %r2,2146435071
BYTE $0xFF
BYTE $0xFF
// If the word in R4 is <= 2146435071 (signed), take the regular path at L16.
MOVW R4, R10
MOVW R2, R11
CMPBLE R10, R11, L16
L3:
L1:
// Return F0. On fall-through from above F0 is still the original x.
FMOVD F0, ret+8(FP)
RET
L2:
// Test x; zero, negative or unordered inputs go to L13.
WORD $0xB3120000 //ltdbr %f0,%f0
BLEU L13
// Multiply x by the constant at log10rodataL19+128 (2**64), spill it again
// and recompute the integer fields from the scaled value.
WORD $0xED009080 //mdb %f0,.L20-.L19(%r9)
BYTE $0x00
BYTE $0x1C
FMOVD F0, x-8(SP)
WORD $0x5B20F008 //s %r2, 8(%r15)
WORD $0xEC3239BC //risbg %r3,%r2,57,128+60,64-13
BYTE $0x33
BYTE $0x55
ANDW $0xFFFF0000, R2
WORD $0xEC12001F //risbgn %r1,%r2,64-64+0,64-64+0+32-1,64-0-32
BYTE $0x20
BYTE $0x59
ADDW $0x4000000, R2
BLEU L17
L8:
SRW $8, R2, R2
ORW $0x45000000, R2
L4:
// Common evaluation: form the reduced argument from R1 and x, evaluate the
// polynomial with the log10rodataL19 coefficients, and add the
// log10tab2074 entry selected by R3.
FMOVD log10rodataL19<>+120(SB), F2
WORD $0xB3C10041 //ldgr %f4,%r1
WFMADB V4, V0, V2, V0
FMOVD log10rodataL19<>+112(SB), F4
FMOVD log10rodataL19<>+104(SB), F6
WFMADB V0, V6, V4, V6
FMOVD log10rodataL19<>+96(SB), F4
FMOVD log10rodataL19<>+88(SB), F1
WFMADB V0, V1, V4, V1
WFMDB V0, V0, V4
FMOVD log10rodataL19<>+80(SB), F2
WFMADB V6, V4, V1, V6
FMOVD log10rodataL19<>+72(SB), F1
WFMADB V0, V2, V1, V2
FMOVD log10rodataL19<>+64(SB), F1
WORD $0xEC3339BC //risbg %r3,%r3,57,128+60,0
BYTE $0x00
BYTE $0x55
WFMADB V4, V6, V2, V6
FMOVD log10rodataL19<>+56(SB), F2
WFMADB V0, V1, V2, V1
VLVGF $0, R2, V2
WFMADB V4, V6, V1, V4
LDEBR F2, F2
FMOVD log10rodataL19<>+48(SB), F6
WFMADB V0, V4, V6, V4
FMOVD log10rodataL19<>+40(SB), F1
FMOVD log10rodataL19<>+32(SB), F6
MOVD $log10tab2074<>+0(SB), R1
WFMADB V2, V1, V6, V2
WORD $0x68331000 //ld %f3,0(%r3,%r1)
WFMADB V0, V4, V3, V0
FMOVD log10rodataL19<>+24(SB), F4
FMADD F4, F2, F0, F0
FMOVD F0, ret+8(FP)
RET
L16:
// Regular path: derive R2 and the table index R3 from R3, then evaluate at L4.
WORD $0xEC2328B7 //risbg %r2,%r3,40,128+55,64-8
BYTE $0x38
BYTE $0x55
WORD $0xEC3339BC //risbg %r3,%r3,57,128+60,64-13
BYTE $0x33
BYTE $0x55
ORW $0x45000000, R2
BR L4
L13:
// x is zero, negative or unordered. Zero (BGE) and unordered (BVS) go to
// L18; a negative x falls through and returns the NaN constant at +16.
// NOTE(review): a NaN whose sign bit is set appears to reach L18 through the
// BVS and so would return -Inf rather than NaN — verify on hardware.
BGE L18 //jnl .L18
BVS L18
FMOVD log10rodataL19<>+16(SB), F0
BR L1
L17:
SRAW $1, R2, R2
SUBW $0x40000000, R2
BR L8
L18:
// F0 = -1.0 divided by the 0.0 at log10rodataL19+0, i.e. -Inf.
FMOVD log10rodataL19<>+8(SB), F0
WORD $0xED009000 //ddb %f0,.L36-.L19(%r9)
BYTE $0x00
BYTE $0x1D
BR L1
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Various constants shared by sinAsm and cosAsm.
// sincosxnan is the NaN bit pattern returned for out-of-range arguments.
DATA sincosxnan<>+0(SB)/8, $0x7ff8000000000000
GLOBL sincosxnan<>+0(SB), RODATA, $8
// sincosxlim is the range limit compared against |x|; when the compare fails
// the routines return sincosxnan.
DATA sincosxlim<>+0(SB)/8, $0x432921fb54442d19
GLOBL sincosxlim<>+0(SB), RODATA, $8
// sincosxadd is an additive constant used in the range reduction.
// NOTE(review): presumably a round-to-integer "magic" constant — confirm.
DATA sincosxadd<>+0(SB)/8, $0xc338000000000000
GLOBL sincosxadd<>+0(SB), RODATA, $8
// sincosxpi2h, sincosxpi2m and sincosxpi2l are the high, middle and low
// parts of pi/2 (1.5707963..., 6.12e-17, 1.08e-32).
DATA sincosxpi2l<>+0(SB)/8, $0.108285667392191389e-31
GLOBL sincosxpi2l<>+0(SB), RODATA, $8
DATA sincosxpi2m<>+0(SB)/8, $0.612323399573676480e-16
GLOBL sincosxpi2m<>+0(SB), RODATA, $8
DATA sincosxpi2h<>+0(SB)/8, $0.157079632679489656e+01
GLOBL sincosxpi2h<>+0(SB), RODATA, $8
// sincosrpi2 is approximately 2/pi.
DATA sincosrpi2<>+0(SB)/8, $0.636619772367581341e+00
GLOBL sincosrpi2<>+0(SB), RODATA, $8
// Minimax polynomial approximations.
// sincosc0..sincosc7 are the cosine coefficients; they are close to the
// series terms 1, -1/2!, 1/4!, -1/6!, ...
DATA sincosc0<>+0(SB)/8, $0.100000000000000000E+01
GLOBL sincosc0<>+0(SB), RODATA, $8
DATA sincosc1<>+0(SB)/8, $-.499999999999999833E+00
GLOBL sincosc1<>+0(SB), RODATA, $8
DATA sincosc2<>+0(SB)/8, $0.416666666666625843E-01
GLOBL sincosc2<>+0(SB), RODATA, $8
DATA sincosc3<>+0(SB)/8, $-.138888888885498984E-02
GLOBL sincosc3<>+0(SB), RODATA, $8
DATA sincosc4<>+0(SB)/8, $0.248015871681607202E-04
GLOBL sincosc4<>+0(SB), RODATA, $8
DATA sincosc5<>+0(SB)/8, $-.275572911309937875E-06
GLOBL sincosc5<>+0(SB), RODATA, $8
DATA sincosc6<>+0(SB)/8, $0.208735047247632818E-08
GLOBL sincosc6<>+0(SB), RODATA, $8
DATA sincosc7<>+0(SB)/8, $-.112753632738365317E-10
GLOBL sincosc7<>+0(SB), RODATA, $8
// sincoss0..sincoss7 are the sine coefficients; they are close to the
// series terms 1, -1/3!, 1/5!, -1/7!, ...
DATA sincoss0<>+0(SB)/8, $0.100000000000000000E+01
GLOBL sincoss0<>+0(SB), RODATA, $8
DATA sincoss1<>+0(SB)/8, $-.166666666666666657E+00
GLOBL sincoss1<>+0(SB), RODATA, $8
DATA sincoss2<>+0(SB)/8, $0.833333333333309209E-02
GLOBL sincoss2<>+0(SB), RODATA, $8
DATA sincoss3<>+0(SB)/8, $-.198412698410701448E-03
GLOBL sincoss3<>+0(SB), RODATA, $8
DATA sincoss4<>+0(SB)/8, $0.275573191453906794E-05
GLOBL sincoss4<>+0(SB), RODATA, $8
DATA sincoss5<>+0(SB)/8, $-.250520918387633290E-07
GLOBL sincoss5<>+0(SB), RODATA, $8
DATA sincoss6<>+0(SB)/8, $0.160571285514715856E-09
GLOBL sincoss6<>+0(SB), RODATA, $8
DATA sincoss7<>+0(SB)/8, $-.753213484933210972E-12
GLOBL sincoss7<>+0(SB), RODATA, $8
// sinAsm returns the sine of the radian argument x (the assembly
// counterpart of Sin).
//
// Special cases are:
//	Sin(±0) = ±0
//	Sin(±Inf) = NaN
//	Sin(NaN) = NaN
//
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
//
// Several instructions are emitted as raw WORD/BYTE encodings; the trailing
// comment on each gives the mnemonic being encoded.
TEXT ·sinAsm(SB),NOSPLIT,$0-16
FMOVD x+0(FP), F0
//special case Sin(±0) = ±0
FMOVD $(0.0), F1
FCMPU F0, F1
BEQ sinIsZero
// Test x. Less-than/unordered inputs go to L17, which negates x into F5;
// otherwise F5 is a plain copy of x.
WORD $0xB3120000 //ltdbr %f0,%f0
BLTU L17
FMOVD F0, F5
L2:
// Preload the sine coefficients sincoss7..sincoss0.
MOVD $sincoss7<>+0(SB), R1
FMOVD 0(R1), F4
MOVD $sincoss6<>+0(SB), R1
FMOVD 0(R1), F1
MOVD $sincoss5<>+0(SB), R1
VLEG $0, 0(R1), V18
MOVD $sincoss4<>+0(SB), R1
FMOVD 0(R1), F6
MOVD $sincoss2<>+0(SB), R1
VLEG $0, 0(R1), V16
MOVD $sincoss3<>+0(SB), R1
FMOVD 0(R1), F7
MOVD $sincoss1<>+0(SB), R1
FMOVD 0(R1), F3
MOVD $sincoss0<>+0(SB), R1
FMOVD 0(R1), F2
// Compare sincoss0 (1.0) with F5; on success evaluate the sine polynomial
// directly on x at L18, with no range reduction.
WFCHDBS V2, V5, V2
BEQ L18
// Range reduction using sincosrpi2, sincosxadd and the split pi/2 constants.
MOVD $sincosrpi2<>+0(SB), R1
FMOVD 0(R1), F3
MOVD $sincosxadd<>+0(SB), R1
FMOVD 0(R1), F2
WFMSDB V0, V3, V2, V3
FMOVD 0(R1), F6
FADD F3, F6
MOVD $sincosxpi2h<>+0(SB), R1
FMOVD 0(R1), F2
FMSUB F2, F6, F0, F0
MOVD $sincosxpi2m<>+0(SB), R1
FMOVD 0(R1), F4
FMADD F4, F6, F0, F0
MOVD $sincosxpi2l<>+0(SB), R1
WFMDB V0, V0, V1
FMOVD 0(R1), F7
WFMDB V1, V1, V2
// R1 = bits of F3. Its low bit selects the polynomial: clear goes to L6
// (sine coefficients), set falls through (cosine coefficients).
WORD $0xB3CD0013 //lgdr %r1,%f3
MOVD $sincosxlim<>+0(SB), R2
WORD $0xA7110001 //tmll %r1,1
BEQ L6
// Range check against sincosxlim; on failure return NaN via L14.
FMOVD 0(R2), F0
WFCHDBS V0, V5, V0
BNE L14
MOVD $sincosc7<>+0(SB), R2
FMOVD 0(R2), F0
MOVD $sincosc6<>+0(SB), R2
FMOVD 0(R2), F4
MOVD $sincosc5<>+0(SB), R2
WFMADB V1, V0, V4, V0
FMOVD 0(R2), F6
MOVD $sincosc4<>+0(SB), R2
WFMADB V1, V0, V6, V0
FMOVD 0(R2), F4
MOVD $sincosc2<>+0(SB), R2
FMOVD 0(R2), F6
WFMADB V2, V4, V6, V4
MOVD $sincosc3<>+0(SB), R2
FMOVD 0(R2), F3
MOVD $sincosc1<>+0(SB), R2
WFMADB V2, V0, V3, V0
FMOVD 0(R2), F6
WFMADB V1, V4, V6, V4
// Bit 1 of R1 selects the sign; the branch to L15 below negates the result.
WORD $0xA7110002 //tmll %r1,2
WFMADB V2, V0, V4, V0
MOVD $sincosc0<>+0(SB), R1
FMOVD 0(R1), F2
WFMADB V1, V0, V2, V0
BNE L15
FMOVD F0, ret+8(FP)
RET
L6:
// Sine-polynomial branch. Range check against sincosxlim first; on failure
// return NaN via L14.
FMOVD 0(R2), F4
WFCHDBS V4, V5, V4
BNE L14
MOVD $sincoss7<>+0(SB), R2
FMOVD 0(R2), F4
MOVD $sincoss6<>+0(SB), R2
FMOVD 0(R2), F3
MOVD $sincoss5<>+0(SB), R2
WFMADB V1, V4, V3, V4
WFMADB V6, V7, V0, V6
FMOVD 0(R2), F0
MOVD $sincoss4<>+0(SB), R2
FMADD F4, F1, F0, F0
FMOVD 0(R2), F3
MOVD $sincoss2<>+0(SB), R2
FMOVD 0(R2), F4
MOVD $sincoss3<>+0(SB), R2
WFMADB V2, V3, V4, V3
FMOVD 0(R2), F4
MOVD $sincoss1<>+0(SB), R2
WFMADB V2, V0, V4, V0
FMOVD 0(R2), F4
WFMADB V1, V3, V4, V3
FNEG F6, F4
WFMADB V2, V0, V3, V2
WFMDB V4, V1, V0
// Bit 1 of R1 selects the sign; the branch to L15 below negates the result.
WORD $0xA7110002 //tmll %r1,2
WFMSDB V0, V2, V6, V0
BNE L15
FMOVD F0, ret+8(FP)
RET
L14:
// Return NaN (sincosxnan).
MOVD $sincosxnan<>+0(SB), R1
FMOVD 0(R1), F0
FMOVD F0, ret+8(FP)
RET
L18:
// Direct evaluation of the sine polynomial on x using the coefficients
// preloaded at L2.
WFMDB V0, V0, V2
WFMADB V2, V4, V1, V4
WFMDB V2, V2, V1
WFMADB V2, V4, V18, V4
WFMADB V1, V6, V16, V6
WFMADB V1, V4, V7, V4
WFMADB V2, V6, V3, V6
FMUL F0, F2
WFMADB V1, V4, V6, V4
FMADD F4, F2, F0, F0
FMOVD F0, ret+8(FP)
RET
L17:
// F5 = -x, then rejoin the common code.
FNEG F0, F5
BR L2
L15:
// Return the negated result.
FNEG F0, F0
FMOVD F0, ret+8(FP)
RET
sinIsZero:
// x compared equal to 0.0: return x unchanged, preserving its sign.
FMOVD F0, ret+8(FP)
RET
// cosAsm returns the cosine of the radian argument (the assembly
// counterpart of Cos).
//
// Special cases are:
//	Cos(±Inf) = NaN
//	Cos(NaN) = NaN
//
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
//
// Several instructions are emitted as raw WORD/BYTE encodings; the trailing
// comment on each gives the mnemonic being encoded.
TEXT ·cosAsm(SB),NOSPLIT,$0-16
FMOVD x+0(FP), F0
// Test x. Less-than/unordered inputs go to L35, which negates x into F1;
// otherwise F1 is a plain copy of x.
WORD $0xB3120000 //ltdbr %f0,%f0
BLTU L35
FMOVD F0, F1
L21:
// Preload the cosine coefficients sincosc7..sincosc1.
MOVD $sincosc7<>+0(SB), R1
FMOVD 0(R1), F4
MOVD $sincosc6<>+0(SB), R1
VLEG $0, 0(R1), V20
MOVD $sincosc5<>+0(SB), R1
VLEG $0, 0(R1), V18
MOVD $sincosc4<>+0(SB), R1
FMOVD 0(R1), F6
MOVD $sincosc2<>+0(SB), R1
VLEG $0, 0(R1), V16
MOVD $sincosc3<>+0(SB), R1
FMOVD 0(R1), F7
MOVD $sincosc1<>+0(SB), R1
FMOVD 0(R1), F5
MOVD $sincosrpi2<>+0(SB), R1
FMOVD 0(R1), F2
MOVD $sincosxadd<>+0(SB), R1
FMOVD 0(R1), F3
MOVD $sincoss0<>+0(SB), R1
WFMSDB V0, V2, V3, V2
FMOVD 0(R1), F3
// Compare sincoss0 (1.0) with F1; on success evaluate the cosine polynomial
// directly on x at L36, with no range reduction.
WFCHDBS V3, V1, V3
WORD $0xB3CD0012 //lgdr %r1,%f2
BEQ L36
// Range reduction using sincosxadd and the split pi/2 constants.
MOVD $sincosxadd<>+0(SB), R2
FMOVD 0(R2), F4
FADD F2, F4
MOVD $sincosxpi2h<>+0(SB), R2
FMOVD 0(R2), F2
WFMSDB V4, V2, V0, V2
MOVD $sincosxpi2m<>+0(SB), R2
FMOVD 0(R2), F0
WFMADB V4, V0, V2, V0
MOVD $sincosxpi2l<>+0(SB), R2
WFMDB V0, V0, V2
FMOVD 0(R2), F5
WFMDB V2, V2, V6
MOVD $sincosxlim<>+0(SB), R2
// The low bit of R1 selects the polynomial: set goes to L25 (sine
// coefficients), clear falls through (cosine coefficients).
WORD $0xA7110001 //tmll %r1,1
BNE L25
// Range check against sincosxlim; on failure return NaN via L33.
FMOVD 0(R2), F0
WFCHDBS V0, V1, V0
BNE L33
MOVD $sincosc7<>+0(SB), R2
FMOVD 0(R2), F0
MOVD $sincosc6<>+0(SB), R2
FMOVD 0(R2), F4
MOVD $sincosc5<>+0(SB), R2
WFMADB V2, V0, V4, V0
FMOVD 0(R2), F1
MOVD $sincosc4<>+0(SB), R2
WFMADB V2, V0, V1, V0
FMOVD 0(R2), F4
MOVD $sincosc2<>+0(SB), R2
FMOVD 0(R2), F1
WFMADB V6, V4, V1, V4
MOVD $sincosc3<>+0(SB), R2
FMOVD 0(R2), F3
MOVD $sincosc1<>+0(SB), R2
WFMADB V6, V0, V3, V0
FMOVD 0(R2), F1
WFMADB V2, V4, V1, V4
// Bit 1 of R1 selects the sign; the branch to L34 below negates the result.
WORD $0xA7110002 //tmll %r1,2
WFMADB V6, V0, V4, V0
MOVD $sincosc0<>+0(SB), R1
FMOVD 0(R1), F4
WFMADB V2, V0, V4, V0
BNE L34
FMOVD F0, ret+8(FP)
RET
L25:
// Sine-polynomial branch. Range check against sincosxlim first; on failure
// return NaN via L33.
FMOVD 0(R2), F3
WFCHDBS V3, V1, V1
BNE L33
MOVD $sincoss7<>+0(SB), R2
FMOVD 0(R2), F1
MOVD $sincoss6<>+0(SB), R2
FMOVD 0(R2), F3
MOVD $sincoss5<>+0(SB), R2
WFMADB V2, V1, V3, V1
FMOVD 0(R2), F3
MOVD $sincoss4<>+0(SB), R2
WFMADB V2, V1, V3, V1
FMOVD 0(R2), F3
MOVD $sincoss2<>+0(SB), R2
FMOVD 0(R2), F7
WFMADB V6, V3, V7, V3
MOVD $sincoss3<>+0(SB), R2
FMADD F5, F4, F0, F0
FMOVD 0(R2), F4
MOVD $sincoss1<>+0(SB), R2
FMADD F1, F6, F4, F4
FMOVD 0(R2), F1
FMADD F3, F2, F1, F1
FMUL F0, F2
WFMADB V6, V4, V1, V6
// Bit 1 of R1 selects the sign; the branch to L34 below negates the result.
WORD $0xA7110002 //tmll %r1,2
FMADD F6, F2, F0, F0
BNE L34
FMOVD F0, ret+8(FP)
RET
L33:
// Return NaN (sincosxnan).
MOVD $sincosxnan<>+0(SB), R1
FMOVD 0(R1), F0
FMOVD F0, ret+8(FP)
RET
L36:
// Direct evaluation of the cosine polynomial in x*x using the coefficients
// preloaded at L21.
FMUL F0, F0
MOVD $sincosc0<>+0(SB), R1
WFMDB V0, V0, V1
WFMADB V0, V4, V20, V4
WFMADB V1, V6, V16, V6
WFMADB V0, V4, V18, V4
WFMADB V0, V6, V5, V6
WFMADB V1, V4, V7, V4
FMOVD 0(R1), F2
WFMADB V1, V4, V6, V4
WFMADB V0, V4, V2, V0
FMOVD F0, ret+8(FP)
RET
L35:
// F1 = -x, then rejoin the common code.
FNEG F0, F1
BR L21
L34:
// Return the negated result.
FNEG F0, F0
FMOVD F0, ret+8(FP)
RET
......@@ -22,7 +22,9 @@ package math
// Sinh(±0) = ±0
// Sinh(±Inf) = ±Inf
// Sinh(NaN) = NaN
func Sinh(x float64) float64 {
func Sinh(x float64) float64
func sinh(x float64) float64 {
// The coefficients are #2029 from Hart & Cheney. (20.36D)
const (
P0 = -0.6307673640497716991184787251e+6
......@@ -66,7 +68,9 @@ func Sinh(x float64) float64 {
// Cosh(±0) = 1
// Cosh(±Inf) = +Inf
// Cosh(NaN) = NaN
func Cosh(x float64) float64 {
func Cosh(x float64) float64
func cosh(x float64) float64 {
if x < 0 {
x = -x
}
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Constants used by sinhAsm.
//
// sinhrodataL21 is a block of three 8-byte entries:
//	+0:  0.2319...E-16; added to the +8 entry it gives ln 2 to extra precision
//	+8:  0.693147... (approximately ln 2)
//	+16: 704.0, the threshold sinhAsm compares |x| against at label L2
DATA sinhrodataL21<>+0(SB)/8, $0.231904681384629956E-16
DATA sinhrodataL21<>+8(SB)/8, $0.693147180559945286E+00
DATA sinhrodataL21<>+16(SB)/8, $704.E0
GLOBL sinhrodataL21<>+0(SB), RODATA, $24
// sinhrlog2 is the bit pattern of a value approximately equal to 1/ln 2.
DATA sinhrlog2<>+0(SB)/8, $0x3ff7154760000000
GLOBL sinhrlog2<>+0(SB), RODATA, $8
// sinhxinf is the IEEE 754 bit pattern of +Inf.
DATA sinhxinf<>+0(SB)/8, $0x7ff0000000000000
GLOBL sinhxinf<>+0(SB), RODATA, $8
// sinhxinit is loaded into R1 and R3 at the start of sinhAsm.
DATA sinhxinit<>+0(SB)/8, $0x3ffb504f333f9de6
GLOBL sinhxinit<>+0(SB), RODATA, $8
// sinhxlim1 (800.0) is the second threshold compared against |x| (label L15).
DATA sinhxlim1<>+0(SB)/8, $800.E0
GLOBL sinhxlim1<>+0(SB), RODATA, $8
// sinhxadd is an additive constant used in the argument reduction.
// NOTE(review): presumably a round-to-integer "magic" constant — confirm.
DATA sinhxadd<>+0(SB)/8, $0xc3200001610007fb
GLOBL sinhxadd<>+0(SB), RODATA, $8
// sinhx4ff is the bit pattern of 2**256, used as a final multiplier.
DATA sinhx4ff<>+0(SB)/8, $0x4ff0000000000000
GLOBL sinhx4ff<>+0(SB), RODATA, $8
// Minimax polynomial approximations.
// sinhe0 and sinhe1 hold the same value; from sinhe1 onward each
// coefficient is close to the previous one divided by its index
// (1/2, 1/3, 1/4, ...), i.e. a scaled exponential series.
DATA sinhe0<>+0(SB)/8, $0.11715728752538099300E+01
GLOBL sinhe0<>+0(SB), RODATA, $8
DATA sinhe1<>+0(SB)/8, $0.11715728752538099300E+01
GLOBL sinhe1<>+0(SB), RODATA, $8
DATA sinhe2<>+0(SB)/8, $0.58578643762688526692E+00
GLOBL sinhe2<>+0(SB), RODATA, $8
DATA sinhe3<>+0(SB)/8, $0.19526214587563004497E+00
GLOBL sinhe3<>+0(SB), RODATA, $8
DATA sinhe4<>+0(SB)/8, $0.48815536475176217404E-01
GLOBL sinhe4<>+0(SB), RODATA, $8
DATA sinhe5<>+0(SB)/8, $0.97631072948627397816E-02
GLOBL sinhe5<>+0(SB), RODATA, $8
DATA sinhe6<>+0(SB)/8, $0.16271839297756073153E-02
GLOBL sinhe6<>+0(SB), RODATA, $8
DATA sinhe7<>+0(SB)/8, $0.23245485387271142509E-03
GLOBL sinhe7<>+0(SB), RODATA, $8
DATA sinhe8<>+0(SB)/8, $0.29080955860869629131E-04
GLOBL sinhe8<>+0(SB), RODATA, $8
DATA sinhe9<>+0(SB)/8, $0.32311267157667725278E-05
GLOBL sinhe9<>+0(SB), RODATA, $8
// sinhAsm returns the hyperbolic sine of the argument (the assembly
// counterpart of Sinh).
//
// Special cases are:
//	Sinh(±0) = ±0
//	Sinh(±Inf) = ±Inf
//	Sinh(NaN) = NaN
//
// The algorithm used is minimax polynomial approximation
// with coefficients determined with a Remez exchange algorithm.
//
// Several instructions are emitted as raw WORD/BYTE encodings; the trailing
// comment on each gives the mnemonic being encoded.
TEXT ·sinhAsm(SB),NOSPLIT,$0-16
FMOVD x+0(FP), F0
// special case Sinh(±0) = ±0
FMOVD $(0.0), F1
FCMPU F0, F1
BEQ sinhIsZero
// special case Sinh(±Inf) = ±Inf: compare x against the largest finite
// float64 of each sign and return x unchanged via sinhIsInf.
FMOVD $1.797693134862315708145274237317043567981e+308, F1
FCMPU F1, F0
BLEU sinhIsInf
FMOVD $-1.797693134862315708145274237317043567981e+308, F1
FCMPU F1, F0
BGT sinhIsInf
// R5 = base of sinhrodataL21 for the cdb at L2; F4 keeps a copy of x.
MOVD $sinhrodataL21<>+0(SB), R5
WORD $0xB3120000 //ltdbr %f0,%f0
MOVD sinhxinit<>+0(SB), R1
FMOVD F0, F4
MOVD R1, R3
// Less-than/unordered inputs go to L19, which negates x into F2; otherwise
// F2 is a plain copy of x.
BLTU L19
FMOVD F0, F2
L2:
// Compare F2 with the constant at sinhrodataL21+16 (704.0); values that are
// not below it (or compare unordered) take the L15 path.
WORD $0xED205010 //cdb %f2,.L22-.L21(%r5)
BYTE $0x00
BYTE $0x19
BGE L15 //jnl .L15
BVS L15
// F2 compared with itself; on equality use the main path at L20.
WFCEDBS V2, V2, V0
BEQ L20
L12:
// Return the saved copy of x held in F4.
FMOVD F4, F0
FMOVD F0, ret+8(FP)
RET
L15:
// F2 compared with itself; if the compare fails, return x via L12
// (presumably the NaN case — TODO confirm).
WFCEDBS V2, V2, V0
BVS L12
// Compare sinhxlim1 (800.0) with F2; on success compute the result at L6.
MOVD $sinhxlim1<>+0(SB), R2
FMOVD 0(R2), F0
WFCHDBS V0, V2, V0
BEQ L6
// Otherwise the result is infinite: F0 = +Inf (sinhxinf), and the compares
// of x (F4) against F2 decide whether it is returned as is or negated.
WFCHEDBS V4, V2, V6
MOVD $sinhxinf<>+0(SB), R1
FMOVD 0(R1), F0
BNE LEXITTAGsinh
WFCHDBS V2, V4, V2
BNE L16
FNEG F0, F0
FMOVD F0, ret+8(FP)
RET
L19:
// F2 = -x, then rejoin the common code.
FNEG F0, F2
BR L2
L6:
// Large-argument path: argument reduction with sinhxadd, sinhrlog2 and the
// sinhrodataL21 constants, then polynomial evaluation with sinhe0..sinhe9.
MOVD $sinhxadd<>+0(SB), R2
FMOVD 0(R2), F0
MOVD sinhrlog2<>+0(SB), R2
WORD $0xB3C10062 //ldgr %f6,%r2
WFMSDB V4, V6, V0, V16
FMOVD sinhrodataL21<>+8(SB), F6
WFADB V0, V16, V0
FMOVD sinhrodataL21<>+0(SB), F3
WFMSDB V0, V6, V4, V6
MOVD $sinhe9<>+0(SB), R2
WFMADB V0, V3, V6, V0
FMOVD 0(R2), F1
MOVD $sinhe7<>+0(SB), R2
WFMDB V0, V0, V6
FMOVD 0(R2), F5
MOVD $sinhe8<>+0(SB), R2
FMOVD 0(R2), F3
MOVD $sinhe6<>+0(SB), R2
WFMADB V6, V1, V5, V1
FMOVD 0(R2), F5
MOVD $sinhe5<>+0(SB), R2
FMOVD 0(R2), F7
MOVD $sinhe3<>+0(SB), R2
WFMADB V6, V3, V5, V3
FMOVD 0(R2), F5
MOVD $sinhe4<>+0(SB), R2
WFMADB V6, V7, V5, V7
FMOVD 0(R2), F5
MOVD $sinhe2<>+0(SB), R2
VLEG $0, 0(R2), V20
WFMDB V6, V6, V18
WFMADB V6, V5, V20, V5
WFMADB V1, V18, V7, V1
FNEG F0, F0
WFMADB V3, V18, V5, V3
MOVD $sinhe1<>+0(SB), R3
// Compare F2 with x (F4); the BEQ L9 below is taken when they are equal.
WFCEDBS V2, V4, V2
FMOVD 0(R3), F5
MOVD $sinhe0<>+0(SB), R3
WFMADB V6, V1, V5, V1
FMOVD 0(R3), F5
VLGVG $0, V16, R2
WFMADB V6, V3, V5, V6
RLL $3, R2, R2
WORD $0xEC12000F //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
BEQ L9
// F2 != x: combine, negate, and scale by sinhx4ff and a second factor built
// in R1.
WFMSDB V0, V1, V6, V0
MOVD $sinhx4ff<>+0(SB), R3
FNEG F0, F0
FMOVD 0(R3), F2
FMUL F2, F0
ANDW $0xFFFF, R2
WORD $0xA53FEFB6 //llill %r3,61366
SUBW R2, R3, R2
WORD $0xEC12000F //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WORD $0xB3C10021 //ldgr %f2,%r1
FMUL F2, F0
FMOVD F0, ret+8(FP)
RET
L20:
// Main path, taken when F2 is below 704.0: argument reduction, polynomial
// evaluation with sinhe0..sinhe9, and two scale factors built in R1 and R3.
MOVD $sinhxadd<>+0(SB), R2
FMOVD 0(R2), F2
MOVD sinhrlog2<>+0(SB), R2
WORD $0xB3C10002 //ldgr %f0,%r2
WFMSDB V4, V0, V2, V6
FMOVD sinhrodataL21<>+8(SB), F0
FADD F6, F2
MOVD $sinhe9<>+0(SB), R2
FMSUB F0, F2, F4, F4
FMOVD 0(R2), F1
FMOVD sinhrodataL21<>+0(SB), F3
MOVD $sinhe7<>+0(SB), R2
FMADD F3, F2, F4, F4
FMOVD 0(R2), F0
MOVD $sinhe8<>+0(SB), R2
WFMDB V4, V4, V2
FMOVD 0(R2), F3
MOVD $sinhe6<>+0(SB), R2
FMOVD 0(R2), F5
WORD $0xB3CD0026 //lgdr %r2,%f6
RLL $3, R2, R2
WORD $0xEC12000F //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WFMADB V2, V1, V0, V1
WORD $0xB3C10001 //ldgr %f0,%r1
MOVD $sinhe5<>+0(SB), R1
WFMADB V2, V3, V5, V3
FMOVD 0(R1), F5
MOVD $sinhe3<>+0(SB), R1
FMOVD 0(R1), F6
WFMDB V2, V2, V7
WFMADB V2, V5, V6, V5
WORD $0xA7487FB6 //lhi %r4,32694
FNEG F4, F4
ANDW $0xFFFF, R2
SUBW R2, R4, R2
WORD $0xEC32000F //risbgn %r3,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WORD $0xB3C10063 //ldgr %f6,%r3
WFADB V0, V6, V16
MOVD $sinhe4<>+0(SB), R1
WFMADB V1, V7, V5, V1
WFMDB V4, V16, V4
FMOVD 0(R1), F5
MOVD $sinhe2<>+0(SB), R1
VLEG $0, 0(R1), V16
MOVD $sinhe1<>+0(SB), R1
WFMADB V2, V5, V16, V5
VLEG $0, 0(R1), V16
WFMADB V3, V7, V5, V3
WFMADB V2, V1, V16, V1
FSUB F6, F0
FMUL F1, F4
MOVD $sinhe0<>+0(SB), R1
FMOVD 0(R1), F6
WFMADB V2, V3, V6, V2
WFMADB V0, V2, V4, V0
FMOVD F0, ret+8(FP)
RET
L9:
// F2 == x on the large-argument path: combine and scale by sinhx4ff and a
// factor built in R1 from R2.
WFMADB V0, V1, V6, V0
MOVD $sinhx4ff<>+0(SB), R3
FMOVD 0(R3), F2
FMUL F2, F0
WORD $0xA72AF000 //ahi %r2,-4096
WORD $0xEC12000F //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
WORD $0xB3C10021 //ldgr %f2,%r1
FMUL F2, F0
FMOVD F0, ret+8(FP)
RET
L16:
// Return F0 unchanged (+Inf when reached from L15).
FMOVD F0, ret+8(FP)
RET
LEXITTAGsinh:
sinhIsInf:
sinhIsZero:
// Shared exit: return whatever F0 currently holds (x for the zero and
// infinity special cases).
FMOVD F0, ret+8(FP)
RET
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386 amd64 amd64p32 arm
#include "textflag.h"
// Stubs for the hyperbolic functions on the architectures named in the
// build tag of this file. Each exported symbol is a frameless ($0), NOSPLIT
// entry point that immediately jumps to the unexported implementation of the
// same name, so the caller's argument/result frame is reused unchanged.

// Sinh jumps to sinh.
TEXT ·Sinh(SB),NOSPLIT,$0
JMP ·sinh(SB)
// Cosh jumps to cosh.
TEXT ·Cosh(SB),NOSPLIT,$0
JMP ·cosh(SB)
// Tanh jumps to tanh.
TEXT ·Tanh(SB),NOSPLIT,$0
JMP ·tanh(SB)
......@@ -21,6 +21,9 @@ TEXT ·Atan(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol branches
// straight to the unexported implementation of the same name.

// Exp2 branches to exp2.
TEXT ·Exp2(SB),NOSPLIT,$0
B ·exp2(SB)
// Cosh branches to cosh.
TEXT ·Cosh(SB),NOSPLIT,$0
B ·cosh(SB)
// Expm1 branches to expm1.
TEXT ·Expm1(SB),NOSPLIT,$0
B ·expm1(SB)
......@@ -60,8 +63,14 @@ TEXT ·Sincos(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol branches
// straight to the unexported implementation of the same name.

// Sin branches to sin.
TEXT ·Sin(SB),NOSPLIT,$0
B ·sin(SB)
// Sinh branches to sinh.
TEXT ·Sinh(SB),NOSPLIT,$0
B ·sinh(SB)
// Cos branches to cos.
TEXT ·Cos(SB),NOSPLIT,$0
B ·cos(SB)
// Tan branches to tan.
TEXT ·Tan(SB),NOSPLIT,$0
B ·tan(SB)
// Tanh branches to tanh.
TEXT ·Tanh(SB),NOSPLIT,$0
B ·tanh(SB)
......@@ -81,11 +81,20 @@ TEXT ·Sincos(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol jumps
// straight to the unexported implementation of the same name.

// Sin jumps to sin.
TEXT ·Sin(SB),NOSPLIT,$0
JMP ·sin(SB)
// Sinh jumps to sinh.
TEXT ·Sinh(SB),NOSPLIT,$0
JMP ·sinh(SB)
// Cos jumps to cos.
TEXT ·Cos(SB),NOSPLIT,$0
JMP ·cos(SB)
// Cosh jumps to cosh.
TEXT ·Cosh(SB),NOSPLIT,$0
JMP ·cosh(SB)
// Sqrt jumps to sqrt.
TEXT ·Sqrt(SB),NOSPLIT,$0
JMP ·sqrt(SB)
// Tan jumps to tan.
TEXT ·Tan(SB),NOSPLIT,$0
JMP ·tan(SB)
// Tanh jumps to tanh.
TEXT ·Tanh(SB),NOSPLIT,$0
JMP ·tanh(SB)
......@@ -81,8 +81,18 @@ TEXT ·Sincos(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol jumps
// straight to the unexported implementation of the same name.

// Sin jumps to sin.
TEXT ·Sin(SB),NOSPLIT,$0
JMP ·sin(SB)
// Sinh jumps to sinh.
TEXT ·Sinh(SB),NOSPLIT,$0
JMP ·sinh(SB)
// Cos jumps to cos.
TEXT ·Cos(SB),NOSPLIT,$0
JMP ·cos(SB)
// Cosh jumps to cosh.
TEXT ·Cosh(SB),NOSPLIT,$0
JMP ·cosh(SB)
// Tan jumps to tan.
TEXT ·Tan(SB),NOSPLIT,$0
JMP ·tan(SB)
// Tanh jumps to tanh.
TEXT ·Tanh(SB),NOSPLIT,$0
JMP ·tanh(SB)
......@@ -72,8 +72,17 @@ TEXT ·Sincos(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol branches
// straight to the unexported implementation of the same name.

// Sin branches to sin.
TEXT ·Sin(SB),NOSPLIT,$0
BR ·sin(SB)
// Sinh branches to sinh.
TEXT ·Sinh(SB),NOSPLIT,$0
BR ·sinh(SB)
// Cos branches to cos.
TEXT ·Cos(SB),NOSPLIT,$0
BR ·cos(SB)
// Cosh branches to cosh.
TEXT ·Cosh(SB),NOSPLIT,$0
BR ·cosh(SB)
// Tan branches to tan.
TEXT ·Tan(SB),NOSPLIT,$0
BR ·tan(SB)
// Tanh branches to tanh.
TEXT ·Tanh(SB),NOSPLIT,$0
BR ·tanh(SB)
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "../runtime/textflag.h"
#include "textflag.h"
// Asin is a frameless, NOSPLIT stub that branches straight to the unexported
// implementation asin.
TEXT ·Asin(SB),NOSPLIT,$0
BR ·asin(SB)
......@@ -34,9 +34,6 @@ TEXT ·Hypot(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol branches
// straight to the unexported implementation of the same name.

// Ldexp branches to ldexp.
TEXT ·Ldexp(SB),NOSPLIT,$0
BR ·ldexp(SB)
// Log10 branches to log10.
TEXT ·Log10(SB),NOSPLIT,$0
BR ·log10(SB)
// Log2 branches to log2.
TEXT ·Log2(SB),NOSPLIT,$0
BR ·log2(SB)
......@@ -58,11 +55,154 @@ TEXT ·Remainder(SB),NOSPLIT,$0
// Each TEXT below is a frameless, NOSPLIT stub: the exported symbol branches
// straight to the unexported implementation of the same name.

// Sincos branches to sincos.
TEXT ·Sincos(SB),NOSPLIT,$0
BR ·sincos(SB)
// Sin branches to sin.
TEXT ·Sin(SB),NOSPLIT,$0
BR ·sin(SB)
// Tan branches to tan.
TEXT ·Tan(SB),NOSPLIT,$0
BR ·tan(SB)
// hasVectorFacility reports whether the Go assembly routines may use vector
// instructions. It returns a single byte at ret+0(FP): 1 if the vector
// facility is both installed and enabled, 0 otherwise.
//
// The function uses a 24-byte local frame as the buffer for the facility
// list stored by STFLE.
TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
// R1 = address of the 24-byte buffer at the bottom of the local frame.
MOVD $x-24(SP), R1
XC $24, 0(R1), 0(R1) // clear the storage
MOVD $2, R0 // R0 is the number of double words stored -1
// STFLE is emitted as a raw instruction word; it writes the facility list
// to the buffer addressed by R1.
WORD $0xB2B01000 // STFLE 0(R1)
XOR R0, R0 // reset the value of R0
// z-8(SP) is byte 16 of the buffer (-24 + 16 = -8), i.e. facility bits
// 128-135. Mask 0x40 selects the second-highest bit of that byte, which is
// facility bit 129.
MOVBZ z-8(SP), R1
AND $0x40, R1
// Bit clear: the facility is not installed.
BEQ novector
vectorinstalled:
// check if the vector instruction has been enabled
// Write 0xF into byte element 0 of V16, read it back into R1 and verify
// the round trip.
// NOTE(review): presumably a disabled facility would fault or not round-trip
// here — confirm how that case is expected to surface.
VLEIB $0, $0xF, V16
VLGVB $0, V16, R1
CMPBNE R1, $0xF, novector
MOVB $1, ret+0(FP) // have vx
RET
novector:
MOVB $0, ret+0(FP) // no vx
RET
// Log10 dispatches through the 8-byte function pointer log10vectorfacility.
// The pointer initially targets log10TrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Log10(SB),NOSPLIT,$0
	MOVD	log10vectorfacility+0x00(SB), R1
	BR	(R1)

// log10TrampolineSetup selects the Log10 implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// log10vectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·log10TrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic log10.
	MOVD	$log10vectorfacility+0x00(SB), R1
	MOVD	$·log10(SB), R2
	MOVD	R2, 0(R1)
	BR	·log10(SB)
vectorimpl:
	// Vector support: route future calls to log10Asm.
	MOVD	$log10vectorfacility+0x00(SB), R1
	MOVD	$·log10Asm(SB), R2
	MOVD	R2, 0(R1)
	BR	·log10Asm(SB)

// log10vectorfacility holds the address of the function Log10 branches to.
// It starts out pointing at the setup trampoline.
GLOBL log10vectorfacility+0x00(SB), NOPTR, $8
DATA log10vectorfacility+0x00(SB)/8, $·log10TrampolineSetup(SB)
// Cos dispatches through the 8-byte function pointer cosvectorfacility.
// The pointer initially targets cosTrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Cos(SB),NOSPLIT,$0
	MOVD	cosvectorfacility+0x00(SB), R1
	BR	(R1)

// cosTrampolineSetup selects the Cos implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// cosvectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·cosTrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic cos.
	MOVD	$cosvectorfacility+0x00(SB), R1
	MOVD	$·cos(SB), R2
	MOVD	R2, 0(R1)
	BR	·cos(SB)
vectorimpl:
	// Vector support: route future calls to cosAsm.
	MOVD	$cosvectorfacility+0x00(SB), R1
	MOVD	$·cosAsm(SB), R2
	MOVD	R2, 0(R1)
	BR	·cosAsm(SB)

// cosvectorfacility holds the address of the function Cos branches to.
// It starts out pointing at the setup trampoline.
GLOBL cosvectorfacility+0x00(SB), NOPTR, $8
DATA cosvectorfacility+0x00(SB)/8, $·cosTrampolineSetup(SB)
// Cosh dispatches through the 8-byte function pointer coshvectorfacility.
// The pointer initially targets coshTrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Cosh(SB),NOSPLIT,$0
	MOVD	coshvectorfacility+0x00(SB), R1
	BR	(R1)

// coshTrampolineSetup selects the Cosh implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// coshvectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·coshTrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic cosh.
	MOVD	$coshvectorfacility+0x00(SB), R1
	MOVD	$·cosh(SB), R2
	MOVD	R2, 0(R1)
	BR	·cosh(SB)
vectorimpl:
	// Vector support: route future calls to coshAsm.
	MOVD	$coshvectorfacility+0x00(SB), R1
	MOVD	$·coshAsm(SB), R2
	MOVD	R2, 0(R1)
	BR	·coshAsm(SB)

// coshvectorfacility holds the address of the function Cosh branches to.
// It starts out pointing at the setup trampoline.
GLOBL coshvectorfacility+0x00(SB), NOPTR, $8
DATA coshvectorfacility+0x00(SB)/8, $·coshTrampolineSetup(SB)
// Sin dispatches through the 8-byte function pointer sinvectorfacility.
// The pointer initially targets sinTrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Sin(SB),NOSPLIT,$0
	MOVD	sinvectorfacility+0x00(SB), R1
	BR	(R1)

// sinTrampolineSetup selects the Sin implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// sinvectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·sinTrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic sin.
	MOVD	$sinvectorfacility+0x00(SB), R1
	MOVD	$·sin(SB), R2
	MOVD	R2, 0(R1)
	BR	·sin(SB)
vectorimpl:
	// Vector support: route future calls to sinAsm.
	MOVD	$sinvectorfacility+0x00(SB), R1
	MOVD	$·sinAsm(SB), R2
	MOVD	R2, 0(R1)
	BR	·sinAsm(SB)

// sinvectorfacility holds the address of the function Sin branches to.
// It starts out pointing at the setup trampoline.
GLOBL sinvectorfacility+0x00(SB), NOPTR, $8
DATA sinvectorfacility+0x00(SB)/8, $·sinTrampolineSetup(SB)
// Sinh dispatches through the 8-byte function pointer sinhvectorfacility.
// The pointer initially targets sinhTrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Sinh(SB),NOSPLIT,$0
	MOVD	sinhvectorfacility+0x00(SB), R1
	BR	(R1)

// sinhTrampolineSetup selects the Sinh implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// sinhvectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·sinhTrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic sinh.
	MOVD	$sinhvectorfacility+0x00(SB), R1
	MOVD	$·sinh(SB), R2
	MOVD	R2, 0(R1)
	BR	·sinh(SB)
vectorimpl:
	// Vector support: route future calls to sinhAsm.
	MOVD	$sinhvectorfacility+0x00(SB), R1
	MOVD	$·sinhAsm(SB), R2
	MOVD	R2, 0(R1)
	BR	·sinhAsm(SB)

// sinhvectorfacility holds the address of the function Sinh branches to.
// It starts out pointing at the setup trampoline.
GLOBL sinhvectorfacility+0x00(SB), NOPTR, $8
DATA sinhvectorfacility+0x00(SB)/8, $·sinhTrampolineSetup(SB)
// Tanh dispatches through the 8-byte function pointer tanhvectorfacility.
// The pointer initially targets tanhTrampolineSetup, which replaces it with
// the selected implementation on first use.
TEXT ·Tanh(SB),NOSPLIT,$0
	MOVD	tanhvectorfacility+0x00(SB), R1
	BR	(R1)

// tanhTrampolineSetup selects the Tanh implementation based on the
// package-level hasVX flag, stores the chosen function's address in
// tanhvectorfacility so later calls skip this check, and then branches to
// that function to complete the current call.
//
// The function addresses must be taken with $·name(SB): "$" yields the
// address rather than loading memory at the symbol, and "·" qualifies the
// name with the current package.
TEXT ·tanhTrampolineSetup(SB),NOSPLIT,$0
	MOVB	·hasVX(SB), R1
	CMPBEQ	R1, $1, vectorimpl	// vectorfacility = 1, vector supported
	// No vector support: route future calls to the generic tanh.
	MOVD	$tanhvectorfacility+0x00(SB), R1
	MOVD	$·tanh(SB), R2
	MOVD	R2, 0(R1)
	BR	·tanh(SB)
vectorimpl:
	// Vector support: route future calls to tanhAsm.
	MOVD	$tanhvectorfacility+0x00(SB), R1
	MOVD	$·tanhAsm(SB), R2
	MOVD	R2, 0(R1)
	BR	·tanhAsm(SB)

// tanhvectorfacility holds the address of the function Tanh branches to.
// It starts out pointing at the setup trampoline.
GLOBL tanhvectorfacility+0x00(SB), NOPTR, $8
DATA tanhvectorfacility+0x00(SB)/8, $·tanhTrampolineSetup(SB)
// Tan is a frameless, NOSPLIT stub that branches straight to the unexported
// implementation tan.
TEXT ·Tan(SB),NOSPLIT,$0
BR ·tan(SB)
......@@ -71,7 +71,9 @@ var tanhQ = [...]float64{
// Tanh(±0) = ±0
// Tanh(±Inf) = ±1
// Tanh(NaN) = NaN
func Tanh(x float64) float64 {
func Tanh(x float64) float64
func tanh(x float64) float64 {
const MAXLOG = 8.8029691931113054295988e+01 // log(2**127)
z := Abs(x)
switch {
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// Minimax polynomial approximations
//
// tanhrodataL18 is a 96-byte read-only pool of twelve float64 constants used
// by tanhAsm. tanhAsm addresses it both symbolically
// (tanhrodataL18<>+N(SB)) and, in its raw-encoded cdb/adb/sdb instructions,
// as a displacement from R5, which holds the pool's base address.
//
// Layout by byte offset:
//   0..24   -1.0, -2.0, 1.0, 2.0
//   32..72  coefficients consumed by the polynomial evaluation
//   80      approximately -ln(2)/2
//   88      20.0, the value |x| is compared against (cdb at displacement 88)
DATA tanhrodataL18<>+0(SB)/8, $-1.0
DATA tanhrodataL18<>+8(SB)/8, $-2.0
DATA tanhrodataL18<>+16(SB)/8, $1.0
DATA tanhrodataL18<>+24(SB)/8, $2.0
DATA tanhrodataL18<>+32(SB)/8, $0.20000000000000011868E+01
DATA tanhrodataL18<>+40(SB)/8, $0.13333333333333341256E+01
DATA tanhrodataL18<>+48(SB)/8, $0.26666666663549111502E+00
DATA tanhrodataL18<>+56(SB)/8, $0.66666666658721844678E+00
DATA tanhrodataL18<>+64(SB)/8, $0.88890217768964374821E-01
DATA tanhrodataL18<>+72(SB)/8, $0.25397199429103821138E-01
DATA tanhrodataL18<>+80(SB)/8, $-.346573590279972643E+00
DATA tanhrodataL18<>+88(SB)/8, $20.E0
GLOBL tanhrodataL18<>+0(SB), RODATA, $96
// Constants
//
// tanhrlog2 is a float64 bit pattern, approximately 2/ln(2) (about 2.8854)
// with its low mantissa bits zeroed. tanhAsm multiplies x by it.
DATA tanhrlog2<>+0(SB)/8, $0x4007154760000000
GLOBL tanhrlog2<>+0(SB), RODATA, $8
// tanhxadd is a float64 bit pattern that tanhAsm subtracts from
// x*tanhrlog2; the low bits of that result are then read as an integer.
// NOTE(review): presumably a rounding "magic number" whose low 16 bits
// (0x3ff0) pre-bias the extracted exponent — confirm against the derivation.
DATA tanhxadd<>+0(SB)/8, $0xc2f0000100003ff0
GLOBL tanhxadd<>+0(SB), RODATA, $8
// tanhxmone is -1.0, the result tanhAsm returns for negative x with |x| >= 20.
DATA tanhxmone<>+0(SB)/8, $-1.0
GLOBL tanhxmone<>+0(SB), RODATA, $8
// tanhxzero is 0, used by tanhAsm to test the sign of x when |x| >= 20.
DATA tanhxzero<>+0(SB)/8, $0
GLOBL tanhxzero<>+0(SB), RODATA, $8
// Polynomial coefficients
//
// tanhtab is a 128-byte read-only table of sixteen float64 entries. tanhAsm
// selects one entry per call: it forms a byte offset from the low four bits
// of the integer extracted from the scaled argument (times 8) and loads
// tanhtab[offset] into F5.
DATA tanhtab<>+0(SB)/8, $0.000000000000000000E+00
DATA tanhtab<>+8(SB)/8, $-.171540871271399150E-01
DATA tanhtab<>+16(SB)/8, $-.306597931864376363E-01
DATA tanhtab<>+24(SB)/8, $-.410200970469965021E-01
DATA tanhtab<>+32(SB)/8, $-.486343079978231466E-01
DATA tanhtab<>+40(SB)/8, $-.538226193725835820E-01
DATA tanhtab<>+48(SB)/8, $-.568439602538111520E-01
DATA tanhtab<>+56(SB)/8, $-.579091847395528847E-01
DATA tanhtab<>+64(SB)/8, $-.571909584179366341E-01
DATA tanhtab<>+72(SB)/8, $-.548312665987204407E-01
DATA tanhtab<>+80(SB)/8, $-.509471843643441085E-01
DATA tanhtab<>+88(SB)/8, $-.456353588448863359E-01
DATA tanhtab<>+96(SB)/8, $-.389755254243262365E-01
DATA tanhtab<>+104(SB)/8, $-.310332908285244231E-01
DATA tanhtab<>+112(SB)/8, $-.218623539150173528E-01
DATA tanhtab<>+120(SB)/8, $-.115062908917949451E-01
GLOBL tanhtab<>+0(SB), RODATA, $128
// Tanh returns the hyperbolic tangent of the argument.
//
// Special cases are:
// Tanh(±0) = ±0
// Tanh(±Inf) = ±1
// Tanh(NaN) = NaN
// The algorithm used is minimax polynomial approximation using a table of
// polynomial coefficients determined with a Remez exchange algorithm.
//
// tanhAsm reads its float64 argument from x+0(FP) and writes the float64
// result to ret+8(FP). Several instructions are emitted as raw WORD/BYTE
// encodings; the trailing comment on each gives the intended mnemonic.
//
// Outline:
//   - x == 0 returns x unchanged (preserving the sign of zero).
//   - |x| >= 20 (or x is NaN) returns 1.0, -1.0, or x itself.
//   - otherwise the result is computed at L3 from the table entry, the scale
//     factor and the polynomial, finishing on one of three paths (fallthrough,
//     L16, L17) chosen by the magnitude of the extracted halfword in R2.
TEXT ·tanhAsm(SB),NOSPLIT,$0-16
FMOVD x+0(FP), F0
//special case Tanh(±0) = ±0
FMOVD $(0.0), F1
FCMPU F0, F1
BEQ tanhIsZero
// R5 = base of the constant pool, used as the base register by the
// raw-encoded cdb/adb/sdb instructions below.
MOVD $tanhrodataL18<>+0(SB), R5
// Set the condition code from x so that negative (or unordered) inputs
// branch to L15.
WORD $0xB3120000 //ltdbr %f0,%f0
// 0x4034000000000000 is the float64 bit pattern of 20.0. The risbgn below
// overwrites only the top 16 bits of R1, so the low 48 bits stay zero.
MOVD $0x4034000000000000, R1
BLTU L15
// Non-negative x: F1 = x.
FMOVD F0, F1
L2:
// Here F0 = x and F1 = x for x >= 0, or -x when arriving from L15.
MOVD $tanhxadd<>+0(SB), R2
FMOVD 0(R2), F2
MOVD tanhrlog2<>+0(SB), R2
WORD $0xB3C10042 //ldgr %f4,%r2
// F4 = x * tanhrlog2 - tanhxadd (fused multiply-subtract).
WFMSDB V0, V4, V2, V4
MOVD $tanhtab<>+0(SB), R3
// R2 = raw bits of F4.
WORD $0xB3CD0024 //lgdr %r2,%f4
// R4 = (R2 & 0xF) * 8: byte offset of one of the 16 tanhtab entries.
WORD $0xEC4239BC //risbg %r4,%r2,57,128+60,3
BYTE $0x03
BYTE $0x55
// Compare F1 with the 20.0 stored at offset 88 of the constant pool. The
// result is consumed by the BLT below.
WORD $0xED105058 //cdb %f1,.L19-.L18(%r5)
BYTE $0x00
BYTE $0x19
// Insert the low 16 bits of R2 into the top 16 bits of R1.
WORD $0xEC12000F //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
BYTE $0x30
BYTE $0x59
// F5 = table entry selected by R4.
WORD $0x68543000 //ld %f5,0(%r4,%r3)
// F6 = R1 reinterpreted as a float64.
// NOTE(review): presumably a power-of-two style scale factor — confirm.
WORD $0xB3C10061 //ldgr %f6,%r1
// Below the cutoff: take the polynomial path.
BLT L3
// At or above the cutoff, or NaN: the result depends only on the sign of x.
MOVD $tanhxzero<>+0(SB), R1
FMOVD 0(R1), F2
// x > 0: return 1.0 via L9.
WFCHDBS V0, V2, V4
BEQ L9
// Not (0 > x): x is neither positive nor negative (NaN); return x via L1.
WFCHDBS V2, V0, V2
BNE L1
// x < 0: return -1.0.
MOVD $tanhxmone<>+0(SB), R1
FMOVD 0(R1), F0
FMOVD F0, ret+8(FP)
RET
L3:
// Main path: argument reduction followed by polynomial evaluation.
// NOTE(review): F2 appears to become the rounded multiple and F0 the reduced
// argument (x plus F2 times the -ln(2)/2 constant at offset 80) — confirm.
FADD F4, F2
FMOVD tanhrodataL18<>+80(SB), F4
FMADD F4, F2, F0, F0
FMOVD tanhrodataL18<>+72(SB), F1
// V3 = F0 * F0.
WFMDB V0, V0, V3
FMOVD tanhrodataL18<>+64(SB), F2
WFMADB V0, V1, V2, V1
FMOVD tanhrodataL18<>+56(SB), F4
FMOVD tanhrodataL18<>+48(SB), F2
WFMADB V1, V3, V4, V1
FMOVD tanhrodataL18<>+40(SB), F4
WFMADB V3, V2, V4, V2
FMOVD tanhrodataL18<>+32(SB), F4
// Sign-extend the low halfword of R2 for the range comparisons below.
WORD $0xB9270022 //lhr %r2,%r2
WFMADB V3, V1, V4, V1
FMOVD tanhrodataL18<>+24(SB), F4
WFMADB V3, V2, V4, V3
WFMADB V0, V5, V0, V2
WFMADB V0, V1, V3, V0
WORD $0xA7183ECF //lhi %r1,16079
WFMADB V0, V2, V5, V2
FMUL F6, F2
// Halfword <= 16079: finish at L16.
MOVW R2, R10
MOVW R1, R11
CMPBLE R10, R11, L16
FMOVD F6, F0
// F0 += 1.0 (offset 16 of the constant pool).
WORD $0xED005010 //adb %f0,.L28-.L18(%r5)
BYTE $0x00
BYTE $0x1A
WORD $0xA7184330 //lhi %r1,17200
FADD F2, F0
// Halfword > 17200: finish at L17.
MOVW R2, R10
MOVW R1, R11
CMPBGT R10, R11, L17
// Middle range: F6 -= 1.0, then combine with F2 and divide.
WORD $0xED605010 //sdb %f6,.L28-.L18(%r5)
BYTE $0x00
BYTE $0x1B
FADD F6, F2
WFDDB V0, V2, V0
FMOVD F0, ret+8(FP)
RET
L9:
// Positive x at or above the cutoff: result is 1.0.
FMOVD tanhrodataL18<>+16(SB), F0
L1:
// Store whatever is in F0 (1.0 from L9, or the unmodified x) and return.
FMOVD F0, ret+8(FP)
RET
L15:
// Negative (or unordered) x: F1 = -x, then rejoin the common path.
FNEG F0, F1
BR L2
L16:
// Small-halfword finish, using the -2.0 and -1.0 constants at offsets 8 and 0.
FADD F6, F2
FMOVD tanhrodataL18<>+8(SB), F0
FMADD F4, F2, F0, F0
FMOVD tanhrodataL18<>+0(SB), F4
FNEG F0, F0
WFMADB V0, V2, V4, V0
FMOVD F0, ret+8(FP)
RET
L17:
// Large-halfword finish: a division followed by subtracting the 1.0 constant.
WFDDB V0, V4, V0
FMOVD tanhrodataL18<>+16(SB), F2
WFSDB V0, V2, V0
FMOVD F0, ret+8(FP)
RET
tanhIsZero: //return ±0
FMOVD F0, ret+8(FP)
RET
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment