Commit 64f22e4b authored by Agniva De Sarker's avatar Agniva De Sarker Committed by Nigel Tao

image/jpeg: reduce bound checks from idct and fdct

Before -
$gotip build -gcflags="-d=ssa/check_bce/debug=1" fdct.go idct.go
./fdct.go:89:10: Found IsInBounds
./fdct.go:90:10: Found IsInBounds
./fdct.go:91:10: Found IsInBounds
./fdct.go:92:10: Found IsInBounds
./fdct.go:93:10: Found IsInBounds
./fdct.go:94:10: Found IsInBounds
./fdct.go:95:10: Found IsInBounds
./fdct.go:96:10: Found IsInBounds
./idct.go:77:9: Found IsInBounds
./idct.go:77:27: Found IsInBounds
./idct.go:77:45: Found IsInBounds
./idct.go:78:7: Found IsInBounds
./idct.go:78:25: Found IsInBounds
./idct.go:78:43: Found IsInBounds
./idct.go:78:61: Found IsInBounds
./idct.go:79:13: Found IsInBounds
./idct.go:92:13: Found IsInBounds
./idct.go:93:12: Found IsInBounds
./idct.go:94:12: Found IsInBounds
./idct.go:95:12: Found IsInBounds
./idct.go:97:12: Found IsInBounds
./idct.go:98:12: Found IsInBounds
./idct.go:99:12: Found IsInBounds

After -
$gotip build -gcflags="-d=ssa/check_bce/debug=1" fdct.go idct.go
./fdct.go:90:9: Found IsSliceInBounds
./idct.go:76:11: Found IsSliceInBounds
./idct.go:145:11: Found IsSliceInBounds

name                 old time/op    new time/op    delta
FDCT-4                 1.85µs ± 2%    1.74µs ± 1%  -5.95%  (p=0.000 n=10+10)
IDCT-4                 1.94µs ± 2%    1.89µs ± 1%  -2.67%  (p=0.000 n=10+9)
DecodeBaseline-4       1.45ms ± 2%    1.46ms ± 1%    ~     (p=0.156 n=9+10)
DecodeProgressive-4    2.21ms ± 1%    2.21ms ± 1%    ~     (p=0.796 n=10+10)
EncodeRGBA-4           24.9ms ± 1%    25.0ms ± 1%    ~     (p=0.075 n=10+10)
EncodeYCbCr-4          26.1ms ± 1%    26.2ms ± 1%    ~     (p=0.573 n=8+10)

name                 old speed      new speed      delta
DecodeBaseline-4     42.5MB/s ± 2%  42.4MB/s ± 1%    ~     (p=0.162 n=9+10)
DecodeProgressive-4  27.9MB/s ± 1%  27.9MB/s ± 1%    ~     (p=0.796 n=10+10)
EncodeRGBA-4         49.4MB/s ± 1%  49.1MB/s ± 1%    ~     (p=0.066 n=10+10)
EncodeYCbCr-4        35.3MB/s ± 1%  35.2MB/s ± 1%    ~     (p=0.586 n=8+10)

name                 old alloc/op   new alloc/op   delta
DecodeBaseline-4       63.0kB ± 0%    63.0kB ± 0%    ~     (all equal)
DecodeProgressive-4     260kB ± 0%     260kB ± 0%    ~     (all equal)
EncodeRGBA-4           4.40kB ± 0%    4.40kB ± 0%    ~     (all equal)
EncodeYCbCr-4          4.40kB ± 0%    4.40kB ± 0%    ~     (all equal)

name                 old allocs/op  new allocs/op  delta
DecodeBaseline-4         5.00 ± 0%      5.00 ± 0%    ~     (all equal)
DecodeProgressive-4      13.0 ± 0%      13.0 ± 0%    ~     (all equal)
EncodeRGBA-4             4.00 ± 0%      4.00 ± 0%    ~     (all equal)
EncodeYCbCr-4            4.00 ± 0%      4.00 ± 0%    ~     (all equal)

Updates #24499

Change-Id: I6828d077b851817503a7c1a08235763f81bdadf9
Reviewed-on: https://go-review.googlesource.com/c/go/+/167417
Run-TryBot: Agniva De Sarker <agniva.quicksilver@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarNigel Tao <nigeltao@golang.org>
parent 2da96591
...@@ -86,14 +86,16 @@ const ( ...@@ -86,14 +86,16 @@ const (
func fdct(b *block) { func fdct(b *block) {
// Pass 1: process rows. // Pass 1: process rows.
for y := 0; y < 8; y++ { for y := 0; y < 8; y++ {
x0 := b[y*8+0] y8 := y * 8
x1 := b[y*8+1] s := b[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
x2 := b[y*8+2] x0 := s[0]
x3 := b[y*8+3] x1 := s[1]
x4 := b[y*8+4] x2 := s[2]
x5 := b[y*8+5] x3 := s[3]
x6 := b[y*8+6] x4 := s[4]
x7 := b[y*8+7] x5 := s[5]
x6 := s[6]
x7 := s[7]
tmp0 := x0 + x7 tmp0 := x0 + x7
tmp1 := x1 + x6 tmp1 := x1 + x6
...@@ -110,12 +112,12 @@ func fdct(b *block) { ...@@ -110,12 +112,12 @@ func fdct(b *block) {
tmp2 = x2 - x5 tmp2 = x2 - x5
tmp3 = x3 - x4 tmp3 = x3 - x4
b[y*8+0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits s[0] = (tmp10 + tmp11 - 8*centerJSample) << pass1Bits
b[y*8+4] = (tmp10 - tmp11) << pass1Bits s[4] = (tmp10 - tmp11) << pass1Bits
z1 := (tmp12 + tmp13) * fix_0_541196100 z1 := (tmp12 + tmp13) * fix_0_541196100
z1 += 1 << (constBits - pass1Bits - 1) z1 += 1 << (constBits - pass1Bits - 1)
b[y*8+2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits) s[2] = (z1 + tmp12*fix_0_765366865) >> (constBits - pass1Bits)
b[y*8+6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits) s[6] = (z1 - tmp13*fix_1_847759065) >> (constBits - pass1Bits)
tmp10 = tmp0 + tmp3 tmp10 = tmp0 + tmp3
tmp11 = tmp1 + tmp2 tmp11 = tmp1 + tmp2
...@@ -134,10 +136,10 @@ func fdct(b *block) { ...@@ -134,10 +136,10 @@ func fdct(b *block) {
tmp12 += z1 tmp12 += z1
tmp13 += z1 tmp13 += z1
b[y*8+1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits) s[1] = (tmp0 + tmp10 + tmp12) >> (constBits - pass1Bits)
b[y*8+3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits) s[3] = (tmp1 + tmp11 + tmp13) >> (constBits - pass1Bits)
b[y*8+5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits) s[5] = (tmp2 + tmp11 + tmp12) >> (constBits - pass1Bits)
b[y*8+7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits) s[7] = (tmp3 + tmp10 + tmp13) >> (constBits - pass1Bits)
} }
// Pass 2: process columns. // Pass 2: process columns.
// We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8. // We remove pass1Bits scaling, but leave results scaled up by an overall factor of 8.
......
...@@ -73,30 +73,31 @@ func idct(src *block) { ...@@ -73,30 +73,31 @@ func idct(src *block) {
// Horizontal 1-D IDCT. // Horizontal 1-D IDCT.
for y := 0; y < 8; y++ { for y := 0; y < 8; y++ {
y8 := y * 8 y8 := y * 8
s := src[y8 : y8+8 : y8+8] // Small cap improves performance, see https://golang.org/issue/27857
// If all the AC components are zero, then the IDCT is trivial. // If all the AC components are zero, then the IDCT is trivial.
if src[y8+1] == 0 && src[y8+2] == 0 && src[y8+3] == 0 && if s[1] == 0 && s[2] == 0 && s[3] == 0 &&
src[y8+4] == 0 && src[y8+5] == 0 && src[y8+6] == 0 && src[y8+7] == 0 { s[4] == 0 && s[5] == 0 && s[6] == 0 && s[7] == 0 {
dc := src[y8+0] << 3 dc := s[0] << 3
src[y8+0] = dc s[0] = dc
src[y8+1] = dc s[1] = dc
src[y8+2] = dc s[2] = dc
src[y8+3] = dc s[3] = dc
src[y8+4] = dc s[4] = dc
src[y8+5] = dc s[5] = dc
src[y8+6] = dc s[6] = dc
src[y8+7] = dc s[7] = dc
continue continue
} }
// Prescale. // Prescale.
x0 := (src[y8+0] << 11) + 128 x0 := (s[0] << 11) + 128
x1 := src[y8+4] << 11 x1 := s[4] << 11
x2 := src[y8+6] x2 := s[6]
x3 := src[y8+2] x3 := s[2]
x4 := src[y8+1] x4 := s[1]
x5 := src[y8+7] x5 := s[7]
x6 := src[y8+5] x6 := s[5]
x7 := src[y8+3] x7 := s[3]
// Stage 1. // Stage 1.
x8 := w7 * (x4 + x5) x8 := w7 * (x4 + x5)
...@@ -126,14 +127,14 @@ func idct(src *block) { ...@@ -126,14 +127,14 @@ func idct(src *block) {
x4 = (r2*(x4-x5) + 128) >> 8 x4 = (r2*(x4-x5) + 128) >> 8
// Stage 4. // Stage 4.
src[y8+0] = (x7 + x1) >> 8 s[0] = (x7 + x1) >> 8
src[y8+1] = (x3 + x2) >> 8 s[1] = (x3 + x2) >> 8
src[y8+2] = (x0 + x4) >> 8 s[2] = (x0 + x4) >> 8
src[y8+3] = (x8 + x6) >> 8 s[3] = (x8 + x6) >> 8
src[y8+4] = (x8 - x6) >> 8 s[4] = (x8 - x6) >> 8
src[y8+5] = (x0 - x4) >> 8 s[5] = (x0 - x4) >> 8
src[y8+6] = (x3 - x2) >> 8 s[6] = (x3 - x2) >> 8
src[y8+7] = (x7 - x1) >> 8 s[7] = (x7 - x1) >> 8
} }
// Vertical 1-D IDCT. // Vertical 1-D IDCT.
...@@ -141,16 +142,17 @@ func idct(src *block) { ...@@ -141,16 +142,17 @@ func idct(src *block) {
// Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial. // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
// However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
// we do not bother to check for the all-zero case. // we do not bother to check for the all-zero case.
s := src[x : x+57 : x+57] // Small cap improves performance, see https://golang.org/issue/27857
// Prescale. // Prescale.
y0 := (src[8*0+x] << 8) + 8192 y0 := (s[8*0] << 8) + 8192
y1 := src[8*4+x] << 8 y1 := s[8*4] << 8
y2 := src[8*6+x] y2 := s[8*6]
y3 := src[8*2+x] y3 := s[8*2]
y4 := src[8*1+x] y4 := s[8*1]
y5 := src[8*7+x] y5 := s[8*7]
y6 := src[8*5+x] y6 := s[8*5]
y7 := src[8*3+x] y7 := s[8*3]
// Stage 1. // Stage 1.
y8 := w7*(y4+y5) + 4 y8 := w7*(y4+y5) + 4
...@@ -180,13 +182,13 @@ func idct(src *block) { ...@@ -180,13 +182,13 @@ func idct(src *block) {
y4 = (r2*(y4-y5) + 128) >> 8 y4 = (r2*(y4-y5) + 128) >> 8
// Stage 4. // Stage 4.
src[8*0+x] = (y7 + y1) >> 14 s[8*0] = (y7 + y1) >> 14
src[8*1+x] = (y3 + y2) >> 14 s[8*1] = (y3 + y2) >> 14
src[8*2+x] = (y0 + y4) >> 14 s[8*2] = (y0 + y4) >> 14
src[8*3+x] = (y8 + y6) >> 14 s[8*3] = (y8 + y6) >> 14
src[8*4+x] = (y8 - y6) >> 14 s[8*4] = (y8 - y6) >> 14
src[8*5+x] = (y0 - y4) >> 14 s[8*5] = (y0 - y4) >> 14
src[8*6+x] = (y3 - y2) >> 14 s[8*6] = (y3 - y2) >> 14
src[8*7+x] = (y7 - y1) >> 14 s[8*7] = (y7 - y1) >> 14
} }
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment