Commit e4371fb1 authored by Russ Cox's avatar Russ Cox

time: optimize Now on darwin, windows

Fetch both monotonic and wall time together when possible.
Avoids skew and is cheaper.

Also shave a few ns off in conversion in package time.

Compared to current implementation (after monotonic changes):

name   old time/op  new time/op  delta
Now    19.6ns ± 1%   9.7ns ± 1%  -50.63%  (p=0.000 n=41+49) darwin/amd64
Now    23.5ns ± 4%  10.6ns ± 5%  -54.61%  (p=0.000 n=30+28) windows/amd64
Now    54.5ns ± 5%  29.8ns ± 9%  -45.40%  (p=0.000 n=27+29) windows/386

More importantly, compared to Go 1.8:

name   old time/op  new time/op  delta
Now     9.5ns ± 1%   9.7ns ± 1%   +1.94%  (p=0.000 n=41+49) darwin/amd64
Now    12.9ns ± 5%  10.6ns ± 5%  -17.73%  (p=0.000 n=30+28) windows/amd64
Now    15.3ns ± 5%  29.8ns ± 9%  +94.36%  (p=0.000 n=30+29) windows/386

This brings time.Now back in line with Go 1.8 on darwin/amd64 and windows/amd64.

It's not obvious why windows/386 is still noticeably worse than Go 1.8,
but it's better than before this CL. The windows/386 speed is not too
important; the changes just keep the two architectures similar.

Change-Id: If69b94970c8a1a57910a371ee91e0d4e82e46c5d
Reviewed-on: https://go-review.googlesource.com/36428
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
parent 3a6842a0
......@@ -548,7 +548,7 @@ func dumpmemstats() {
dumpint(memstats.gc_sys)
dumpint(memstats.other_sys)
dumpint(memstats.next_gc)
dumpint(memstats.last_gc)
dumpint(memstats.last_gc_unix)
dumpint(memstats.pause_total_ns)
for i := 0; i < 256; i++ {
dumpint(memstats.pause_ns[i])
......
......@@ -1291,10 +1291,13 @@ func gcMarkTermination() {
}
// Update timing memstats
now, unixNow := nanotime(), unixnanotime()
now := nanotime()
sec, nsec, _ := time_now()
unixNow := sec*1e9 + int64(nsec)
work.pauseNS += now - work.pauseStart
work.tEnd = now
atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
memstats.pause_total_ns += uint64(work.pauseNS)
......
......@@ -72,7 +72,7 @@ type mstats struct {
// Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
last_gc uint64 // last gc (in absolute time)
last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
......@@ -92,6 +92,7 @@ type mstats struct {
// Statistics below here are not exported to MemStats directly.
last_gc_nanotime uint64 // last gc (monotonic time)
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
// gc_trigger is the heap size that triggers marking.
......@@ -497,7 +498,7 @@ func readGCStats_m(pauses *[]uint64) {
p[n+i] = memstats.pause_end[j]
}
p[n+n] = memstats.last_gc
p[n+n] = memstats.last_gc_unix
p[n+n+1] = uint64(memstats.numgc)
p[n+n+2] = memstats.pause_total_ns
unlock(&mheap_.lock)
......
......@@ -578,50 +578,7 @@ func unminit() {
*tp = 0
}
// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
type _KSYSTEM_TIME struct {
LowPart uint32
High1Time int32
High2Time int32
}
const (
_INTERRUPT_TIME = 0x7ffe0008
_SYSTEM_TIME = 0x7ffe0014
)
//go:nosplit
func systime(addr uintptr) int64 {
timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
var t _KSYSTEM_TIME
for i := 1; i < 10000; i++ {
// these fields must be read in that order (see URL above)
t.High1Time = timeaddr.High1Time
t.LowPart = timeaddr.LowPart
t.High2Time = timeaddr.High2Time
if t.High1Time == t.High2Time {
return int64(t.High1Time)<<32 | int64(t.LowPart)
}
if (i % 100) == 0 {
osyield()
}
}
systemstack(func() {
throw("interrupt/system time is changing too fast")
})
return 0
}
//go:nosplit
func unixnano() int64 {
return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
}
//go:nosplit
func nanotime() int64 {
return systime(_INTERRUPT_TIME) * 100
}
func nanotime() int64
// Calling stdcall on os stack.
// May run during STW, so write barriers are not allowed.
......
......@@ -3818,7 +3818,6 @@ func sysmon() {
// poll network if not polled for more than 10ms
lastpoll := int64(atomic.Load64(&sched.lastpoll))
now := nanotime()
unixnow := unixnanotime()
if lastpoll != 0 && lastpoll+10*1000*1000 < now {
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
gp := netpoll(false) // non-blocking - returns list of goroutines
......@@ -3843,8 +3842,8 @@ func sysmon() {
idle++
}
// check if we need to force a GC
lastgc := int64(atomic.Load64(&memstats.last_gc))
if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
if gcphase == _GCoff && lastgc != 0 && now-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
lock(&forcegc.lock)
forcegc.idle = 0
forcegc.g.schedlink = 0
......
......@@ -241,8 +241,6 @@ func stackBarrier()
// in asm_*.s
func return0()
func walltime() (sec int64, nsec int32)
// in asm_*.s
// not called directly; definitions here supply type information for traceback.
func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
......@@ -279,11 +277,6 @@ func prefetcht1(addr uintptr)
func prefetcht2(addr uintptr)
func prefetchnta(addr uintptr)
func unixnanotime() int64 {
sec, nsec := walltime()
return sec*1e9 + int64(nsec)
}
// round n up to a multiple of a. a must be a power of 2.
func round(n, a uintptr) uintptr {
return (n + a - 1) &^ (a - 1)
......
......@@ -114,6 +114,16 @@ TEXT runtime·setitimer(SB),NOSPLIT,$0
// 64-bit unix nanoseconds returned in DX:AX.
// I'd much rather write this in C but we need
// assembly for the 96-bit multiply and RDTSC.
//
// Note that we could arrange to return monotonic time here
// as well, but we don't bother, for two reasons:
// 1. macOS only supports 64-bit systems, so no one should
// be using the 32-bit code in production.
// This code is only maintained to make it easier for developers
// using Macs to test the 32-bit compiler.
// 2. On some (probably now unsupported) CPUs,
// the code falls back to the system call always,
// so it can't even use the comm page at all.
TEXT runtime·now(SB),NOSPLIT,$40
MOVL $0xffff0000, BP /* comm page base */
......@@ -217,9 +227,15 @@ inreg:
ADCL $0, DX
RET
// func walltime() (sec int64, nsec int32)
TEXT runtime·walltime(SB),NOSPLIT,$0
// func now() (sec int64, nsec int32, mono uint64)
TEXT time·now(SB),NOSPLIT,$0-20
CALL runtime·now(SB)
MOVL AX, BX
MOVL DX, BP
SUBL runtime·startNano(SB), BX
SBBL runtime·startNano+4(SB), BP
MOVL BX, mono+12(FP)
MOVL BP, mono+16(FP)
MOVL $1000000000, CX
DIVL CX
MOVL AX, sec+0(FP)
......@@ -230,6 +246,8 @@ TEXT runtime·walltime(SB),NOSPLIT,$0
// func nanotime() int64
TEXT runtime·nanotime(SB),NOSPLIT,$0
CALL runtime·now(SB)
SUBL runtime·startNano(SB), AX
SBBL runtime·startNano+4(SB), DX
MOVL AX, ret_lo+0(FP)
MOVL DX, ret_hi+4(FP)
RET
......
......@@ -117,35 +117,44 @@ TEXT runtime·madvise(SB), NOSPLIT, $0
#define gtod_ns_base 0x70
#define gtod_sec_base 0x78
TEXT monotonictime<>(SB), NOSPLIT, $32
MOVQ $0x7fffffe00000, SI // comm page base
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
MOVQ $0x7fffffe00000, BP /* comm page base */
// Loop trying to take a consistent snapshot
// of the time parameters.
timeloop:
MOVL nt_generation(SI), R8
TESTL R8, R8
MOVL nt_generation(BP), R9
TESTL R9, R9
JZ timeloop
RDTSC
MOVQ nt_tsc_base(BP), R10
MOVL nt_scale(BP), R11
MOVQ nt_ns_base(BP), R12
CMPL nt_generation(BP), R9
JNE timeloop
// Gathered all the data we need. Compute monotonic time:
// ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
// The multiply and shift extracts the top 64 bits of the 96-bit product.
SHLQ $32, DX
ORQ DX, AX
MOVL nt_shift(SI), CX
SUBQ nt_tsc_base(SI), AX
SHLQ CX, AX
MOVL nt_scale(SI), CX
MULQ CX
ADDQ DX, AX
SUBQ R10, AX
MULQ R11
SHRQ $32, AX:DX
ADDQ nt_ns_base(SI), AX
CMPL nt_generation(SI), R8
JNE timeloop
ADDQ R12, AX
MOVQ runtime·startNano(SB), CX
SUBQ CX, AX
MOVQ AX, ret+0(FP)
RET
TEXT nanotime<>(SB), NOSPLIT, $32
TEXT time·now(SB), NOSPLIT, $32-24
// Note: The 32 bytes of stack frame requested on the TEXT line
// are used in the systime fallback, as the timeval address
// filled in by the system call.
MOVQ $0x7fffffe00000, BP /* comm page base */
// Loop trying to take a consistent snapshot
// of the time parameters.
timeloop:
MOVL gtod_generation(BP), R8
TESTL R8, R8
JZ systime
MOVL nt_generation(BP), R9
TESTL R9, R9
JZ timeloop
......@@ -160,8 +169,8 @@ timeloop:
CMPL gtod_generation(BP), R8
JNE timeloop
// Gathered all the data we need. Compute time.
// ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9
// Gathered all the data we need. Compute:
// monotonic_time = ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
// The multiply and shift extracts the top 64 bits of the 96-bit product.
SHLQ $32, DX
ADDQ DX, AX
......@@ -169,9 +178,33 @@ timeloop:
MULQ R11
SHRQ $32, AX:DX
ADDQ R12, AX
MOVQ AX, BX
MOVQ runtime·startNano(SB), CX
SUBQ CX, BX
MOVQ BX, monotonic+16(FP)
// Compute:
// wall_time = monotonic time - gtod_ns_base + gtod_sec_base*1e9
// or, if gtod_generation==0, invoke the system call.
TESTL R8, R8
JZ systime
SUBQ R13, AX
IMULQ $1000000000, R14
ADDQ R14, AX
// Split wall time into sec, nsec.
// generated code for
// func f(x uint64) (uint64, uint64) { return x/1e9, x%1e9 }
// adapted to reduce duplication
MOVQ AX, CX
SHRQ $9, AX
MOVQ $19342813113834067, DX
MULQ DX
SHRQ $11, DX
MOVQ DX, sec+0(FP)
IMULQ $1000000000, DX
SUBQ DX, CX
MOVL CX, nsec+8(FP)
RET
systime:
......@@ -187,34 +220,9 @@ systime:
MOVL 8(SP), DX
inreg:
// sec is in AX, usec in DX
// return nsec in AX
IMULQ $1000000000, AX
IMULQ $1000, DX
ADDQ DX, AX
RET
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
CALL monotonictime<>(SB)
MOVQ AX, ret+0(FP)
RET
// func walltime() (sec int64, nsec int32)
TEXT runtime·walltime(SB),NOSPLIT,$0-12
CALL nanotime<>(SB)
// generated code for
// func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
// adapted to reduce duplication
MOVQ AX, CX
MOVQ $1360296554856532783, AX
MULQ CX
ADDQ CX, DX
RCRQ $1, DX
SHRQ $29, DX
MOVQ DX, sec+0(FP)
IMULQ $1000000000, DX
SUBQ DX, CX
MOVL CX, nsec+8(FP)
MOVQ AX, sec+0(FP)
MOVL DX, nsec+8(FP)
RET
TEXT runtime·sigprocmask(SB),NOSPLIT,$0
......
......@@ -432,15 +432,103 @@ TEXT runtime·switchtothread(SB),NOSPLIT,$0
MOVL BP, SP
RET
// func walltime() (sec int64, nsec int32)
TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·unixnano(SB)
MOVL 0(SP), AX
MOVL 4(SP), DX
// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
#define _INTERRUPT_TIME 0x7ffe0008
#define _SYSTEM_TIME 0x7ffe0014
#define time_lo 0
#define time_hi1 4
#define time_hi2 8
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
loop:
MOVL (_INTERRUPT_TIME+time_hi1), AX
MOVL (_INTERRUPT_TIME+time_lo), CX
MOVL (_INTERRUPT_TIME+time_hi2), DI
CMPL AX, DI
JNE loop
// wintime = DI:CX, multiply by 100
MOVL $100, AX
MULL CX
IMULL $100, DI
ADDL DI, DX
// wintime*100 = DX:AX, subtract startNano and return
SUBL runtime·startNano+0(SB), AX
SBBL runtime·startNano+4(SB), DX
MOVL AX, ret+0(FP)
MOVL DX, ret+4(FP)
RET
TEXT time·now(SB),NOSPLIT,$0-20
loop:
MOVL (_INTERRUPT_TIME+time_hi1), AX
MOVL (_INTERRUPT_TIME+time_lo), CX
MOVL (_INTERRUPT_TIME+time_hi2), DI
CMPL AX, DI
JNE loop
// w = DI:CX
// multiply by 100
MOVL $100, AX
MULL CX
IMULL $100, DI
ADDL DI, DX
// w*100 = DX:AX
// subtract startNano and save for return
SUBL runtime·startNano+0(SB), AX
SBBL runtime·startNano+4(SB), DX
MOVL AX, mono+12(FP)
MOVL DX, mono+16(FP)
wall:
MOVL (_SYSTEM_TIME+time_hi1), CX
MOVL (_SYSTEM_TIME+time_lo), AX
MOVL (_SYSTEM_TIME+time_hi2), DX
CMPL CX, DX
JNE wall
// w = DX:AX
// convert to Unix epoch (but still 100ns units)
#define delta 116444736000000000
SUBL $(delta & 0xFFFFFFFF), AX
SBBL $(delta >> 32), DX
// nano/100 = DX:AX
// split into two decimal halves by div 1e9.
// (decimal point is two spots over from correct place,
// but we avoid overflow in the high word.)
MOVL $1000000000, CX
DIVL CX
MOVL AX, DI
MOVL DX, SI
// DI = nano/100/1e9 = nano/1e11 = sec/100, DX = SI = nano/100%1e9
// split DX into seconds and nanoseconds by div 1e7 magic multiply.
MOVL DX, AX
MOVL $1801439851, CX
MULL CX
SHRL $22, DX
MOVL DX, BX
IMULL $10000000, DX
MOVL SI, CX
SUBL DX, CX
// DI = sec/100 (still)
// BX = (nano/100%1e9)/1e7 = (nano/1e9)%100 = sec%100
// CX = (nano/100%1e9)%1e7 = (nano%1e9)/100 = nsec/100
// store nsec for return
IMULL $100, CX
MOVL CX, nsec+8(FP)
// DI = sec/100 (still)
// BX = sec%100
// construct DX:AX = 64-bit sec and store for return
MOVL $0, DX
MOVL $100, AX
MULL DI
ADDL BX, AX
ADCL $0, DX
MOVL AX, sec+0(FP)
MOVL $0, sec+4(FP)
MOVL DX, nsec+8(FP)
MOVL DX, sec+4(FP)
RET
......@@ -465,10 +465,55 @@ TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
MOVQ 32(SP), SP
RET
// func walltime() (sec int64, nsec int32)
TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·unixnano(SB)
MOVQ 0(SP), AX
// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
#define _INTERRUPT_TIME 0x7ffe0008
#define _SYSTEM_TIME 0x7ffe0014
#define time_lo 0
#define time_hi1 4
#define time_hi2 8
TEXT runtime·nanotime(SB),NOSPLIT,$0-8
MOVQ $_INTERRUPT_TIME, DI
loop:
MOVL time_hi1(DI), AX
MOVL time_lo(DI), BX
MOVL time_hi2(DI), CX
CMPL AX, CX
JNE loop
SHLQ $32, CX
ORQ BX, CX
IMULQ $100, CX
SUBQ runtime·startNano(SB), CX
MOVQ CX, ret+0(FP)
RET
TEXT time·now(SB),NOSPLIT,$0-24
MOVQ $_INTERRUPT_TIME, DI
loop:
MOVL time_hi1(DI), AX
MOVL time_lo(DI), BX
MOVL time_hi2(DI), CX
CMPL AX, CX
JNE loop
SHLQ $32, AX
ORQ BX, AX
IMULQ $100, AX
SUBQ runtime·startNano(SB), AX
MOVQ AX, mono+16(FP)
MOVQ $_SYSTEM_TIME, DI
wall:
MOVL time_hi1(DI), AX
MOVL time_lo(DI), BX
MOVL time_hi2(DI), CX
CMPL AX, CX
JNE wall
SHLQ $32, AX
ORQ BX, AX
MOVQ $116444736000000000, DI
SUBQ DI, AX
IMULQ $100, AX
// generated code for
// func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
......@@ -484,4 +529,3 @@ TEXT runtime·walltime(SB),NOSPLIT,$8-12
SUBQ DX, CX
MOVL CX, nsec+8(FP)
RET
......@@ -302,10 +302,4 @@ func time_runtimeNano() int64 {
return nanotime()
}
var startNano = nanotime()
//go:linkname time_now time.now
func time_now() (sec int64, nsec int32, mono uint64) {
sec, nsec = walltime()
return sec, nsec, uint64(nanotime() - startNano + 1)
}
var startNano int64 = nanotime()
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Declarations for operating systems implementing time.now directly in assembly.
// Those systems are also expected to have nanotime subtract startNano,
// so that time.now and nanotime return the same monotonic clock readings.
// +build darwin,amd64 darwin,386 windows
package runtime
import _ "unsafe"
//go:linkname time_now time.now
func time_now() (sec int64, nsec int32, mono int64)
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Declarations for operating systems implementing time.now
// indirectly, in terms of walltime and nanotime assembly.
// +build !darwin !amd64,!386
// +build !windows
package runtime
import _ "unsafe" // for go:linkname
func walltime() (sec int64, nsec int32)
//go:linkname time_now time.now
func time_now() (sec int64, nsec int32, mono int64) {
sec, nsec = walltime()
return sec, nsec, nanotime() - startNano
}
......@@ -246,6 +246,9 @@ var monotonicStringTests = []struct {
}
func TestMonotonicString(t *testing.T) {
t1 := Now()
t.Logf("Now() = %v", t1)
for _, tt := range monotonicStringTests {
t1 := Now()
SetMono(&t1, tt.mono)
......
......@@ -981,14 +981,16 @@ func daysIn(m Month, year int) int {
}
// Provided by package runtime.
func now() (sec int64, nsec int32, mono uint64)
func now() (sec int64, nsec int32, mono int64)
// Now returns the current local time.
func Now() Time {
sec, nsec, mono := now()
t := unixTime(sec, nsec)
t.setMono(int64(mono))
return t
sec += unixToInternal - minWall
if uint64(sec)>>33 != 0 {
return Time{uint64(nsec), sec + minWall, Local}
}
return Time{hasMonotonic | uint64(sec)<<nsecShift | uint64(nsec), mono, Local}
}
func unixTime(sec int64, nsec int32) Time {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment