Commit 7037c15e authored by Austin Clements's avatar Austin Clements

runtime/pprof: retry failed tests with longer duration

Currently we run profiling tests for around 200ms in short mode.
However, even on platforms with good profiling, these tests are
inherently flaky, especially on loaded systems like the builders.

To mitigate this, modify the profiling test harness so that if a test
fails in a way that could indicate there just weren't enough samples,
it retries with a longer duration.

This requires some adjustment to the profile checker to distinguish
"fatal" and "retryable" errors. In particular, we no longer consider
it a fatal error to get a profile with zero samples (which we
previously treated as a parse error). We replace this with a retryable
check that the total number of samples is reasonable.

Fixes #13943. Fixes #13871. Fixes #13223.

Change-Id: I9a08664a7e1734c5334b1f3792a56184fe314c4d
Reviewed-on: https://go-review.googlesource.com/18683Reviewed-by: default avatarRuss Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent aff6aa0a
...@@ -23,14 +23,14 @@ import ( ...@@ -23,14 +23,14 @@ import (
"unsafe" "unsafe"
) )
func cpuHogger(f func()) { func cpuHogger(f func(), dur time.Duration) {
// We only need to get one 100 Hz clock tick, so we've got // We only need to get one 100 Hz clock tick, so we've got
// a 25x safety buffer. // a large safety buffer.
// But do at least 500 iterations (which should take about 100ms), // But do at least 500 iterations (which should take about 100ms),
// otherwise TestCPUProfileMultithreaded can fail if only one // otherwise TestCPUProfileMultithreaded can fail if only one
// thread is scheduled during the 250ms period. // thread is scheduled during the testing period.
t0 := time.Now() t0 := time.Now()
for i := 0; i < 500 || time.Since(t0) < 250*time.Millisecond; i++ { for i := 0; i < 500 || time.Since(t0) < dur; i++ {
f() f()
} }
} }
...@@ -68,20 +68,20 @@ func cpuHog2() { ...@@ -68,20 +68,20 @@ func cpuHog2() {
} }
func TestCPUProfile(t *testing.T) { func TestCPUProfile(t *testing.T) {
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func() { testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
cpuHogger(cpuHog1) cpuHogger(cpuHog1, dur)
}) })
} }
func TestCPUProfileMultithreaded(t *testing.T) { func TestCPUProfileMultithreaded(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2)) defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func() { testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
c := make(chan int) c := make(chan int)
go func() { go func() {
cpuHogger(cpuHog1) cpuHogger(cpuHog1, dur)
c <- 1 c <- 1
}() }()
cpuHogger(cpuHog2) cpuHogger(cpuHog2, dur)
<-c <-c
}) })
} }
...@@ -92,8 +92,8 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) { ...@@ -92,8 +92,8 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
val := *(*[]uintptr)(unsafe.Pointer(&bytes)) val := *(*[]uintptr)(unsafe.Pointer(&bytes))
val = val[:l] val = val[:l]
// 5 for the header, 2 for the per-sample header on at least one sample, 3 for the trailer. // 5 for the header, 3 for the trailer.
if l < 5+2+3 { if l < 5+3 {
t.Logf("profile too short: %#x", val) t.Logf("profile too short: %#x", val)
if badOS[runtime.GOOS] { if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS) t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
...@@ -120,7 +120,7 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) { ...@@ -120,7 +120,7 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
} }
} }
func testCPUProfile(t *testing.T, need []string, f func()) { func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
switch runtime.GOOS { switch runtime.GOOS {
case "darwin": case "darwin":
switch runtime.GOARCH { switch runtime.GOARCH {
...@@ -138,12 +138,55 @@ func testCPUProfile(t *testing.T, need []string, f func()) { ...@@ -138,12 +138,55 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
t.Skip("skipping on plan9") t.Skip("skipping on plan9")
} }
var prof bytes.Buffer const maxDuration = 5 * time.Second
if err := StartCPUProfile(&prof); err != nil { // If we're running a long test, start with a long duration
t.Fatal(err) // because some of the tests (e.g., TestStackBarrierProfiling)
// are trying to make sure something *doesn't* happen.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
// Profiling tests are inherently flaky, especially on a
// loaded system, such as when this test is running with
// several others under go test std. If a test fails in a way
// that could mean it just didn't run long enough, try with a
// longer duration.
for duration <= maxDuration {
var prof bytes.Buffer
if err := StartCPUProfile(&prof); err != nil {
t.Fatal(err)
}
f(duration)
StopCPUProfile()
if profileOk(t, need, prof, duration) {
return
}
duration *= 2
if duration <= maxDuration {
t.Logf("retrying with %s duration", duration)
}
}
if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
return
}
// Ignore the failure if the tests are running in a QEMU-based emulator,
// QEMU is not perfect at emulating everything.
// IN_QEMU environmental variable is set by some of the Go builders.
// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
if os.Getenv("IN_QEMU") == "1" {
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
return
} }
f() t.FailNow()
StopCPUProfile() }
func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
ok = true
// Check that profile is well formed and contains need. // Check that profile is well formed and contains need.
have := make([]uintptr, len(need)) have := make([]uintptr, len(need))
...@@ -172,11 +215,18 @@ func testCPUProfile(t *testing.T, need []string, f func()) { ...@@ -172,11 +215,18 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
// On some windows machines we end up with // On some windows machines we end up with
// not enough samples due to coarse timer // not enough samples due to coarse timer
// resolution. Let it go. // resolution. Let it go.
t.Skip("too few samples on Windows (golang.org/issue/10842)") t.Log("too few samples on Windows (golang.org/issue/10842)")
return false
}
// Check that we got a reasonable number of samples.
if ideal := uintptr(duration * 100 / time.Second); samples == 0 || samples < ideal/4 {
t.Logf("too few samples; got %d, want at least %d, ideally %d", samples, ideal/4, ideal)
ok = false
} }
if len(need) == 0 { if len(need) == 0 {
return return ok
} }
var total uintptr var total uintptr
...@@ -184,9 +234,8 @@ func testCPUProfile(t *testing.T, need []string, f func()) { ...@@ -184,9 +234,8 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
total += have[i] total += have[i]
t.Logf("%s: %d\n", name, have[i]) t.Logf("%s: %d\n", name, have[i])
} }
ok := true
if total == 0 { if total == 0 {
t.Logf("no CPU profile samples collected") t.Logf("no samples in expected functions")
ok = false ok = false
} }
// We'd like to check a reasonable minimum, like // We'd like to check a reasonable minimum, like
...@@ -200,22 +249,7 @@ func testCPUProfile(t *testing.T, need []string, f func()) { ...@@ -200,22 +249,7 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
ok = false ok = false
} }
} }
return ok
if !ok {
if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
return
}
// Ignore the failure if the tests are running in a QEMU-based emulator,
// QEMU is not perfect at emulating everything.
// IN_QEMU environmental variable is set by some of the Go builders.
// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
if os.Getenv("IN_QEMU") == "1" {
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
return
}
t.FailNow()
}
} }
// Fork can hang if preempted with signals frequently enough (see issue 5517). // Fork can hang if preempted with signals frequently enough (see issue 5517).
...@@ -310,11 +344,7 @@ func TestGoroutineSwitch(t *testing.T) { ...@@ -310,11 +344,7 @@ func TestGoroutineSwitch(t *testing.T) {
// Test that profiling of division operations is okay, especially on ARM. See issue 6681. // Test that profiling of division operations is okay, especially on ARM. See issue 6681.
func TestMathBigDivide(t *testing.T) { func TestMathBigDivide(t *testing.T) {
testCPUProfile(t, nil, func() { testCPUProfile(t, nil, func(duration time.Duration) {
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
t := time.After(duration) t := time.After(duration)
pi := new(big.Int) pi := new(big.Int)
for { for {
...@@ -365,13 +395,9 @@ func TestStackBarrierProfiling(t *testing.T) { ...@@ -365,13 +395,9 @@ func TestStackBarrierProfiling(t *testing.T) {
return return
} }
testCPUProfile(t, nil, func() { testCPUProfile(t, nil, func(duration time.Duration) {
// This is long enough that we're likely to get one or // In long mode, we're likely to get one or two
// two samples in stackBarrier. // samples in stackBarrier.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
t := time.After(duration) t := time.After(duration)
for { for {
deepStack(1000) deepStack(1000)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment