Commit 7037c15e authored by Austin Clements's avatar Austin Clements

runtime/pprof: retry failed tests with longer duration

Currently we run profiling tests for around 200ms in short mode.
However, even on platforms with good profiling, these tests are
inherently flaky, especially on loaded systems like the builders.

To mitigate this, modify the profiling test harness so that if a test
fails in a way that could indicate there just weren't enough samples,
it retries with a longer duration.

This requires some adjustment to the profile checker to distinguish
"fatal" and "retryable" errors. In particular, we no longer consider
it a fatal error to get a profile with zero samples (which we
previously treated as a parse error). We replace this with a retryable
check that the total number of samples is reasonable.

Fixes #13943. Fixes #13871. Fixes #13223.

Change-Id: I9a08664a7e1734c5334b1f3792a56184fe314c4d
Reviewed-on: https://go-review.googlesource.com/18683Reviewed-by: default avatarRuss Cox <rsc@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent aff6aa0a
......@@ -23,14 +23,14 @@ import (
"unsafe"
)
func cpuHogger(f func()) {
func cpuHogger(f func(), dur time.Duration) {
// We only need to get one 100 Hz clock tick, so we've got
// a 25x safety buffer.
// a large safety buffer.
// But do at least 500 iterations (which should take about 100ms),
// otherwise TestCPUProfileMultithreaded can fail if only one
// thread is scheduled during the 250ms period.
// thread is scheduled during the testing period.
t0 := time.Now()
for i := 0; i < 500 || time.Since(t0) < 250*time.Millisecond; i++ {
for i := 0; i < 500 || time.Since(t0) < dur; i++ {
f()
}
}
......@@ -68,20 +68,20 @@ func cpuHog2() {
}
func TestCPUProfile(t *testing.T) {
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func() {
cpuHogger(cpuHog1)
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
cpuHogger(cpuHog1, dur)
})
}
func TestCPUProfileMultithreaded(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func() {
testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
c := make(chan int)
go func() {
cpuHogger(cpuHog1)
cpuHogger(cpuHog1, dur)
c <- 1
}()
cpuHogger(cpuHog2)
cpuHogger(cpuHog2, dur)
<-c
})
}
......@@ -92,8 +92,8 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
val := *(*[]uintptr)(unsafe.Pointer(&bytes))
val = val[:l]
// 5 for the header, 2 for the per-sample header on at least one sample, 3 for the trailer.
if l < 5+2+3 {
// 5 for the header, 3 for the trailer.
if l < 5+3 {
t.Logf("profile too short: %#x", val)
if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
......@@ -120,7 +120,7 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
}
}
func testCPUProfile(t *testing.T, need []string, f func()) {
func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
switch runtime.GOOS {
case "darwin":
switch runtime.GOARCH {
......@@ -138,12 +138,55 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
t.Skip("skipping on plan9")
}
var prof bytes.Buffer
if err := StartCPUProfile(&prof); err != nil {
t.Fatal(err)
const maxDuration = 5 * time.Second
// If we're running a long test, start with a long duration
// because some of the tests (e.g., TestStackBarrierProfiling)
// are trying to make sure something *doesn't* happen.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
// Profiling tests are inherently flaky, especially on a
// loaded system, such as when this test is running with
// several others under go test std. If a test fails in a way
// that could mean it just didn't run long enough, try with a
// longer duration.
for duration <= maxDuration {
var prof bytes.Buffer
if err := StartCPUProfile(&prof); err != nil {
t.Fatal(err)
}
f(duration)
StopCPUProfile()
if profileOk(t, need, prof, duration) {
return
}
duration *= 2
if duration <= maxDuration {
t.Logf("retrying with %s duration", duration)
}
}
if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
return
}
// Ignore the failure if the tests are running in a QEMU-based emulator,
// QEMU is not perfect at emulating everything.
// IN_QEMU environmental variable is set by some of the Go builders.
// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
if os.Getenv("IN_QEMU") == "1" {
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
return
}
f()
StopCPUProfile()
t.FailNow()
}
func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
ok = true
// Check that profile is well formed and contains need.
have := make([]uintptr, len(need))
......@@ -172,11 +215,18 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
// On some windows machines we end up with
// not enough samples due to coarse timer
// resolution. Let it go.
t.Skip("too few samples on Windows (golang.org/issue/10842)")
t.Log("too few samples on Windows (golang.org/issue/10842)")
return false
}
// Check that we got a reasonable number of samples.
if ideal := uintptr(duration * 100 / time.Second); samples == 0 || samples < ideal/4 {
t.Logf("too few samples; got %d, want at least %d, ideally %d", samples, ideal/4, ideal)
ok = false
}
if len(need) == 0 {
return
return ok
}
var total uintptr
......@@ -184,9 +234,8 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
total += have[i]
t.Logf("%s: %d\n", name, have[i])
}
ok := true
if total == 0 {
t.Logf("no CPU profile samples collected")
t.Logf("no samples in expected functions")
ok = false
}
// We'd like to check a reasonable minimum, like
......@@ -200,22 +249,7 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
ok = false
}
}
if !ok {
if badOS[runtime.GOOS] {
t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
return
}
// Ignore the failure if the tests are running in a QEMU-based emulator,
// QEMU is not perfect at emulating everything.
// IN_QEMU environmental variable is set by some of the Go builders.
// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
if os.Getenv("IN_QEMU") == "1" {
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
return
}
t.FailNow()
}
return ok
}
// Fork can hang if preempted with signals frequently enough (see issue 5517).
......@@ -310,11 +344,7 @@ func TestGoroutineSwitch(t *testing.T) {
// Test that profiling of division operations is okay, especially on ARM. See issue 6681.
func TestMathBigDivide(t *testing.T) {
testCPUProfile(t, nil, func() {
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
testCPUProfile(t, nil, func(duration time.Duration) {
t := time.After(duration)
pi := new(big.Int)
for {
......@@ -365,13 +395,9 @@ func TestStackBarrierProfiling(t *testing.T) {
return
}
testCPUProfile(t, nil, func() {
// This is long enough that we're likely to get one or
// two samples in stackBarrier.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
}
testCPUProfile(t, nil, func(duration time.Duration) {
// In long mode, we're likely to get one or two
// samples in stackBarrier.
t := time.After(duration)
for {
deepStack(1000)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment