runtime/pprof: retry failed tests with longer duration

Currently we run profiling tests for around 200ms in short mode. However, even on platforms with good profiling, these tests are inherently flaky, especially on loaded systems like the builders. To mitigate this, modify the profiling test harness so that if a test fails in a way that could indicate there just weren't enough samples, it retries with a longer duration. This requires some adjustment to the profile checker to distinguish "fatal" and "retryable" errors. In particular, we no longer consider it a fatal error to get a profile with zero samples (which we previously treated as a parse error). We replace this with a retryable check that the total number of samples is reasonable. Fixes #13943. Fixes #13871. Fixes #13223. Change-Id: I9a08664a7e1734c5334b1f3792a56184fe314c4d Reviewed-on: https://go-review.googlesource.com/18683Reviewed-by: Russ Cox <rsc@golang.org> Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>

runtime/pprof: retry failed tests with longer duration
Currently we run profiling tests for around 200ms in short mode. However, even on platforms with good profiling, these tests are inherently flaky, especially on loaded systems like the builders. To mitigate this, modify the profiling test harness so that if a test fails in a way that could indicate there just weren't enough samples, it retries with a longer duration. This requires some adjustment to the profile checker to distinguish "fatal" and "retryable" errors. In particular, we no longer consider it a fatal error to get a profile with zero samples (which we previously treated as a parse error). We replace this with a retryable check that the total number of samples is reasonable. Fixes #13943. Fixes #13871. Fixes #13223. Change-Id: I9a08664a7e1734c5334b1f3792a56184fe314c4d Reviewed-on: https://go-review.googlesource.com/18683Reviewed-by: Russ Cox <rsc@golang.org> Run-TryBot: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
7037c15e · Austin Clements · aff6aa0a · 7037c15e
Commit 7037c15e authored Jan 15, 2016 by Austin Clements
Hide whitespace changes
Inline Side-by-side

Showing with 75 additions and 49 deletions

src/runtime/pprof/pprof_test.go src/runtime/pprof/pprof_test.go +75 -49

No files found.
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -23,14 +23,14 @@ import (
 	"unsafe"
 )

-func cpuHogger(f func()) {
+func cpuHogger(f func(), dur time.Duration) {
 	// We only need to get one 100 Hz clock tick, so we've got
-	// a 25x safety buffer.
+	// a large safety buffer.
 	// But do at least 500 iterations (which should take about 100ms),
 	// otherwise TestCPUProfileMultithreaded can fail if only one
-	// thread is scheduled during the 250ms period.
+	// thread is scheduled during the testing period.
 	t0 := time.Now()
-	for i := 0; i < 500 || time.Since(t0) < 250*time.Millisecond; i++ {
+	for i := 0; i < 500 || time.Since(t0) < dur; i++ {
 		f()
 	}
 }
@@ -68,20 +68,20 @@ func cpuHog2() {
 }

 func TestCPUProfile(t *testing.T) {
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func() {
-		cpuHogger(cpuHog1)
+	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
+		cpuHogger(cpuHog1, dur)
 	})
 }

 func TestCPUProfileMultithreaded(t *testing.T) {
 	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func() {
+	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
 		c := make(chan int)
 		go func() {
-			cpuHogger(cpuHog1)
+			cpuHogger(cpuHog1, dur)
 			c <- 1
 		}()
-		cpuHogger(cpuHog2)
+		cpuHogger(cpuHog2, dur)
 		<-c
 	})
 }
@@ -92,8 +92,8 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
 	val := *(*[]uintptr)(unsafe.Pointer(&bytes))
 	val = val[:l]

-	// 5 for the header, 2 for the per-sample header on at least one sample, 3 for the trailer.
-	if l < 5+2+3 {
+	// 5 for the header, 3 for the trailer.
+	if l < 5+3 {
 		t.Logf("profile too short: %#x", val)
 		if badOS[runtime.GOOS] {
 			t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
@@ -120,7 +120,7 @@ func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
 	}
 }

-func testCPUProfile(t *testing.T, need []string, f func()) {
+func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
 	switch runtime.GOOS {
 	case "darwin":
 		switch runtime.GOARCH {
@@ -138,12 +138,55 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
 		t.Skip("skipping on plan9")
 	}

-	var prof bytes.Buffer
-	if err := StartCPUProfile(&prof); err != nil {
-		t.Fatal(err)
+	const maxDuration = 5 * time.Second
+	// If we're running a long test, start with a long duration
+	// because some of the tests (e.g., TestStackBarrierProfiling)
+	// are trying to make sure something *doesn't* happen.
+	duration := 5 * time.Second
+	if testing.Short() {
+		duration = 200 * time.Millisecond
+	}
+
+	// Profiling tests are inherently flaky, especially on a
+	// loaded system, such as when this test is running with
+	// several others under go test std. If a test fails in a way
+	// that could mean it just didn't run long enough, try with a
+	// longer duration.
+	for duration <= maxDuration {
+		var prof bytes.Buffer
+		if err := StartCPUProfile(&prof); err != nil {
+			t.Fatal(err)
+		}
+		f(duration)
+		StopCPUProfile()
+
+		if profileOk(t, need, prof, duration) {
+			return
+		}
+
+		duration *= 2
+		if duration <= maxDuration {
+			t.Logf("retrying with %s duration", duration)
+		}
+	}
+
+	if badOS[runtime.GOOS] {
+		t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
+		return
+	}
+	// Ignore the failure if the tests are running in a QEMU-based emulator,
+	// QEMU is not perfect at emulating everything.
+	// IN_QEMU environmental variable is set by some of the Go builders.
+	// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
+	if os.Getenv("IN_QEMU") == "1" {
+		t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
+		return
 	}
-	f()
-	StopCPUProfile()
+	t.FailNow()
+}
+
+func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
+	ok = true

 	// Check that profile is well formed and contains need.
 	have := make([]uintptr, len(need))
@@ -172,11 +215,18 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
 		// On some windows machines we end up with
 		// not enough samples due to coarse timer
 		// resolution. Let it go.
-		t.Skip("too few samples on Windows (golang.org/issue/10842)")
+		t.Log("too few samples on Windows (golang.org/issue/10842)")
+		return false
+	}
+
+	// Check that we got a reasonable number of samples.
+	if ideal := uintptr(duration * 100 / time.Second); samples == 0 || samples < ideal/4 {
+		t.Logf("too few samples; got %d, want at least %d, ideally %d", samples, ideal/4, ideal)
+		ok = false
 	}

 	if len(need) == 0 {
-		return
+		return ok
 	}

 	var total uintptr
@@ -184,9 +234,8 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
 		total += have[i]
 		t.Logf("%s: %d\n", name, have[i])
 	}
-	ok := true
 	if total == 0 {
-		t.Logf("no CPU profile samples collected")
+		t.Logf("no samples in expected functions")
 		ok = false
 	}
 	// We'd like to check a reasonable minimum, like
@@ -200,22 +249,7 @@ func testCPUProfile(t *testing.T, need []string, f func()) {
 			ok = false
 		}
 	}
-
-	if !ok {
-		if badOS[runtime.GOOS] {
-			t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
-			return
-		}
-		// Ignore the failure if the tests are running in a QEMU-based emulator,
-		// QEMU is not perfect at emulating everything.
-		// IN_QEMU environmental variable is set by some of the Go builders.
-		// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
-		if os.Getenv("IN_QEMU") == "1" {
-			t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
-			return
-		}
-		t.FailNow()
-	}
+	return ok
 }

 // Fork can hang if preempted with signals frequently enough (see issue 5517).
@@ -310,11 +344,7 @@ func TestGoroutineSwitch(t *testing.T) {

 // Test that profiling of division operations is okay, especially on ARM. See issue 6681.
 func TestMathBigDivide(t *testing.T) {
-	testCPUProfile(t, nil, func() {
-		duration := 5 * time.Second
-		if testing.Short() {
-			duration = 200 * time.Millisecond
-		}
+	testCPUProfile(t, nil, func(duration time.Duration) {
 		t := time.After(duration)
 		pi := new(big.Int)
 		for {
@@ -365,13 +395,9 @@ func TestStackBarrierProfiling(t *testing.T) {
 		return
 	}

-	testCPUProfile(t, nil, func() {
-		// This is long enough that we're likely to get one or
-		// two samples in stackBarrier.
-		duration := 5 * time.Second
-		if testing.Short() {
-			duration = 200 * time.Millisecond
-		}
+	testCPUProfile(t, nil, func(duration time.Duration) {
+		// In long mode, we're likely to get one or two
+		// samples in stackBarrier.
 		t := time.After(duration)
 		for {
 			deepStack(1000)