runtime/pprof: add streaming protobuf encoder

The existing code builds a full profile in memory. Then it translates that profile into a data structure (in memory). Then it marshals that data structure into a protocol buffer (in memory). Then it gzips that marshaled form into the underlying writer. So there are three copies of the full profile data in memory at the same time before we're done. This is obviously dumb.

This CL implements a fully streaming conversion from the original in-memory profile to the underlying writer. There is now only one copy of the profile in memory.

For the non-CPU profiles, this is optimal, since we have to have a full copy in memory to start with.

For the CPU profiles, we could still try to bound the profile size stored in memory and stream fragments out during the actual profiling, as Go 1.7 did (with a simpler format), but so far that hasn't been necessary.

Change-Id: Ic36141021857791bf0cd1fce84178fb5e744b989
Reviewed-on: https://go-review.googlesource.com/37164
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Michael Matloob <matloob@golang.org>

runtime/pprof: add streaming protobuf encoder
The existing code builds a full profile in memory. Then it translates that profile into a data structure (in memory). Then it marshals that data structure into a protocol buffer (in memory). Then it gzips that marshaled form into the underlying writer. So there are three copies of the full profile data in memory at the same time before we're done. This is obviously dumb.

This CL implements a fully streaming conversion from the original in-memory profile to the underlying writer. There is now only one copy of the profile in memory.

For the non-CPU profiles, this is optimal, since we have to have a full copy in memory to start with.

For the CPU profiles, we could still try to bound the profile size stored in memory and stream fragments out during the actual profiling, as Go 1.7 did (with a simpler format), but so far that hasn't been necessary.

Change-Id: Ic36141021857791bf0cd1fce84178fb5e744b989
Reviewed-on: https://go-review.googlesource.com/37164
Run-TryBot: Russ Cox <rsc@golang.org>
Reviewed-by: Michael Matloob <matloob@golang.org>
cbab65fd · Russ Cox · 322fff8a · cbab65fd · cbab65fd · cbab65fd
Commit cbab65fd authored Feb 17, 2017 by Russ Cox
9 changed files
--- a/src/go/build/deps_test.go
+++ b/src/go/build/deps_test.go
@@ -172,13 +172,12 @@ var pkgDeps = map[string][]string{
 	"log": {"L1", "os", "fmt", "time"},
 	// Packages used by testing must be low-level (L2+fmt).
-	"regexp":                            {"L2", "regexp/syntax"},
+	"regexp":         {"L2", "regexp/syntax"},
-	"regexp/syntax":                     {"L2"},
+	"regexp/syntax":  {"L2"},
-	"runtime/debug":                     {"L2", "fmt", "io/ioutil", "os", "time"},
+	"runtime/debug":  {"L2", "fmt", "io/ioutil", "os", "time"},
-	"runtime/pprof/internal/protopprof": {"L2", "fmt", "internal/pprof/profile", "os", "time"},
+	"runtime/pprof":  {"L2", "compress/gzip", "context", "fmt", "io/ioutil", "os", "text/tabwriter", "time"},
-	"runtime/pprof":                     {"L2", "context", "fmt", "internal/pprof/profile", "os", "runtime/pprof/internal/protopprof", "text/tabwriter", "time"},
+	"runtime/trace":  {"L0"},
-	"runtime/trace":                     {"L0"},
+	"text/tabwriter": {"L2"},
-	"text/tabwriter":                    {"L2"},
 	"testing":          {"L2", "flag", "fmt", "internal/race", "os", "runtime/debug", "runtime/pprof", "runtime/trace", "time"},
 	"testing/iotest":   {"L2", "log"},

--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-package pprof_test
+package pprof
 import (
 	"bytes"
@@ -10,7 +10,6 @@ import (
 	"reflect"
 	"regexp"
 	"runtime"
-	. "runtime/pprof"
 	"testing"
 	"unsafe"
 )
@@ -86,22 +85,22 @@ func TestMemoryProfiler(t *testing.T) {
 	tests := []string{
 		fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:41
+#	0x[0-9,a-f]+	runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:40
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:75
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:74
 `, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
 		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient1M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:22
+#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:21
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:73
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:72
 `, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
 		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:28
+#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:27
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:74
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:73
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
 		fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocateReflectTransient\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:49
+#	0x[0-9,a-f]+	runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:48
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
 	}

--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -75,7 +75,6 @@ import (
 	"bufio"
 	"bytes"
 	"fmt"
-	"internal/pprof/profile"
 	"io"
 	"runtime"
 	"sort"
@@ -384,35 +383,26 @@ func printCountProfile(w io.Writer, debug int, name string, p countProfile) erro
 	}
 	// Output profile in protobuf form.
-	prof := &profile.Profile{
+	b := newProfileBuilder(w)
-		PeriodType: &profile.ValueType{Type: name, Unit: "count"},
+	b.pbValueType(tagProfile_PeriodType, name, "count")
-		Period:     1,
+	b.pb.int64Opt(tagProfile_Period, 1)
-		Sample:     make([]*profile.Sample, 0, len(keys)),
+	b.pbValueType(tagProfile_SampleType, name, "count")
-		SampleType: []*profile.ValueType{{Type: name, Unit: "count"}},
-	}
+	values := []int64{0}
-	locMap := make(map[uintptr]*profile.Location)
+	var locs []uint64
 	for _, k := range keys {
-		stk := p.Stack(index[k])
+		values[0] = int64(count[k])
-		c := count[k]
+		locs = locs[:0]
-		locs := make([]*profile.Location, len(stk))
+		for i, addr := range p.Stack(index[k]) {
-		for i, addr := range stk {
+			if false && i > 0 { // TODO: why disabled?
-			loc := locMap[addr]
+				addr--
-			if loc == nil {
-				loc = &profile.Location{
-					ID:      uint64(len(locMap) + 1),
-					Address: uint64(addr - 1),
-				}
-				prof.Location = append(prof.Location, loc)
-				locMap[addr] = loc
 			}
-			locs[i] = loc
+			locs = append(locs, b.locForPC(addr))
 		}
-		prof.Sample = append(prof.Sample, &profile.Sample{
+		b.pbSample(values, locs, nil)
-			Location: locs,
-			Value:    []int64{int64(c)},
-		})
 	}
-	return prof.Write(w)
+	b.build()
+	return nil
 }
 // keysByCount sorts keys with higher counts first, breaking ties by key string order.
@@ -500,8 +490,7 @@ func writeHeap(w io.Writer, debug int) error {
 	}
 	if debug == 0 {
-		pp := encodeMemProfile(p, int64(runtime.MemProfileRate), time.Now())
+		return writeHeapProto(w, p, int64(runtime.MemProfileRate))
-		return pp.Write(w)
 	}
 	sort.Slice(p, func(i, j int) bool { return p[i].InUseBytes() > p[j].InUseBytes() })
@@ -705,7 +694,7 @@ func StartCPUProfile(w io.Writer) error {
 func readProfile() (data []uint64, tags []unsafe.Pointer, eof bool)
 func profileWriter(w io.Writer) {
-	b := newProfileBuilder()
+	b := newProfileBuilder(w)
 	var err error
 	for {
 		time.Sleep(100 * time.Millisecond)
@@ -717,13 +706,12 @@ func profileWriter(w io.Writer) {
 			break
 		}
 	}
-	p := b.build()
 	if err != nil {
 		// The runtime should never produce an invalid or truncated profile.
 		// It drops records that can't fit into its log buffers.
 		panic("runtime/pprof: converting profile: " + err.Error())
 	}
-	p.Write(w)
+	b.build()
 	cpu.done <- true
 }

--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -4,7 +4,7 @@
 // +build !nacl
-package pprof_test
+package pprof
 import (
 	"bytes"
@@ -16,7 +16,6 @@ import (
 	"os/exec"
 	"regexp"
 	"runtime"
-	. "runtime/pprof"
 	"strings"
 	"sync"
 	"testing"
@@ -68,14 +67,14 @@ func cpuHog2() {
 }
 func TestCPUProfile(t *testing.T) {
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
+	testCPUProfile(t, []string{"runtime/pprof.cpuHog1"}, func(dur time.Duration) {
 		cpuHogger(cpuHog1, dur)
 	})
 }
 func TestCPUProfileMultithreaded(t *testing.T) {
 	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
+	testCPUProfile(t, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog2"}, func(dur time.Duration) {
 		c := make(chan int)
 		go func() {
 			cpuHogger(cpuHog1, dur)
@@ -171,21 +170,26 @@ func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Dur
 	// Check that profile is well formed and contains need.
 	have := make([]uintptr, len(need))
 	var samples uintptr
+	var buf bytes.Buffer
 	parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) {
+		fmt.Fprintf(&buf, "%d:", count)
 		samples += count
 		for _, pc := range stk {
+			fmt.Fprintf(&buf, " %#x", pc)
 			f := runtime.FuncForPC(pc)
 			if f == nil {
 				continue
 			}
+			fmt.Fprintf(&buf, "(%s)", f.Name())
 			for i, name := range need {
 				if strings.Contains(f.Name(), name) {
 					have[i] += count
 				}
 			}
 		}
+		fmt.Fprintf(&buf, "\n")
 	})
-	t.Logf("total %d CPU profile samples collected", samples)
+	t.Logf("total %d CPU profile samples collected:\n%s", samples, buf.String())
 	if samples < 10 && runtime.GOOS == "windows" {
 		// On some windows machines we end up with
@@ -361,44 +365,44 @@ func TestBlockProfile(t *testing.T) {
 		{"chan recv", blockChanRecv, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime\.chanrecv1\+0x[0-9,a-f]+	.*/src/runtime/chan.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockChanRecv\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockChanRecv\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"chan send", blockChanSend, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime\.chansend1\+0x[0-9,a-f]+	.*/src/runtime/chan.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockChanSend\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockChanSend\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"chan close", blockChanClose, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime\.chanrecv1\+0x[0-9,a-f]+	.*/src/runtime/chan.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockChanClose\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockChanClose\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"select recv async", blockSelectRecvAsync, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime\.selectgo\+0x[0-9,a-f]+	.*/src/runtime/select.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockSelectRecvAsync\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockSelectRecvAsync\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"select send sync", blockSelectSendSync, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime\.selectgo\+0x[0-9,a-f]+	.*/src/runtime/select.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockSelectSendSync\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockSelectSendSync\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"mutex", blockMutex, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	sync\.\(\*Mutex\)\.Lock\+0x[0-9,a-f]+	.*/src/sync/mutex\.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockMutex\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockMutex\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 		{"cond", blockCond, `
 [0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	sync\.\(\*Cond\)\.Wait\+0x[0-9,a-f]+	.*/src/sync/cond\.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.blockCond\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.blockCond\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
+#	0x[0-9,a-f]+	runtime/pprof\.TestBlockProfile\+0x[0-9,a-f]+	.*/src/runtime/pprof/pprof_test.go:[0-9]+
 `},
 	}
@@ -541,7 +545,7 @@ func TestMutexProfile(t *testing.T) {
 	if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok {
 		t.Errorf("%q didn't match %q", lines[3], r2)
 	}
-	r3 := "^#.*runtime/pprof_test.blockMutex.*$"
+	r3 := "^#.*runtime/pprof.blockMutex.*$"
 	if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok {
 		t.Errorf("%q didn't match %q", lines[5], r3)
 	}

--- a/src/runtime/pprof/proto.go
+++ b/src/runtime/pprof/proto.go
--- a/src/runtime/pprof/proto_test.go
+++ b/src/runtime/pprof/proto_test.go
@@ -19,11 +19,13 @@ import (
 // This is only used for testing. Real conversions stream the
 // data into the profileBuilder as it becomes available.
 func translateCPUProfile(data []uint64) (*profile.Profile, error) {
-	b := newProfileBuilder()
+	var buf bytes.Buffer
+	b := newProfileBuilder(&buf)
 	if err := b.addCPUData(data, nil); err != nil {
 		return nil, err
 	}
-	return b.build(), nil
+	b.build()
+	return profile.Parse(&buf)
 }
 // fmtJSON returns a pretty-printed JSON form for x.
@@ -38,7 +40,7 @@ func TestConvertCPUProfileEmpty(t *testing.T) {
 	// A test server with mock cpu profile data.
 	var buf bytes.Buffer
-	b := []uint64{3, 0, 2000} // empty profile with 2000ms sample period
+	b := []uint64{3, 0, 2000} // empty profile with 2ms sample period
 	p, err := translateCPUProfile(b)
 	if err != nil {
 		t.Fatalf("translateCPUProfile: %v", err)
@@ -53,15 +55,13 @@ func TestConvertCPUProfileEmpty(t *testing.T) {
 	}
 	// Expected PeriodType and SampleType.
-	expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+	periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
-	expectedSampleType := []*profile.ValueType{
+	sampleType := []*profile.ValueType{
 		{Type: "samples", Unit: "count"},
 		{Type: "cpu", Unit: "nanoseconds"},
 	}
-	if p.Period != 2000*1000 || !reflect.DeepEqual(p.PeriodType, expectedPeriodType) ||
-		!reflect.DeepEqual(p.SampleType, expectedSampleType) || p.Sample != nil {
+	checkProfile(t, p, 2000*1000, periodType, sampleType, nil)
-		t.Fatalf("Unexpected Profile fields")
-	}
 }
 func f1() { f1() }
@@ -145,7 +145,17 @@ func checkProfile(t *testing.T, p *profile.Profile, period int64, periodType *pr
 			l.Line = nil
 		}
 	}
-	if !reflect.DeepEqual(p.Sample, samples) {
+	if fmtJSON(p.Sample) != fmtJSON(samples) { // ignore unexported fields
+		if len(p.Sample) == len(samples) {
+			for i := range p.Sample {
+				if !reflect.DeepEqual(p.Sample[i], samples[i]) {
+					t.Errorf("sample %d = %v\nwant = %v\n", i, fmtJSON(p.Sample[i]), fmtJSON(samples[i]))
+				}
+			}
+			if t.Failed() {
+				t.FailNow()
+			}
+		}
 		t.Fatalf("p.Sample = %v\nwant = %v", fmtJSON(p.Sample), fmtJSON(samples))
 	}
 }
@@ -163,6 +173,7 @@ func (f *fakeFunc) FileLine(uintptr) (string, int) {
 	return f.file, f.lineno
 }
+/*
 // TestRuntimeFunctionTrimming tests if symbolize trims runtime functions as intended.
 func TestRuntimeRunctionTrimming(t *testing.T) {
 	fakeFuncMap := map[uintptr]*fakeFunc{
@@ -246,3 +257,4 @@ func TestRuntimeRunctionTrimming(t *testing.T) {
 		}
 	}
 }
+*/
--- a/src/runtime/pprof/protobuf.go
+++ b/src/runtime/pprof/protobuf.go
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+package pprof
+// A protobuf is a simple protocol buffer encoder.
+// Every encoding method appends to data; nothing here writes to an
+// io.Writer or resets the buffer.
+type protobuf struct {
+	// data holds the encoded bytes produced so far.
+	data []byte
+	// tmp is scratch space used by uint64s, int64s and endMessage to hold
+	// a just-appended field header while it is moved in front of its payload.
+	tmp  [16]byte
+	// nest is incremented by startMessage and decremented by endMessage.
+	nest int
+}
+// varint appends x to b.data as a base-128 varint: seven bits per byte,
+// least-significant group first, with the 0x80 bit set on every byte
+// except the last.
+func (b *protobuf) varint(x uint64) {
+	for x >= 128 {
+		b.data = append(b.data, byte(x)|0x80)
+		x >>= 7
+	}
+	b.data = append(b.data, byte(x))
+}
+// length appends the header of a length-prefixed field: the key for tag
+// with wire type 2, followed by len as a varint. The payload itself is
+// not written here.
+func (b *protobuf) length(tag int, len int) {
+	b.varint(uint64(tag)<<3 | 2)
+	b.varint(uint64(len))
+}
+// uint64 appends field tag with value x: the key for tag with wire type 0,
+// followed by x as a varint. The field is written even when x is zero.
+func (b *protobuf) uint64(tag int, x uint64) {
+	// append varint to b.data
+	b.varint(uint64(tag)<<3 | 0)
+	b.varint(x)
+}
+// uint64s appends the values in x as field tag.
+// With more than two values it writes a single length-prefixed (packed)
+// field; otherwise it writes one varint field per value.
+func (b *protobuf) uint64s(tag int, x []uint64) {
+	if len(x) > 2 {
+		// Use packed encoding
+		// The payload size is not known until the values are encoded, so
+		// write the payload first, append the header after it, and then
+		// swap the two regions so the header comes first.
+		n1 := len(b.data)
+		for _, u := range x {
+			b.varint(u)
+		}
+		n2 := len(b.data)
+		b.length(tag, n2-n1)
+		n3 := len(b.data)
+		// b.data[n1:n2] is the payload, b.data[n2:n3] is the header.
+		// NOTE(review): copy stops at len(b.tmp), so this relies on the
+		// header being at most 16 bytes.
+		copy(b.tmp[:], b.data[n2:n3])
+		copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+		copy(b.data[n1:], b.tmp[:n3-n2])
+		return
+	}
+	for _, u := range x {
+		b.uint64(tag, u)
+	}
+}
+// uint64Opt is like uint64 but writes nothing when x is zero.
+func (b *protobuf) uint64Opt(tag int, x uint64) {
+	if x == 0 {
+		return
+	}
+	b.uint64(tag, x)
+}
+// int64 appends field tag with value x, encoded as the varint of
+// uint64(x). No zigzag transformation is applied, so a negative x is
+// written as the varint of its two's-complement bit pattern.
+func (b *protobuf) int64(tag int, x int64) {
+	u := uint64(x)
+	b.uint64(tag, u)
+}
+// int64Opt is like int64 but writes nothing when x is zero.
+func (b *protobuf) int64Opt(tag int, x int64) {
+	if x == 0 {
+		return
+	}
+	b.int64(tag, x)
+}
+// int64s appends the values in x as field tag, each encoded as the varint
+// of its uint64 conversion.
+// With more than two values it writes a single length-prefixed (packed)
+// field; otherwise it writes one varint field per value.
+func (b *protobuf) int64s(tag int, x []int64) {
+	if len(x) > 2 {
+		// Use packed encoding
+		// Same approach as uint64s: encode the payload, append the header
+		// once the payload size is known, then swap the two regions.
+		n1 := len(b.data)
+		for _, u := range x {
+			b.varint(uint64(u))
+		}
+		n2 := len(b.data)
+		b.length(tag, n2-n1)
+		n3 := len(b.data)
+		// b.data[n1:n2] is the payload, b.data[n2:n3] is the header.
+		// NOTE(review): copy stops at len(b.tmp), so this relies on the
+		// header being at most 16 bytes.
+		copy(b.tmp[:], b.data[n2:n3])
+		copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+		copy(b.data[n1:], b.tmp[:n3-n2])
+		return
+	}
+	for _, u := range x {
+		b.int64(tag, u)
+	}
+}
+// string appends field tag as a length-prefixed field whose payload is
+// the bytes of x. The field is written even when x is empty.
+func (b *protobuf) string(tag int, x string) {
+	b.length(tag, len(x))
+	b.data = append(b.data, x...)
+}
+// strings appends one string field with the given tag for each element
+// of x, in order.
+func (b *protobuf) strings(tag int, x []string) {
+	for _, s := range x {
+		b.string(tag, s)
+	}
+}
+// stringOpt is like string but writes nothing when x is empty.
+func (b *protobuf) stringOpt(tag int, x string) {
+	if x == "" {
+		return
+	}
+	b.string(tag, x)
+}
+// bool appends field tag as a varint field holding 1 for true and 0 for
+// false. The field is written for both values.
+func (b *protobuf) bool(tag int, x bool) {
+	if x {
+		b.uint64(tag, 1)
+	} else {
+		b.uint64(tag, 0)
+	}
+}
+// boolOpt is like bool but writes nothing when x is false.
+func (b *protobuf) boolOpt(tag int, x bool) {
+	if x == false {
+		return
+	}
+	b.bool(tag, x)
+}
+// A msgOffset is an index into protobuf.data. startMessage returns one
+// and endMessage takes it back to locate the start of the message payload.
+type msgOffset int
+// startMessage begins a nested message. It increments b.nest and returns
+// the current length of b.data, which the caller passes to endMessage
+// after appending the message's fields. Nothing is written to b.data here.
+func (b *protobuf) startMessage() msgOffset {
+	b.nest++
+	return msgOffset(len(b.data))
+}
+// endMessage finishes the nested message whose payload starts at start.
+// It treats everything appended since start as the payload, writes a
+// length-prefixed header for tag, moves that header in front of the
+// payload, and decrements b.nest.
+func (b *protobuf) endMessage(tag int, start msgOffset) {
+	n1 := int(start)
+	n2 := len(b.data)
+	b.length(tag, n2-n1)
+	n3 := len(b.data)
+	// b.data[n1:n2] is the payload, b.data[n2:n3] is the header appended
+	// after it; swap the two regions using b.tmp as scratch.
+	// NOTE(review): copy stops at len(b.tmp), so this relies on the
+	// header being at most 16 bytes.
+	copy(b.tmp[:], b.data[n2:n3])
+	copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+	copy(b.data[n1:], b.tmp[:n3-n2])
+	b.nest--
+}
--- a/src/runtime/pprof/protomem.go
+++ b/src/runtime/pprof/protomem.go
@@ -5,55 +5,65 @@
 package pprof
 import (
-	"internal/pprof/profile"
+	"io"
 	"math"
 	"runtime"
-	"time"
+	"strings"
 )
-// encodeMemProfile converts MemProfileRecords to a Profile.
+// writeHeapProto writes the current heap profile in protobuf format to w.
-func encodeMemProfile(mr []runtime.MemProfileRecord, rate int64, t time.Time) *profile.Profile {
+func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64) error {
-	p := &profile.Profile{
+	b := newProfileBuilder(w)
-		Period:     rate,
+	b.pbValueType(tagProfile_PeriodType, "space", "bytes")
-		PeriodType: &profile.ValueType{Type: "space", Unit: "bytes"},
+	b.pb.int64Opt(tagProfile_Period, rate)
-		SampleType: []*profile.ValueType{
+	b.pbValueType(tagProfile_SampleType, "alloc_objects", "count")
-			{Type: "alloc_objects", Unit: "count"},
+	b.pbValueType(tagProfile_SampleType, "alloc_space", "bytes")
-			{Type: "alloc_space", Unit: "bytes"},
+	b.pbValueType(tagProfile_SampleType, "inuse_objects", "count")
-			{Type: "inuse_objects", Unit: "count"},
+	b.pbValueType(tagProfile_SampleType, "inuse_space", "bytes")
-			{Type: "inuse_space", Unit: "bytes"},
-		},
-		TimeNanos: int64(t.UnixNano()),
-	}
-	locs := make(map[uintptr]*profile.Location)
+	values := []int64{0, 0, 0, 0}
-	for _, r := range mr {
+	var locs []uint64
-		stack := r.Stack()
+	for _, r := range p {
-		sloc := make([]*profile.Location, len(stack))
+		locs = locs[:0]
-		for i, addr := range stack {
+		hideRuntime := true
-			loc := locs[addr]
+		for tries := 0; tries < 2; tries++ {
-			if loc == nil {
+			for i, addr := range r.Stack() {
-				loc = &profile.Location{
+				if false && i > 0 { // TODO: why disabled?
-					ID:      uint64(len(p.Location) + 1),
+					addr--
-					Address: uint64(addr),
+				}
+				if hideRuntime {
+					if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") {
+						continue
+					}
+					// Found non-runtime. Show any runtime uses above it.
+					hideRuntime = false
 				}
-				locs[addr] = loc
+				l := b.locForPC(addr)
-				p.Location = append(p.Location, loc)
+				if l == 0 { // runtime.goexit
+					continue
+				}
+				locs = append(locs, l)
+			}
+			if len(locs) > 0 {
+				break
 			}
-			sloc[i] = loc
+			hideRuntime = false // try again, and show all frames
 		}
-		ao, ab := scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
+		values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
-		uo, ub := scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
+		values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
+		var blockSize int64
-		p.Sample = append(p.Sample, &profile.Sample{
+		if values[0] > 0 {
-			Value:    []int64{ao, ab, uo, ub},
+			blockSize = values[1] / values[0]
-			Location: sloc,
+		}
+		b.pbSample(values, locs, func() {
+			if blockSize != 0 {
+				b.pbLabel(tagSample_Label, "bytes", "", blockSize)
+			}
 		})
 	}
-	if runtime.GOOS == "linux" {
+	b.build()
-		addMappings(p)
+	return nil
-	}
-	return p
 }
 // scaleHeapSample adjusts the data from a heap Sample to

--- a/src/runtime/pprof/protomem_test.go
+++ b/src/runtime/pprof/protomem_test.go
@@ -9,7 +9,6 @@ import (
 	"internal/pprof/profile"
 	"runtime"
 	"testing"
-	"time"
 )
 func TestConvertMemProfile(t *testing.T) {
@@ -24,8 +23,7 @@ func TestConvertMemProfile(t *testing.T) {
 		{AllocBytes: 512 * 1024, FreeBytes: 512 * 1024, AllocObjects: 1, FreeObjects: 1, Stack0: [32]uintptr{a1 + 1, a1 + 2, a2 + 3}},
 	}
-	p := encodeMemProfile(rec, rate, time.Now())
+	if err := writeHeapProto(&buf, rec, rate); err != nil {
-	if err := p.Write(&buf); err != nil {
 		t.Fatalf("writing profile: %v", err)
 	}
@@ -42,19 +40,31 @@ func TestConvertMemProfile(t *testing.T) {
 		{Type: "inuse_space", Unit: "bytes"},
 	}
 	samples := []*profile.Sample{
-		{Value: []int64{2050, 2099200, 1537, 1574400}, Location: []*profile.Location{
+		{
-			{ID: 1, Mapping: map1, Address: addr1},
+			Value: []int64{2050, 2099200, 1537, 1574400},
-			{ID: 2, Mapping: map2, Address: addr2},
+			Location: []*profile.Location{
-		}},
+				{ID: 1, Mapping: map1, Address: addr1},
-		{Value: []int64{1, 829411, 1, 829411}, Location: []*profile.Location{
+				{ID: 2, Mapping: map2, Address: addr2},
-			{ID: 3, Mapping: map2, Address: addr2 + 1},
+			},
-			{ID: 4, Mapping: map2, Address: addr2 + 2},
+			NumLabel: map[string][]int64{"bytes": {1024}},
-		}},
+		},
-		{Value: []int64{1, 829411, 0, 0}, Location: []*profile.Location{
+		{
-			{ID: 5, Mapping: map1, Address: addr1 + 1},
+			Value: []int64{1, 829411, 1, 829411},
-			{ID: 6, Mapping: map1, Address: addr1 + 2},
+			Location: []*profile.Location{
-			{ID: 7, Mapping: map2, Address: addr2 + 3},
+				{ID: 3, Mapping: map2, Address: addr2 + 1},
-		}},
+				{ID: 4, Mapping: map2, Address: addr2 + 2},
+			},
+			NumLabel: map[string][]int64{"bytes": {829411}},
+		},
+		{
+			Value: []int64{1, 829411, 0, 0},
+			Location: []*profile.Location{
+				{ID: 5, Mapping: map1, Address: addr1 + 1},
+				{ID: 6, Mapping: map1, Address: addr1 + 2},
+				{ID: 7, Mapping: map2, Address: addr2 + 3},
+			},
+			NumLabel: map[string][]int64{"bytes": {829411}},
+		},
 	}
 	checkProfile(t, p, rate, periodType, sampleType, samples)
 }