Commit e038c7e4 authored by Hana (Hyang-Ah) Kim's avatar Hana (Hyang-Ah) Kim Committed by Hyang-Ah Hana Kim

runtime/pprof: correctly encode inlined functions in CPU profile

The pprof profile proto message expects inlined functions of a PC
to be encoded in one Location entry using multiple Line entries.
https://github.com/google/pprof/blob/5e96527/proto/profile.proto#L177-L184

runtime/pprof has encoded the symbolization information by creating
a Location for each PC found in the stack trace and including info
from all the frames expanded from the PC using runtime.CallersFrames.
This assumes inlined functions are represented as a single PC in the
stack trace. (https://go-review.googlesource.com/41256)

In the recent years, behavior around inlining and the traceback
changed significantly (e.g. https://golang.org/cl/152537,
https://golang.org/issue/29582, and many changes). Now the PCs
in the stack trace represent user frames even including inline
marks. As a result, the profile proto started to allocate a Location
entry for each user frame, lose the inline information (so pprof
presented incorrect results when inlined functions are involved),
and confuse the pprof tool with those PCs made up for inline marks.

This CL attempts to detect inlined call frames from the stack traces
of CPU profiles, and organize the Location information as intended.
Currently, runtime does not provide a reliable and convenient way to
detect inlined call frames and expand user frames from a given externally
recognizable PCs. So we use heuristics to recover the groups
  - inlined call frames have nil Func field
  - inlined call frames will have the same Entry point
  - but must be careful with recursive functions that have the
    same Entry point by definition, and non-Go functions that
    may lack most of the fields of Frame.

The followup CL will address the issue with other profile types.

Change-Id: I0c9667ab016a3e898d648f31c3f82d84c15398db
Reviewed-on: https://go-review.googlesource.com/c/go/+/204636Reviewed-by: default avatarKeith Randall <khr@golang.org>
parent 33dfd352
......@@ -49,8 +49,12 @@ var (
// Must not call other functions nor access heap/globals in the loop,
// otherwise under race detector the samples will be in the race runtime.
func cpuHog1(x int) int {
return cpuHog0(x, 1e5)
}
func cpuHog0(x, n int) int {
foo := x
for i := 0; i < 1e5; i++ {
for i := 0; i < n; i++ {
if foo > 0 {
foo *= foo
} else {
......@@ -101,34 +105,69 @@ func TestCPUProfileMultithreaded(t *testing.T) {
}
func TestCPUProfileInlining(t *testing.T) {
testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
cpuHogger(inlinedCaller, &salt1, dur)
})
// Check if inlined function locations are encoded correctly. The inlinedCalee and inlinedCaller should be in one location.
for _, loc := range p.Location {
hasInlinedCallerAfterInlinedCallee, hasInlinedCallee := false, false
for _, line := range loc.Line {
if line.Function.Name == "runtime/pprof.inlinedCallee" {
hasInlinedCallee = true
}
if hasInlinedCallee && line.Function.Name == "runtime/pprof.inlinedCaller" {
hasInlinedCallerAfterInlinedCallee = true
}
}
if hasInlinedCallee != hasInlinedCallerAfterInlinedCallee {
t.Fatalf("want inlinedCallee followed by inlinedCaller, got separate Location entries:\n%v", p)
}
}
}
func inlinedCaller(x int) int {
x = inlinedCallee(x)
x = inlinedCallee(x, 1e5)
return x
}
func inlinedCallee(x int) int {
// We could just use cpuHog1, but for loops prevent inlining
// right now. :(
foo := x
i := 0
loop:
if foo > 0 {
foo *= foo
} else {
foo *= foo + 1
func inlinedCallee(x, n int) int {
return cpuHog0(x, n)
}
func TestCPUProfileRecursion(t *testing.T) {
p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) {
cpuHogger(recursionCaller, &salt1, dur)
})
// check the Location encoding was not confused by recursive calls.
for i, loc := range p.Location {
recursionFunc := 0
for _, line := range loc.Line {
if name := line.Function.Name; name == "runtime/pprof.recursionCaller" || name == "runtime/pprof.recursionCallee" {
recursionFunc++
}
}
if recursionFunc > 1 {
t.Fatalf("want at most one recursionCaller or recursionCallee in one Location, got a violating Location (index: %d):\n%v", i, p)
}
}
if i++; i < 1e5 {
goto loop
}
func recursionCaller(x int) int {
y := recursionCallee(3, x)
return y
}
func recursionCallee(n, x int) int {
if n == 0 {
return 1
}
return foo
y := inlinedCallee(x, 1e4)
return y * recursionCallee(n-1, x)
}
func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) {
func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile {
p, err := profile.Parse(bytes.NewReader(valBytes))
if err != nil {
t.Fatal(err)
......@@ -137,11 +176,12 @@ func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Loca
count := uintptr(sample.Value[0])
f(count, sample.Location, sample.Label)
}
return p
}
// testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need,
// as interpreted by matches.
func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) {
// as interpreted by matches, and returns the parsed profile.
func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile {
switch runtime.GOOS {
case "darwin":
switch runtime.GOARCH {
......@@ -195,8 +235,8 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri
f(duration)
StopCPUProfile()
if profileOk(t, matches, need, avoid, prof, duration) {
return
if p, ok := profileOk(t, matches, need, avoid, prof, duration); ok {
return p
}
duration *= 2
......@@ -217,6 +257,7 @@ func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []stri
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
}
t.FailNow()
return nil
}
func contains(slice []string, s string) bool {
......@@ -242,7 +283,7 @@ func stackContains(spec string, count uintptr, stk []*profile.Location, labels m
type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool
func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (_ *profile.Profile, ok bool) {
ok = true
// Check that profile is well formed, contains 'need', and does not contain
......@@ -251,7 +292,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p
avoidSamples := make([]uintptr, len(avoid))
var samples uintptr
var buf bytes.Buffer
parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
fmt.Fprintf(&buf, "%d:", count)
fprintStack(&buf, stk)
samples += count
......@@ -278,7 +319,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p
// not enough samples due to coarse timer
// resolution. Let it go.
t.Log("too few samples on Windows (golang.org/issue/10842)")
return false
return p, false
}
// Check that we got a reasonable number of samples.
......@@ -300,7 +341,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p
}
if len(need) == 0 {
return ok
return p, ok
}
var total uintptr
......@@ -323,7 +364,7 @@ func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, p
ok = false
}
}
return ok
return p, ok
}
// Fork can hang if preempted with signals frequently enough (see issue 5517).
......
......@@ -41,8 +41,8 @@ type profileBuilder struct {
pb protobuf
strings []string
stringMap map[string]int
locs map[uintptr]int
funcs map[string]int // Package path-qualified function name to Function.ID
locs map[uintptr]locInfo // list of locInfo starting with the given PC.
funcs map[string]int // Package path-qualified function name to Function.ID
mem []memMap
}
......@@ -207,13 +207,43 @@ func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file
b.pb.endMessage(tag, start)
}
func allFrames(addr uintptr) ([]runtime.Frame, symbolizeFlag) {
// Expand this one address using CallersFrames so we can cache
// each expansion. In general, CallersFrames takes a whole
// stack, but in this case we know there will be no skips in
// the stack and we have return PCs anyway.
frames := runtime.CallersFrames([]uintptr{addr})
frame, more := frames.Next()
if frame.Function == "runtime.goexit" {
// Short-circuit if we see runtime.goexit so the loop
// below doesn't allocate a useless empty location.
return nil, 0
}
symbolizeResult := lookupTried
if frame.PC == 0 || frame.Function == "" || frame.File == "" || frame.Line == 0 {
symbolizeResult |= lookupFailed
}
if frame.PC == 0 {
// If we failed to resolve the frame, at least make up
// a reasonable call PC. This mostly happens in tests.
frame.PC = addr - 1
}
ret := []runtime.Frame{frame}
for frame.Function != "runtime.goexit" && more == true {
frame, more = frames.Next()
ret = append(ret, frame)
}
return ret, symbolizeResult
}
// locForPC returns the location ID for addr.
// addr must a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call.
// It may emit to b.pb, so there must be no message encoding in progress.
func (b *profileBuilder) locForPC(addr uintptr) uint64 {
id := uint64(b.locs[addr])
if id != 0 {
return id
if loc, ok := b.locs[addr]; ok {
return loc.id
}
// Expand this one address using CallersFrames so we can cache
......@@ -248,8 +278,8 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 {
}
newFuncs := make([]newFunc, 0, 8)
id = uint64(len(b.locs)) + 1
b.locs[addr] = int(id)
id := uint64(len(b.locs)) + 1
b.locs[addr] = locInfo{id: id, pcs: []uintptr{addr}}
start := b.pb.startMessage()
b.pb.uint64Opt(tagLocation_ID, id)
b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC))
......@@ -293,6 +323,16 @@ func (b *profileBuilder) locForPC(addr uintptr) uint64 {
return id
}
type locInfo struct {
// location id assigned by the profileBuilder
id uint64
// sequence of PCs, including the fake PCs returned by the traceback
// to represent inlined functions
// https://github.com/golang/go/blob/d6f2f833c93a41ec1c68e49804b8387a06b131c5/src/runtime/traceback.go#L347-L368
pcs []uintptr
}
// newProfileBuilder returns a new profileBuilder.
// CPU profiling data obtained from the runtime can be added
// by calling b.addCPUData, and then the eventual profile
......@@ -305,7 +345,7 @@ func newProfileBuilder(w io.Writer) *profileBuilder {
start: time.Now(),
strings: []string{""},
stringMap: map[string]int{"": 0},
locs: map[uintptr]int{},
locs: map[uintptr]locInfo{},
funcs: map[string]int{},
}
b.readMapping()
......@@ -388,7 +428,10 @@ func (b *profileBuilder) build() {
}
values := []int64{0, 0}
var deck = &pcDeck{}
var locs []uint64
for e := b.m.all; e != nil; e = e.nextAll {
values[0] = e.count
values[1] = e.count * b.period
......@@ -402,23 +445,62 @@ func (b *profileBuilder) build() {
}
}
deck.reset()
locs = locs[:0]
for i, addr := range e.stk {
// Addresses from stack traces point to the
// next instruction after each call, except
// for the leaf, which points to where the
// signal occurred. locForPC expects return
// PCs, so increment the leaf address to look
// like a return PC.
if i == 0 {
addr++
// Addresses from stack traces point to the next instruction after each call,
// except for the leaf, which points to where the signal occurred.
// deck.add+emitLocation expects return PCs so increment the leaf address to
// look like a return PC.
e.stk[0] += 1
for stk := e.stk; len(stk) > 0; {
addr := stk[0]
if l, ok := b.locs[addr]; ok {
// first record the location if there is any pending accumulated info.
if id := b.emitLocation(deck); id > 0 {
locs = append(locs, id)
}
// then, record the cached location.
locs = append(locs, l.id)
stk = stk[len(l.pcs):] // skip the matching pcs.
continue
}
l := b.locForPC(addr)
if l == 0 { // runtime.goexit
frames, symbolizeResult := allFrames(addr)
if len(frames) == 0 { // runtime.goexit.
if id := b.emitLocation(deck); id > 0 {
locs = append(locs, id)
}
stk = stk[1:]
continue
}
locs = append(locs, l)
if added := deck.tryAdd(addr, frames, symbolizeResult); added {
stk = stk[1:]
continue
}
// add failed because this addr is not inlined with
// the existing PCs in the deck. Flush the deck and retry to
// handle this pc.
if id := b.emitLocation(deck); id > 0 {
locs = append(locs, id)
}
// check cache again - previous emitLocation added a new entry
if l, ok := b.locs[addr]; ok {
locs = append(locs, l.id)
stk = stk[len(l.pcs):] // skip the matching pcs.
} else {
deck.tryAdd(addr, frames, symbolizeResult) // must succeed.
stk = stk[1:]
}
}
if id := b.emitLocation(deck); id > 0 { // emit remaining location.
locs = append(locs, id)
}
e.stk[0] -= 1 // undo the adjustment on the leaf done before the loop.
b.pbSample(values, locs, labels)
}
......@@ -435,6 +517,133 @@ func (b *profileBuilder) build() {
b.zw.Close()
}
// pcDeck is a helper to detect a sequence of inlined functions from
// a stack trace returned by the runtime.
//
// The stack traces returned by runtime's trackback functions are fully
// expanded (at least for Go functions) and include the fake pcs representing
// inlined functions. The profile proto expects the inlined functions to be
// encoded in one Location message.
// https://github.com/google/pprof/blob/5e965273ee43930341d897407202dd5e10e952cb/proto/profile.proto#L177-L184
//
// Runtime does not directly expose whether a frame is for an inlined function
// and looking up debug info is not ideal, so we use a heuristic to filter
// the fake pcs and restore the inlined and entry functions. Inlined functions
// have the following properties:
// Frame's Func is nil (note: also true for non-Go functions), and
// Frame's Entry matches its entry function frame's Entry. (note: could also be true for recursive calls and non-Go functions),
// Frame's Name does not match its entry function frame's name.
//
// As reading and processing the pcs in a stack trace one by one (from leaf to the root),
// we use pcDeck to temporarily hold the observed pcs and their expanded frames
// until we observe the entry function frame.
type pcDeck struct {
pcs []uintptr
frames []runtime.Frame
symbolizeResult symbolizeFlag
}
func (d *pcDeck) reset() {
d.pcs = d.pcs[:0]
d.frames = d.frames[:0]
d.symbolizeResult = 0
}
// tryAdd tries to add the pc and Frames expanded from it (most likely one,
// since the stack trace is already fully expanded) and the symbolizeResult
// to the deck. If it fails the caller needs to flush the deck and retry.
func (d *pcDeck) tryAdd(pc uintptr, frames []runtime.Frame, symbolizeResult symbolizeFlag) (success bool) {
if existing := len(d.pcs); existing > 0 {
// 'frames' are all expanded from one 'pc' and represent all inlined functions
// so we check only the first one.
newFrame := frames[0]
last := d.frames[existing-1]
if last.Func != nil && newFrame.Func != nil { // Can't be an inlined frame.
return false
}
if last.Entry == 0 || newFrame.Entry == 0 { // Possibly not a Go function. Don't try to merge.
return false
}
if last.Entry != newFrame.Entry { // newFrame is for a different function.
return false
}
if last.Function == newFrame.Function { // maybe recursion.
return false
}
}
d.pcs = append(d.pcs, pc)
d.frames = append(d.frames, frames...)
d.symbolizeResult |= symbolizeResult
return true
}
// emitLocation emits the new location and function information recorded in the deck
// and returns the location ID encoded in the profile protobuf.
// It emits to b.pb, so there must be no message encoding in progress.
// It resets the deck.
func (b *profileBuilder) emitLocation(deck *pcDeck) uint64 {
defer deck.reset()
if len(deck.pcs) == 0 {
return 0
}
addr := deck.pcs[0]
firstFrame := deck.frames[0]
// We can't write out functions while in the middle of the
// Location message, so record new functions we encounter and
// write them out after the Location.
type newFunc struct {
id uint64
name, file string
}
newFuncs := make([]newFunc, 0, 8)
id := uint64(len(b.locs)) + 1
b.locs[addr] = locInfo{id: id, pcs: append([]uintptr{}, deck.pcs...)}
start := b.pb.startMessage()
b.pb.uint64Opt(tagLocation_ID, id)
b.pb.uint64Opt(tagLocation_Address, uint64(firstFrame.PC))
for _, frame := range deck.frames {
// Write out each line in frame expansion.
funcID := uint64(b.funcs[frame.Function])
if funcID == 0 {
funcID = uint64(len(b.funcs)) + 1
b.funcs[frame.Function] = int(funcID)
newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
}
b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
}
for i := range b.mem {
if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake {
b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
m := b.mem[i]
m.funcs |= deck.symbolizeResult
b.mem[i] = m
break
}
}
b.pb.endMessage(tagProfile_Location, start)
// Write out functions we found during frame expansion.
for _, fn := range newFuncs {
start := b.pb.startMessage()
b.pb.uint64Opt(tagFunction_ID, fn.id)
b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
b.pb.endMessage(tagProfile_Function, start)
}
b.flush()
return id
}
// readMapping reads /proc/self/maps and writes mappings to b.pb.
// It saves the address ranges of the mappings in b.mem for use
// when emitting locations.
......
......@@ -358,6 +358,17 @@ func TestMapping(t *testing.T) {
continue
}
}
if traceback == "Go+C" {
// The test code was arranged to have PCs from C and
// they are not symbolized.
// Check no Location containing those unsymbolized PCs contains multiple lines.
for i, loc := range prof.Location {
if !symbolized(loc) && len(loc.Line) > 1 {
t.Errorf("Location[%d] contains unsymbolized PCs and multiple lines: %v", i, loc)
}
}
}
})
}
}
......
......@@ -17,8 +17,7 @@ package main
int cpuHogCSalt1 = 0;
int cpuHogCSalt2 = 0;
void CPUHogCFunction() {
int foo = cpuHogCSalt1;
void CPUHogCFunction0(int foo) {
int i;
for (i = 0; i < 100000; i++) {
if (foo > 0) {
......@@ -30,6 +29,10 @@ void CPUHogCFunction() {
}
}
void CPUHogCFunction() {
CPUHogCFunction0(cpuHogCSalt1);
}
struct CgoTracebackArg {
uintptr_t context;
uintptr_t sigContext;
......@@ -39,8 +42,9 @@ struct CgoTracebackArg {
void CollectCgoTraceback(void* parg) {
struct CgoTracebackArg* arg = (struct CgoTracebackArg*)(parg);
arg->buf[0] = (uintptr_t)(CPUHogCFunction);
arg->buf[1] = 0;
arg->buf[0] = (uintptr_t)(CPUHogCFunction0);
arg->buf[1] = (uintptr_t)(CPUHogCFunction);
arg->buf[2] = 0;
};
*/
import "C"
......@@ -81,7 +85,6 @@ var salt1 int
var salt2 int
func cpuHogGoFunction() {
// Generates CPU profile samples including a Go call path.
for {
foo := salt1
for i := 0; i < 1e5; i++ {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment