Commit 7596e61a authored by Kevin Modzelewski's avatar Kevin Modzelewski

Python-level sampling profiler

Uses setitimer() to set a recurring signal, and prints
a Python stacktrace at the next safepoint aka allowGLReadPreemption.
This is not great since allowGLReadPreemption can happen a decent
amount later than the signal.  (I'll play around with trying to get
the signal to be acted on sooner, but it might be better to wait
for full signal-handling support.)

Still, it seems to provide some decent high-level info.  For example,
half of the startup time of the django-template benchmark seems to be
due to regular expressions.
parent d618cd63
...@@ -300,8 +300,6 @@ public: ...@@ -300,8 +300,6 @@ public:
}; };
Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block, AST_stmt* start_at) { Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block, AST_stmt* start_at) {
threading::allowGLReadPreemption();
STAT_TIMER(t0, "us_timer_astinterpreter_execute"); STAT_TIMER(t0, "us_timer_astinterpreter_execute");
void* frame_addr = __builtin_frame_address(0); void* frame_addr = __builtin_frame_address(0);
...@@ -315,6 +313,11 @@ Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block ...@@ -315,6 +313,11 @@ Value ASTInterpreter::execute(ASTInterpreter& interpreter, CFGBlock* start_block
start_at = start_block->body[0]; start_at = start_block->body[0];
} }
// Important that this happens after RegisterHelper:
interpreter.current_inst = start_at;
threading::allowGLReadPreemption();
interpreter.current_inst = NULL;
interpreter.current_block = start_block; interpreter.current_block = start_block;
bool started = false; bool started = false;
for (auto s : start_block->body) { for (auto s : start_block->body) {
......
...@@ -87,6 +87,8 @@ extern GlobalState g; ...@@ -87,6 +87,8 @@ extern GlobalState g;
// in runtime_hooks.cpp: // in runtime_hooks.cpp:
void initGlobalFuncs(GlobalState& g); void initGlobalFuncs(GlobalState& g);
extern int sigprof_pending;
DS_DECLARE_RWLOCK(codegen_rwlock); DS_DECLARE_RWLOCK(codegen_rwlock);
} }
......
...@@ -55,6 +55,8 @@ ...@@ -55,6 +55,8 @@
namespace pyston { namespace pyston {
int sigprof_pending = 0;
GlobalState g; GlobalState g;
extern "C" { extern "C" {
...@@ -356,6 +358,10 @@ static void handle_sigusr1(int signum) { ...@@ -356,6 +358,10 @@ static void handle_sigusr1(int signum) {
_printStacktrace(); _printStacktrace();
} }
static void handle_sigprof(int signum) {
sigprof_pending++;
}
static void handle_sigint(int signum) { static void handle_sigint(int signum) {
assert(signum == SIGINT); assert(signum == SIGINT);
// TODO: this should set a flag saying a KeyboardInterrupt is pending. // TODO: this should set a flag saying a KeyboardInterrupt is pending.
...@@ -461,6 +467,13 @@ void initCodegen() { ...@@ -461,6 +467,13 @@ void initCodegen() {
signal(SIGUSR1, &handle_sigusr1); signal(SIGUSR1, &handle_sigusr1);
signal(SIGINT, &handle_sigint); signal(SIGINT, &handle_sigint);
#if ENABLE_SAMPLING_PROFILER
struct itimerval prof_timer;
prof_timer.it_value.tv_sec = prof_timer.it_interval.tv_sec = 0;
prof_timer.it_value.tv_usec = prof_timer.it_interval.tv_usec = 1000;
signal(SIGPROF, handle_sigprof);
setitimer(ITIMER_PROF, &prof_timer, NULL);
#endif
// There are some parts of llvm that are only configurable through command line args, // There are some parts of llvm that are only configurable through command line args,
// so construct a fake argc/argv pair and pass it to the llvm command line machinery: // so construct a fake argc/argv pair and pass it to the llvm command line machinery:
......
...@@ -594,7 +594,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc ...@@ -594,7 +594,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc
// Function-entry safepoint: // Function-entry safepoint:
// TODO might be more efficient to do post-call safepoints? // TODO might be more efficient to do post-call safepoints?
generator->doSafePoint(); generator->doSafePoint(block->body[0]);
} else if (entry_descriptor && block == entry_descriptor->backedge->target) { } else if (entry_descriptor && block == entry_descriptor->backedge->target) {
assert(block->predecessors.size() > 1); assert(block->predecessors.size() > 1);
assert(osr_entry_block); assert(osr_entry_block);
...@@ -763,7 +763,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc ...@@ -763,7 +763,7 @@ static void emitBBs(IRGenState* irstate, TypeAnalysis* types, const OSREntryDesc
if (predecessor->idx > block->idx) { if (predecessor->idx > block->idx) {
// Loop safepoint: // Loop safepoint:
// TODO does it matter which side of the backedge these are on? // TODO does it matter which side of the backedge these are on?
generator->doSafePoint(); generator->doSafePoint(block->body[0]);
break; break;
} }
} }
......
...@@ -2618,7 +2618,15 @@ public: ...@@ -2618,7 +2618,15 @@ public:
} }
} }
void doSafePoint() override { emitter.getBuilder()->CreateCall(g.funcs.allowGLReadPreemption); } void doSafePoint(AST_stmt* next_statement) override {
// If the sampling profiler is turned on (and eventually, destructors), we need frame-introspection
// support while in allowGLReadPreemption:
#if ENABLE_SAMPLING_PROFILER
emitter.createCall(UnwindInfo(next_statement, NULL), g.funcs.allowGLReadPreemption);
#else
emitter.getBuilder()->CreateCall(g.funcs.allowGLReadPreemption);
#endif
}
}; };
IRGenerator* createIRGenerator(IRGenState* irstate, std::unordered_map<CFGBlock*, llvm::BasicBlock*>& entry_blocks, IRGenerator* createIRGenerator(IRGenState* irstate, std::unordered_map<CFGBlock*, llvm::BasicBlock*>& entry_blocks,
......
...@@ -127,7 +127,7 @@ public: ...@@ -127,7 +127,7 @@ public:
virtual void copySymbolsFrom(SymbolTable* st) = 0; virtual void copySymbolsFrom(SymbolTable* st) = 0;
virtual void run(const CFGBlock* block) = 0; // primary entry point virtual void run(const CFGBlock* block) = 0; // primary entry point
virtual EndingState getEndingSymbolTable() = 0; virtual EndingState getEndingSymbolTable() = 0;
virtual void doSafePoint() = 0; virtual void doSafePoint(AST_stmt* next_statement) = 0;
virtual void addFrameStackmapArgs(PatchpointInfo* pp, AST_stmt* current_stmt, virtual void addFrameStackmapArgs(PatchpointInfo* pp, AST_stmt* current_stmt,
std::vector<llvm::Value*>& stackmap_args) = 0; std::vector<llvm::Value*>& stackmap_args) = 0;
}; };
......
...@@ -1000,7 +1000,7 @@ AST_Module* parse_file(const char* fn) { ...@@ -1000,7 +1000,7 @@ AST_Module* parse_file(const char* fn) {
if (ENABLE_PYPA_PARSER) { if (ENABLE_PYPA_PARSER) {
AST_Module* rtn = pypa_parse(fn); AST_Module* rtn = pypa_parse(fn);
RELEASE_ASSERT(rtn, "unknown parse error"); RELEASE_ASSERT(rtn, "unknown parse error (possibly: '%s'?)", strerror(errno));
return rtn; return rtn;
} }
......
...@@ -345,7 +345,7 @@ public: ...@@ -345,7 +345,7 @@ public:
return reinterpret_cast<AST_stmt*>(readLocation(e.locations[0])); return reinterpret_cast<AST_stmt*>(readLocation(e.locations[0]));
} }
} }
abort(); RELEASE_ASSERT(0, "no frame info found at offset 0x%x / ip 0x%lx!", offset, ip);
} else if (id.type == PythonFrameId::INTERPRETED) { } else if (id.type == PythonFrameId::INTERPRETED) {
return getCurrentStatementForInterpretedFrame((void*)id.bp); return getCurrentStatementForInterpretedFrame((void*)id.bp);
} }
......
...@@ -47,6 +47,8 @@ extern bool ENABLE_ICS, ENABLE_ICGENERICS, ENABLE_ICGETITEMS, ENABLE_ICSETITEMS, ...@@ -47,6 +47,8 @@ extern bool ENABLE_ICS, ENABLE_ICGENERICS, ENABLE_ICGETITEMS, ENABLE_ICSETITEMS,
// Due to a temporary LLVM limitation, represent bools as i64's instead of i1's. // Due to a temporary LLVM limitation, represent bools as i64's instead of i1's.
extern bool BOOLS_AS_I64; extern bool BOOLS_AS_I64;
#define ENABLE_SAMPLING_PROFILER 0
} }
} }
......
...@@ -24,12 +24,14 @@ ...@@ -24,12 +24,14 @@
#include "Python.h" #include "Python.h"
#include "codegen/codegen.h" // sigprof_pending
#include "core/common.h" #include "core/common.h"
#include "core/options.h" #include "core/options.h"
#include "core/stats.h" #include "core/stats.h"
#include "core/thread_utils.h" #include "core/thread_utils.h"
#include "core/util.h" #include "core/util.h"
#include "gc/collector.h" #include "gc/collector.h"
#include "runtime/objmodel.h" // _printStacktrace
namespace pyston { namespace pyston {
namespace threading { namespace threading {
...@@ -528,6 +530,18 @@ int gil_check_count = 0; ...@@ -528,6 +530,18 @@ int gil_check_count = 0;
// We could enforce fairness by having a FIFO of events (implementd with mutexes?) // We could enforce fairness by having a FIFO of events (implementd with mutexes?)
// and make sure to always wake up the longest-waiting one. // and make sure to always wake up the longest-waiting one.
void allowGLReadPreemption() { void allowGLReadPreemption() {
#if ENABLE_SAMPLING_PROFILER
if (unlikely(sigprof_pending)) {
// Output multiple stacktraces if we received multiple signals
// between being able to handle it (such as being in LLVM or the GC),
// to try to fully account for that time.
while (sigprof_pending) {
_printStacktrace();
sigprof_pending--;
}
}
#endif
// Double-checked locking: first read with no ordering constraint: // Double-checked locking: first read with no ordering constraint:
if (!threads_waiting_on_gil.load(std::memory_order_relaxed)) if (!threads_waiting_on_gil.load(std::memory_order_relaxed))
return; return;
......
...@@ -101,7 +101,16 @@ void raiseSyntaxErrorHelper(const std::string& file, const std::string& func, AS ...@@ -101,7 +101,16 @@ void raiseSyntaxErrorHelper(const std::string& file, const std::string& func, AS
} }
void _printStacktrace() { void _printStacktrace() {
static bool recursive = false;
if (recursive) {
fprintf(stderr, "_printStacktrace ran into an issue; refusing to try it again!\n");
return;
}
recursive = true;
printTraceback(getTraceback()); printTraceback(getTraceback());
recursive = false;
} }
// where should this go... // where should this go...
......
...@@ -85,6 +85,7 @@ void printTraceback(Box* b) { ...@@ -85,6 +85,7 @@ void printTraceback(Box* b) {
fprintf(stderr, " %.*s\n", (int)r, ptr); fprintf(stderr, " %.*s\n", (int)r, ptr);
free(buf); free(buf);
} }
fclose(f);
} }
} }
} }
......
import sys
if __name__ == "__main__":
fn = sys.argv[1]
tracebacks = []
cur_traceback = []
with open(fn) as f:
for l in f:
if l.startswith("Traceback"):
if cur_traceback:
tracebacks.append(''.join(cur_traceback).strip())
cur_traceback = []
else:
cur_traceback.append(l)
if cur_traceback:
tracebacks.append(''.join(cur_traceback).strip())
cur_traceback = []
counts = {}
for t in tracebacks:
# dedupe on:
# key = t # full traceback
# key = '\n'.join(t.split('\n')[-8:]) # last 4 stack frames
# key = '\n'.join(t.split('\n')[-2:]) # last stack frame
key = t.split(' File "')[-1].split()[0][:-2] # filename of last stack trace
counts[key] = counts.get(key, 0) + 1
n = len(tracebacks)
NUM_DISPLAY = 20
entries = sorted(counts.items(), key=lambda (k, v): v)
if len(counts) > NUM_DISPLAY:
num_hidden = 0
counts_hidden = 0
for k, v in entries[:-NUM_DISPLAY]:
num_hidden += 1
counts_hidden += v
print "Hiding %d entries that occurred %d (%.1f%%) times" % (num_hidden, counts_hidden, 100.0 * counts_hidden / n)
for k, v in sorted(counts.items(), key=lambda (k, v): v)[-NUM_DISPLAY:]:
print
print "Occurs %d (%.1f%%) times:" % (v, 100.0 * v / n)
print k
print "Total tracebacks:", n
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment