Commit d8c0bbd8 authored by Kevin Modzelewski

Change stack crawling to not be based on unwinding

We always want to crawl the entire stack, and it's
possible to determine the extents of the stack, so just
do a scan over the entire memory range.

Also, change the way the interpreter keeps track of its roots;
we don't really need to associate the roots with a specific
interpreter frame.

This should hopefully clear up the weirdness about libunwind
trying to unwind through the pthreads assembly code, and potentially
also make stack crawling faster.
parent 9adb8f17
......@@ -66,8 +66,8 @@ git clone git://git.sv.gnu.org/libunwind.git libunwind-trunk
mkdir libunwind-trunk-install
cd libunwind-trunk
git checkout 65ac867416
# disable shared libraries because we'll be installing this in a place that the loader can't find it:
autoreconf -i
# disable shared libraries because we'll be installing this in a place that the loader can't find it:
./configure --prefix=$HOME/pyston_deps/libunwind-trunk-install --enable-shared=0
make -j4
make install
......
......@@ -29,6 +29,7 @@
#include "codegen/irgen/util.h"
#include "core/common.h"
#include "core/stats.h"
#include "core/thread_utils.h"
#include "core/util.h"
//#undef VERBOSITY
......@@ -206,25 +207,23 @@ static void set(SymMap& symbols, const llvm::BasicBlock::iterator& it, Val v) {
//#define SET(v) symbols.insert(std::make_pair(static_cast<llvm::Value*>(&(*it)), Val(v)))
}
static std::unordered_map<void*, const SymMap*> interpreter_roots;
static std::unordered_map<void*, llvm::Instruction*> cur_instruction_map;
void gatherInterpreterRootsForFrame(GCVisitor* visitor, void* frame_ptr) {
auto it = interpreter_roots.find(frame_ptr);
if (it == interpreter_roots.end()) {
printf("%p is not an interpreter frame; they are", frame_ptr);
for (const auto& p2 : interpreter_roots) {
printf(" %p", p2.first);
}
printf("\n");
abort();
}
typedef std::vector<const SymMap*> root_stack_t;
threading::PerThreadSet<root_stack_t> root_stack_set;
threading::PerThread<root_stack_t> thread_local root_stack(&root_stack_set);
// printf("Gathering roots for frame %p\n", frame_ptr);
const SymMap* symbols = it->second;
void gatherInterpreterRoots(GCVisitor* visitor) {
// In theory this lock should be superfluous since we should only call this
// inside a sequential section, but lock it anyway:
threading::LockedRegion _lock(&root_stack_set.lock);
for (const auto& p2 : *symbols) {
visitor->visitPotential(p2.second.o);
for (auto& p : root_stack_set.map) {
for (const SymMap* sym_map : *p.second) {
for (const auto& p2 : *sym_map) {
visitor->visitPotential(p2.second.o);
}
}
}
}
......@@ -236,8 +235,7 @@ public:
constexpr UnregisterHelper(void* frame_ptr) : frame_ptr(frame_ptr) {}
~UnregisterHelper() {
assert(interpreter_roots.count(frame_ptr));
interpreter_roots.erase(frame_ptr);
root_stack.value.pop_back();
assert(cur_instruction_map.count(frame_ptr));
cur_instruction_map.erase(frame_ptr);
......@@ -282,7 +280,7 @@ Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box*
SymMap symbols;
void* frame_ptr = __builtin_frame_address(0);
interpreter_roots[frame_ptr] = &symbols;
root_stack.value.push_back(&symbols);
UnregisterHelper helper(frame_ptr);
int arg_num = -1;
......
......@@ -27,7 +27,7 @@ class LineInfo;
Box* interpretFunction(llvm::Function* f, int nargs, Box* arg1, Box* arg2, Box* arg3, Box** args);
void gatherInterpreterRootsForFrame(GCVisitor* visitor, void* frame_ptr);
void gatherInterpreterRoots(GCVisitor* visitor);
const LineInfo* getLineInfoForInterpretedFrame(void* frame_ptr);
}
......
......@@ -25,6 +25,8 @@
#include <unordered_set>
#include <vector>
#define STACK_GROWS_DOWN 1
#define _STRINGIFY(N) #N
#define STRINGIFY(N) _STRINGIFY(N)
......
// Copyright (c) 2014 Dropbox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PYSTON_CORE_THREADUTILS_H
#define PYSTON_CORE_THREADUTILS_H
#include <cassert>
#include <pthread.h>
#include <unordered_map>
namespace pyston {
namespace threading {

// Scoped guard for a pthread mutex: the constructor locks `mutex` and the
// destructor unlocks it, so the mutex is held for exactly the lifetime of the
// guard object.
class LockedRegion {
private:
    pthread_mutex_t* mutex;

public:
    explicit LockedRegion(pthread_mutex_t* mutex) : mutex(mutex) { pthread_mutex_lock(mutex); }
    ~LockedRegion() { pthread_mutex_unlock(mutex); }

    // A copy would run a second destructor and unlock the same mutex twice.
    LockedRegion(const LockedRegion&) = delete;
    LockedRegion& operator=(const LockedRegion&) = delete;
};

// Registry of the per-thread values created through PerThread<T>.
//
// `map` associates each registering thread's pthread_t with a pointer to that
// thread's value; `lock` is the mutex PerThread takes while it adds or removes
// an entry. Code that walks `map` from another thread is expected to hold
// `lock` (e.g. via LockedRegion) while doing so.
template <typename T> class PerThreadSet {
public:
    pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    std::unordered_map<pthread_t, T*> map;

    PerThreadSet() = default;

    // The set embeds a mutex and is referenced by address from every
    // PerThread registered with it, so it must stay put and must not be cloned.
    PerThreadSet(const PerThreadSet&) = delete;
    PerThreadSet& operator=(const PerThreadSet&) = delete;
};

// Holds one value of type T and publishes its address in a PerThreadSet<T>
// under the id of the constructing thread.
//
// The constructor records pthread_self() and stores `&value` in the set's map
// under that id; the destructor removes the entry for the same id. Both take
// the set's lock while touching the map.
template <typename T> class PerThread {
private:
    PerThreadSet<T>* set;
    pthread_t self; // id of the thread that constructed this object

public:
    T value;

    explicit PerThread(PerThreadSet<T>* set) : set(set), self(pthread_self()) {
        LockedRegion _lock(&set->lock);
        set->map[self] = &value;
    }

    ~PerThread() {
        LockedRegion _lock(&set->lock);
        assert(set->map.count(self) == 1);
        set->map.erase(self);
    }

    // The set stores the address of `value` keyed by thread id, so a copy
    // could not be registered independently, and its destructor would erase
    // the original's entry.
    PerThread(const PerThread&) = delete;
    PerThread& operator=(const PerThread&) = delete;
};

} // namespace threading
} // namespace pyston
#endif
......@@ -23,6 +23,7 @@
#include "core/common.h"
#include "core/options.h"
#include "core/thread_utils.h"
extern "C" int start_thread(void* arg);
......@@ -39,17 +40,6 @@ int tgkill(int tgid, int tid, int sig) {
return syscall(SYS_tgkill, tgid, tid, sig);
}
class LockedRegion {
private:
pthread_mutex_t* mutex;
public:
LockedRegion(pthread_mutex_t* mutex) : mutex(mutex) { pthread_mutex_lock(mutex); }
~LockedRegion() { pthread_mutex_unlock(mutex); }
};
// Certain thread examination functions won't be valid for a brief
// period while a thread is starting up.
// To handle this, track the number of threads in an uninitialized state,
......@@ -62,9 +52,18 @@ struct ThreadStartArgs {
};
static pthread_mutex_t threading_lock = PTHREAD_MUTEX_INITIALIZER;
static std::unordered_set<pid_t> current_threads;
// Per-thread record kept in current_threads, keyed by the thread's tid.
// Instances are built with designated initializers, so the field order below
// must stay in sync with those initializer lists.
struct ThreadInfo {
// "bottom" in the sense of a stack, which in a down-growing stack is the highest address:
void* stack_bottom;
// pthread handle of the thread, taken from pthread_self() when the thread registers itself
pthread_t pthread_id;
};
static std::unordered_map<pid_t, ThreadInfo> current_threads;
static std::atomic<int> signals_waiting(0);
void* getStackBottom() {
return current_threads[gettid()].stack_bottom;
}
static int signals_waiting(0);
static std::vector<ThreadState> thread_states;
std::vector<ThreadState> getAllThreadStates() {
// TODO need to prevent new threads from starting,
......@@ -89,7 +88,8 @@ std::vector<ThreadState> getAllThreadStates() {
pid_t tgid = getpid();
pid_t mytid = gettid();
for (pid_t tid : current_threads) {
for (auto& pair : current_threads) {
pid_t tid = pair.first;
if (tid == mytid)
continue;
tgkill(tgid, tid, SIGUSR2);
......@@ -119,12 +119,19 @@ static void _thread_context_dump(int signum, siginfo_t* info, void* _context) {
printf("old rip: 0x%lx\n", context->uc_mcontext.gregs[REG_RIP]);
}
thread_states.push_back(ThreadState(tid, context));
signals_waiting--; // atomic on std::atomic
#if STACK_GROWS_DOWN
void* stack_start = (void*)context->uc_mcontext.gregs[REG_RSP];
void* stack_end = current_threads[tid].stack_bottom;
#else
void* stack_start = current_threads[tid].stack_bottom;
void* stack_end = (void*)(context->uc_mcontext.gregs[REG_RSP] + sizeof(void*));
#endif
assert(stack_start < stack_end);
thread_states.push_back(ThreadState(tid, context, stack_start, stack_end));
signals_waiting--;
}
static void* _thread_start(void* _arg) {
pid_t tid = gettid();
ThreadStartArgs* arg = static_cast<ThreadStartArgs*>(_arg);
auto start_func = arg->start_func;
Box* arg1 = arg->arg1;
......@@ -135,11 +142,33 @@ static void* _thread_start(void* _arg) {
{
LockedRegion _lock(&threading_lock);
current_threads.insert(tid);
pid_t tid = gettid();
pthread_t current_thread = pthread_self();
pthread_attr_t thread_attrs;
int code = pthread_getattr_np(current_thread, &thread_attrs);
RELEASE_ASSERT(code == 0, "");
void* stack_start;
size_t stack_size;
code = pthread_attr_getstack(&thread_attrs, &stack_start, &stack_size);
RELEASE_ASSERT(code == 0, "");
pthread_attr_destroy(&thread_attrs);
current_threads[tid] = ThreadInfo {
#if STACK_GROWS_DOWN
.stack_bottom = static_cast<char*>(stack_start) + stack_size,
#else
.stack_bottom = stack_start,
#endif
.pthread_id = current_thread,
};
num_starting_threads--;
if (VERBOSITY() >= 2)
printf("child initialized; tid=%d\n", tid);
printf("child initialized; tid=%d\n", gettid());
}
threading::GLReadRegion _glock;
......@@ -149,9 +178,9 @@ static void* _thread_start(void* _arg) {
{
LockedRegion _lock(&threading_lock);
current_threads.erase(tid);
current_threads.erase(gettid());
if (VERBOSITY() >= 2)
printf("thread tid=%d exited\n", tid);
printf("thread tid=%d exited\n", gettid());
}
return rtn;
......@@ -167,7 +196,7 @@ intptr_t start_thread(void* (*start_func)(Box*, Box*, Box*), Box* arg1, Box* arg
pthread_t thread_id;
int code = pthread_create(&thread_id, NULL, &_thread_start, args);
assert(code == 0);
RELEASE_ASSERT(code == 0, "");
if (VERBOSITY() >= 2)
printf("pthread thread_id: 0x%lx\n", thread_id);
......@@ -175,6 +204,50 @@ intptr_t start_thread(void* (*start_func)(Box*, Box*, Box*), Box* arg1, Box* arg
return thread_id;
}
// from https://www.sourceware.org/ml/guile/2000-07/msg00214.html
//
// Finds the memory mapping that holds the calling thread's stack by scanning
// /proc/self/maps for the region containing the address of a local variable.
//
// Returns the stack-sense "bottom" edge of that mapping: its end address when
// STACK_GROWS_DOWN is set, its start address otherwise. Returns NULL if the
// maps file cannot be opened, if a line has no '-' separator, or if no mapping
// contains the local.
static void* find_stack() {
    int dummy; // only its address is used, as a probe into the current stack

    FILE* input = fopen("/proc/self/maps", "r");
    if (input == NULL)
        return NULL;

    void* result = NULL;
    char* line = NULL; // buffer (re)allocated by getline(); freed once below
    size_t len = 0;

    while (getline(&line, &len, input) != -1) {
        // Every line starts with "<start>-<end>" in hex; split it at the dash.
        char* s = strchr(line, '-');
        if (s == NULL)
            break; // malformed line: give up, but still run the cleanup below
        *s++ = '\0';

        void* start = (void*)strtoul(line, NULL, 16);
        void* end = (void*)strtoul(s, NULL, 16);

        // A mapping covers [start, end): the end address is exclusive, so an
        // address equal to `end` belongs to whatever mapping comes next.
        if ((void*)&dummy >= start && (void*)&dummy < end) {
#if STACK_GROWS_DOWN
            result = end;
#else
            result = start;
#endif
            break;
        }
    }

    // Single exit path so the line buffer and the file handle are always released.
    free(line);
    fclose(input);
    return result; /* NULL if not found =^P */
}
intptr_t call_frame_base;
void registerMainThread() {
LockedRegion _lock(&threading_lock);
......@@ -184,7 +257,9 @@ void registerMainThread() {
// call_frame_base = (intptr_t)::start_thread;
call_frame_base = (intptr_t)_thread_start;
current_threads.insert(gettid());
current_threads[gettid()] = ThreadInfo{
.stack_bottom = find_stack(), .pthread_id = pthread_self(),
};
struct sigaction act;
act.sa_flags = SA_SIGINFO;
......
......@@ -37,7 +37,10 @@ struct ThreadState {
pid_t tid; // useful mostly for debugging
ucontext_t ucontext;
ThreadState(pid_t tid, ucontext_t* ucontext) : tid(tid) {
void* stack_start, *stack_end;
ThreadState(pid_t tid, ucontext_t* ucontext, void* stack_start, void* stack_end)
: tid(tid), stack_start(stack_start), stack_end(stack_end) {
memcpy(&this->ucontext, ucontext, sizeof(ucontext_t));
this->ucontext.uc_mcontext.fpregs = &this->ucontext.__fpregs_mem;
}
......@@ -47,6 +50,10 @@ struct ThreadState {
// as a corollary, this function is very much not thread safe.
std::vector<ThreadState> getAllThreadStates();
// Get the stack "bottom" (first pushed data; for stacks that grow down, will
// be the highest address).
void* getStackBottom();
#define THREADING_USE_GIL 1
#define THREADING_USE_GRWL 0
#define THREADING_SAFE_DATASTRUCTURES THREADING_USE_GRWL
......
......@@ -14,14 +14,11 @@
#include "gc/root_finder.h"
#define UNW_LOCAL_ONLY
#include <libunwind.h>
#include <cstring>
#include <setjmp.h>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <cstring>
#include <setjmp.h>
#include <vector>
#include "codegen/codegen.h"
......@@ -43,129 +40,46 @@ namespace gc {
void collectRoots(void* start, void* end, TraceStack* stack) {
assert(start <= end);
void** cur = (void**)start;
while (cur < end) {
void* p = global_heap.getAllocationFromInteriorPointer(*cur);
if (p)
stack->push(p);
cur++;
}
TraceStackGCVisitor(stack).visitPotentialRange((void**)start, (void**)end);
}
static void _unwindStack(unw_cursor_t* cursor, TraceStack* stack) {
TraceStackGCVisitor visitor(stack);
unw_word_t ip, sp, bp;
#ifndef NVALGRIND
if (RUNNING_ON_VALGRIND) {
memset(&ip, 0, sizeof(ip));
memset(&sp, 0, sizeof(sp));
memset(&bp, 0, sizeof(bp));
}
#endif
int code;
while (true) {
int code = unw_step(cursor);
// Negative codes are errors, zero means that there isn't a new frame.
RELEASE_ASSERT(code >= 0 && "something broke unwinding!", "%d '%s'", code, unw_strerror(code));
RELEASE_ASSERT(code != 0, "didn't get to the top of the stack!");
unw_get_reg(cursor, UNW_REG_IP, &ip);
unw_get_reg(cursor, UNW_REG_SP, &sp);
unw_get_reg(cursor, UNW_TDEP_BP, &bp);
void* cur_sp = (void*)sp;
void* cur_bp = (void*)bp;
// std::string name = g.func_addr_registry.getFuncNameAtAddress((void*)ip, true);
unw_proc_info_t pip;
unw_get_proc_info(cursor, &pip);
// if (VERBOSITY()) printf("ip = 0x%lx (start_ip = 0x%lx), stack = [%p, %p)\n", (long) ip, pip.start_ip, cur_sp,
// cur_bp);
if (pip.start_ip == (uintptr_t)&__libc_start_main) {
break;
}
if (pip.start_ip == (intptr_t)interpretFunction) {
// TODO Do we still need to crawl the interpreter itself?
gatherInterpreterRootsForFrame(&visitor, cur_bp);
}
collectRoots(cur_sp, (char*)cur_bp, stack);
if (pip.start_ip == threading::call_frame_base) {
break;
}
if (cur_bp == NULL) {
// TODO I think this indicates an unwind mistake by libunwind? Not sure.
// But if it returns cur_bp=NULL, this is probably just a thread where libunwind
// didn't reconstruct the call stack exactly the way we thought.
// TODO we probably don't need to do any unwinding here at all; we can just track
// the stack min and max for every thread.
break;
}
}
}
void collectOtherThreadsStacks(TraceStack* stack) {
std::vector<threading::ThreadState> threads = threading::getAllThreadStates();
// unw_addr_space_t as = getOtherAddrSpace();
for (threading::ThreadState& tstate : threads) {
unw_cursor_t cursor;
// int code = unw_init_remote(&cursor, as, &tstate);
int code = unw_init_local(&cursor, (ucontext_t*)&tstate.ucontext);
assert(code == 0);
// printf("Collecting thread %d\n", tstate.tid);
collectRoots(tstate.stack_start, tstate.stack_end, stack);
collectRoots(&tstate.ucontext, (&tstate.ucontext) + 1, stack);
_unwindStack(&cursor, stack);
}
}
static void collectLocalStack(TraceStack* stack) {
unw_cursor_t cursor;
unw_context_t uc;
// force callee-save registers onto the stack:
// Actually, I feel like this is pretty brittle:
// collectLocalStack itself is allowed to save the callee-save registers
// on its own stack.
jmp_buf registers __attribute__((aligned(sizeof(void*))));
#ifndef NVALGRIND
if (RUNNING_ON_VALGRIND) {
memset(&registers, 0, sizeof(registers));
memset(&cursor, 0, sizeof(cursor));
memset(&uc, 0, sizeof(uc));
}
#endif
setjmp(registers);
assert(sizeof(registers) % 8 == 0);
// void* stack_bottom = __builtin_frame_address(0);
collectRoots(&registers, &registers + 1, stack);
unw_getcontext(&uc);
unw_init_local(&cursor, &uc);
collectRoots(&registers, (&registers) + 1, stack);
_unwindStack(&cursor, stack);
void* stack_bottom = threading::getStackBottom();
#if STACK_GROWS_DOWN
collectRoots(&registers, stack_bottom, stack);
#else
collectRoots(stack_bottom, &registers + 1, stack);
#endif
}
// Collects GC roots into `stack` from three sources, in this order: the
// calling thread's own stack, the stacks of the other threads, and the
// interpreter's registered symbol maps.
void collectStackRoots(TraceStack* stack) {
collectLocalStack(stack);
collectOtherThreadsStacks(stack);
// Interpreter roots are gathered in a single pass through a visitor that
// wraps `stack`, rather than per interpreter frame.
TraceStackGCVisitor visitor(stack);
gatherInterpreterRoots(&visitor);
}
}
}
......@@ -24,6 +24,12 @@ namespace pyston {
// Test environment hooks for the threading subsystem: SetUp registers the
// main thread and acquires the GL for reading, and TearDown releases it.
class PystonTestEnvironment : public testing::Environment {
// Registers the calling thread as the main thread, then takes the GL read hold.
void SetUp() override {
threading::registerMainThread();
threading::acquireGLRead();
}
// Releases the GL read hold taken in SetUp.
void TearDown() override {
threading::releaseGLRead();
}
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment