Commit 10067380 authored by Kevin Modzelewski's avatar Kevin Modzelewski

Merge branch 'ic_rewriting'

This might introduce a perf regression (ICs get slightly slower) but cleans
things up a lot and fixes a few bugs.
parents 0ba1cac6 ee247d3e
......@@ -457,6 +457,7 @@ struct _typeobject {
char _dep_getattrs[56]; // FIXME: this is hardcoding the size of this particular implementation of std::unordered_map
char _ics[32];
void* _gcvisit_func;
void* _dtor;
int _attrs_offset;
bool _flags[2];
};
......
......@@ -616,12 +616,52 @@ void Assembler::sub(Immediate imm, Register reg) {
emitArith(imm, reg, OPCODE_SUB);
}
void Assembler::inc(Register reg) {
UNIMPLEMENTED();
// Emit a 32-bit memory increment: incl [base + offset].
// Encoding is (REX.B) FF /0, i.e. opcode 0xff with ModRM reg field 0.
// Only disp8 displacements are supported.
void Assembler::incl(Indirect mem) {
    int base_idx = mem.base.regnum;
    int rex = 0;
    if (base_idx >= 8) {
        // r8-r15 need the REX.B prefix; the ModRM r/m field only holds 3 bits.
        rex |= REX_B;
        base_idx -= 8;
    }
    assert(base_idx >= 0 && base_idx < 8);

    // r/m == 0b100 selects an SIB byte, which this routine does not emit;
    // an rsp/r12 base would therefore produce a corrupt instruction.
    assert(base_idx != 0b100 && "rsp/r12 base requires an SIB byte (unsupported)");

    if (rex)
        emitRex(rex);
    emitByte(0xff); // opcode: FF /0 == INC r/m32

    assert(-0x80 <= mem.offset && mem.offset < 0x80); // only disp8 supported
    if (mem.offset == 0 && base_idx != 0b101) {
        // mod=00: no displacement byte.
        emitModRM(0b00, 0, base_idx);
    } else {
        // mod=01: 8-bit displacement. rbp/r13 (r/m == 0b101) must take this
        // path even for offset 0, since mod=00/r/m=101 means RIP-relative.
        emitModRM(0b01, 0, base_idx);
        emitByte(mem.offset);
    }
}
void Assembler::inc(Indirect mem) {
UNIMPLEMENTED();
// Emit a 32-bit memory decrement: decl [base + offset].
// Encoding is (REX.B) FF /1, i.e. opcode 0xff with ModRM reg field 1.
// Only disp8 displacements are supported.
void Assembler::decl(Indirect mem) {
    int base_idx = mem.base.regnum;
    int rex = 0;
    if (base_idx >= 8) {
        // r8-r15 need the REX.B prefix; the ModRM r/m field only holds 3 bits.
        rex |= REX_B;
        base_idx -= 8;
    }
    assert(base_idx >= 0 && base_idx < 8);

    // r/m == 0b100 selects an SIB byte, which this routine does not emit;
    // an rsp/r12 base would therefore produce a corrupt instruction.
    assert(base_idx != 0b100 && "rsp/r12 base requires an SIB byte (unsupported)");

    if (rex)
        emitRex(rex);
    emitByte(0xff); // opcode: FF /1 == DEC r/m32

    assert(-0x80 <= mem.offset && mem.offset < 0x80); // only disp8 supported
    if (mem.offset == 0 && base_idx != 0b101) {
        // mod=00: no displacement byte.
        emitModRM(0b00, 1, base_idx);
    } else {
        // mod=01: 8-bit displacement. rbp/r13 (r/m == 0b101) must take this
        // path even for offset 0, since mod=00/r/m=101 means RIP-relative.
        emitModRM(0b01, 1, base_idx);
        emitByte(mem.offset);
    }
}
......
......@@ -125,8 +125,8 @@ public:
void add(Immediate imm, Register reg);
void sub(Immediate imm, Register reg);
void inc(Register reg);
void inc(Indirect mem);
void incl(Indirect mem);
void decl(Indirect mem);
void callq(Register reg);
void retq();
......@@ -158,6 +158,7 @@ public:
void fillWithNopsExcept(int bytes);
void emitAnnotation(int num);
int bytesWritten() { return addr - start_addr; }
uint8_t* curInstPointer() { return addr; }
bool isExactlyFull() { return addr == end_addr; }
};
......
......@@ -71,7 +71,7 @@ void ICSlotRewrite::abort() {
ic->failed = true;
}
void ICSlotRewrite::commit(uint64_t decision_path, CommitHook* hook) {
void ICSlotRewrite::commit(CommitHook* hook) {
bool still_valid = true;
for (int i = 0; i < dependencies.size(); i++) {
int orig_version = dependencies[i].second;
......@@ -87,7 +87,7 @@ void ICSlotRewrite::commit(uint64_t decision_path, CommitHook* hook) {
return;
}
ICSlotInfo* ic_entry = ic->pickEntryForRewrite(decision_path, debug_name);
ICSlotInfo* ic_entry = ic->pickEntryForRewrite(debug_name);
if (ic_entry == NULL)
return;
......@@ -99,9 +99,10 @@ void ICSlotRewrite::commit(uint64_t decision_path, CommitHook* hook) {
uint8_t* slot_start = (uint8_t*)ic->start_addr + ic_entry->idx * ic->getSlotSize();
uint8_t* continue_point = (uint8_t*)ic->continue_addr;
hook->finishAssembly(continue_point - slot_start);
hook->finishAssembly(ic_entry, continue_point - slot_start);
assert(assembler->isExactlyFull());
assert(!assembler->hasFailed());
// if (VERBOSITY()) printf("Committing to %p-%p\n", start, start + ic->slot_size);
memcpy(slot_start, buf, ic->getSlotSize());
......@@ -145,40 +146,25 @@ ICSlotRewrite* ICInfo::startRewrite(const char* debug_name) {
return new ICSlotRewrite(this, debug_name);
}
ICSlotInfo* ICInfo::pickEntryForRewrite(uint64_t decision_path, const char* debug_name) {
for (int i = 0; i < getNumSlots(); i++) {
SlotInfo& sinfo = slots[i];
if (!sinfo.is_patched) {
if (VERBOSITY()) {
printf("committing %s icentry to unused slot %d at %p\n", debug_name, i, start_addr);
}
sinfo.is_patched = true;
sinfo.decision_path = decision_path;
return &sinfo.entry;
}
}
ICSlotInfo* ICInfo::pickEntryForRewrite(const char* debug_name) {
int num_slots = getNumSlots();
for (int _i = 0; _i < num_slots; _i++) {
int i = (_i + next_slot_to_try) % num_slots;
SlotInfo& sinfo = slots[i];
if (sinfo.is_patched && sinfo.decision_path != decision_path) {
ICSlotInfo& sinfo = slots[i];
assert(sinfo.num_inside >= 0);
if (sinfo.num_inside)
continue;
}
if (VERBOSITY()) {
printf("committing %s icentry to in-use slot %d at %p\n", debug_name, i, start_addr);
}
next_slot_to_try++;
next_slot_to_try = i + 1;
sinfo.is_patched = true;
sinfo.decision_path = decision_path;
return &sinfo.entry;
return &sinfo;
}
if (VERBOSITY())
printf("not committing %s icentry since it is not compatible (%lx)\n", debug_name, decision_path);
printf("not committing %s icentry since there are no available slots\n", debug_name);
return NULL;
}
......@@ -192,7 +178,7 @@ ICInfo::ICInfo(void* start_addr, void* slowpath_rtn_addr, void* continue_addr, S
type_recorder(type_recorder), failed(false), start_addr(start_addr), slowpath_rtn_addr(slowpath_rtn_addr),
continue_addr(continue_addr) {
for (int i = 0; i < num_slots; i++) {
slots.push_back(SlotInfo(this, i));
slots.push_back(ICSlotInfo(this, i));
}
}
......@@ -270,6 +256,8 @@ void ICInfo::clear(ICSlotInfo* icentry) {
std::unique_ptr<Assembler> writer(new Assembler(start, getSlotSize()));
writer->nop();
writer->jmp(JumpDestination::fromStart(getSlotSize()));
assert(writer->bytesWritten() <= IC_INVALDITION_HEADER_SIZE);
// std::unique_ptr<MCWriter> writer(createMCWriter(start, getSlotSize(), 0));
// writer->emitNop();
// writer->emitGuardFalse();
......
......@@ -30,12 +30,15 @@ class TypeRecorder;
class ICInfo;
class ICInvalidator;
#define IC_INVALDITION_HEADER_SIZE 6
struct ICSlotInfo {
public:
ICSlotInfo(ICInfo* ic, int idx) : ic(ic), idx(idx) {}
ICSlotInfo(ICInfo* ic, int idx) : ic(ic), idx(idx), num_inside(0) {}
ICInfo* ic;
int idx;
int idx; // the index inside the ic
int num_inside; // the number of stack frames that are currently inside this slot
void clear();
};
......@@ -45,7 +48,7 @@ public:
class CommitHook {
public:
virtual ~CommitHook() {}
virtual void finishAssembly(int fastpath_offset) = 0;
virtual void finishAssembly(ICSlotInfo* picked_slot, int fastpath_offset) = 0;
};
private:
......@@ -73,7 +76,7 @@ public:
assembler::GenericRegister returnRegister();
void addDependenceOn(ICInvalidator&);
void commit(uint64_t decision_path, CommitHook* hook);
void commit(CommitHook* hook);
void abort();
friend class ICInfo;
......@@ -81,14 +84,7 @@ public:
class ICInfo {
private:
struct SlotInfo {
bool is_patched;
uint64_t decision_path;
ICSlotInfo entry;
SlotInfo(ICInfo* ic, int idx) : is_patched(false), decision_path(0), entry(ic, idx) {}
};
std::vector<SlotInfo> slots;
std::vector<ICSlotInfo> slots;
// For now, just use a round-robin eviction policy.
// This is probably a bunch worse than LRU, but it's also
// probably a bunch better than the "always evict slot #0" policy
......@@ -105,7 +101,7 @@ private:
bool failed;
// for ICSlotRewrite:
ICSlotInfo* pickEntryForRewrite(uint64_t decision_path, const char* debug_name);
ICSlotInfo* pickEntryForRewrite(const char* debug_name);
public:
ICInfo(void* start_addr, void* slowpath_rtn_addr, void* continue_addr, StackInfo stack_info, int num_slots,
......
......@@ -558,6 +558,39 @@ void Rewriter::_call(RewriterVar* result, bool can_call_into_python, void* func_
// assert(!can_call_into_python);
assert(done_guarding);
// we've been ignoring these annotations for long enough that I'm not sure they can be trusted,
// so just be pessimistic:
can_call_into_python = true;
if (can_call_into_python) {
// We need some fixed amount of space at the beginning of the IC that we can use to invalidate
// it by writing a jmp.
// FIXME this check is conservative, since actually we just have to verify that the return
// address is at least IC_INVALDITION_HEADER_SIZE bytes past the beginning, but we're
// checking based on the beginning of the call. I think the load+call might actually
// always be larger than the invalidation jmp.
assert(assembler->bytesWritten() >= IC_INVALDITION_HEADER_SIZE);
}
if (can_call_into_python) {
if (!marked_inside_ic) {
// assembler->trap();
// TODO this is super hacky: we don't know the address that we want to inc/dec, since
// it depends on the slot that we end up picking, so just write out an arbitrary
// constant and we'll rewrite it later
// TODO if we can guarantee that the mark_addr will fit in 32 bits,
// we can use a more compact instruction encoding
mark_addr_addrs.push_back((void**)(assembler->curInstPointer() + 2));
assembler::Register reg = allocReg(Location::any());
assembler->mov(assembler::Immediate(0x1234567890abcdefL), reg);
assembler->incl(assembler::Indirect(reg, 0));
assertConsistent();
marked_inside_ic = true;
}
}
// RewriterVarUsage scratch = createNewVar(Location::any());
assembler::Register r = allocReg(assembler::R11);
......@@ -731,16 +764,18 @@ void Rewriter::commit() {
assert(!finished);
initPhaseEmitting();
if (assembler->hasFailed()) {
static StatCounter rewriter_assemblyfail("rewriter_assemblyfail");
rewriter_assemblyfail.log();
static StatCounter rewriter_assemblyfail("rewriter_assemblyfail");
auto on_assemblyfail = [&]() {
rewriter_assemblyfail.log();
this->abort();
};
if (assembler->hasFailed()) {
on_assemblyfail();
return;
}
finished = true;
static StatCounter rewriter_commits("rewriter_commits");
rewriter_commits.log();
......@@ -760,7 +795,8 @@ void Rewriter::commit() {
// at each guard in the var's `uses` list.
// First: check if we're done guarding before we even begin emitting.
if (last_guard_action == -1) {
auto on_done_guarding = [&]() {
done_guarding = true;
for (RewriterVar* arg : args) {
if (arg->next_use == arg->uses.size()) {
......@@ -770,9 +806,12 @@ void Rewriter::commit() {
arg->locations.clear();
}
}
}
assertConsistent();
};
assertConsistent();
if (last_guard_action == -1) {
on_done_guarding();
}
// Now, start emitting assembly; check if we're done guarding after each.
for (int i = 0; i < actions.size(); i++) {
......@@ -780,17 +819,20 @@ void Rewriter::commit() {
assertConsistent();
if (i == last_guard_action) {
done_guarding = true;
for (RewriterVar* arg : args) {
if (arg->next_use == arg->uses.size()) {
for (Location loc : arg->locations) {
vars_by_location.erase(loc);
}
arg->locations.clear();
}
}
on_done_guarding();
}
assertConsistent();
}
if (marked_inside_ic) {
// TODO this is super hacky: we don't know the address that we want to inc/dec, since
// it depends on the slot that we end up picking, so just write out an arbitrary
// constant and we'll rewrite it later
// TODO if we can guarantee that the mark_addr will fit in 32 bits,
// we can use a more compact instruction encoding
mark_addr_addrs.push_back((void**)(assembler->curInstPointer() + 2));
assembler::Register reg = allocReg(Location::any(), getReturnDestination());
assembler->mov(assembler::Immediate(0x1234567890abcdefL), reg);
assembler->decl(assembler::Indirect(reg, 0));
}
// Make sure that we have been calling bumpUse correctly.
......@@ -896,10 +938,30 @@ void Rewriter::commit() {
}
#endif
rewrite->commit(decision_path, this);
if (assembler->hasFailed()) {
on_assemblyfail();
return;
}
finished = true;
// TODO: have to check that we have enough room to write the final jmp
rewrite->commit(this);
assert(!assembler->hasFailed());
}
void Rewriter::finishAssembly(int continue_offset) {
void Rewriter::finishAssembly(ICSlotInfo* picked_slot, int continue_offset) {
if (marked_inside_ic) {
void* mark_addr = &picked_slot->num_inside;
// Go back and rewrite the faked constants to point to the correct address:
for (void** mark_addr_addr : mark_addr_addrs) {
assert(*mark_addr_addr == (void*)0x1234567890abcdefL);
*mark_addr_addr = mark_addr;
}
}
assembler->jmp(assembler::JumpDestination::fromStart(continue_offset));
assembler->fillWithNops();
......@@ -914,12 +976,6 @@ void Rewriter::commitReturning(RewriterVar* var) {
commit();
}
void Rewriter::addDecision(int way) {
assert(ndecisions < 60);
ndecisions++;
decision_path = (decision_path << 1) | way;
}
void Rewriter::addDependenceOn(ICInvalidator& invalidator) {
rewrite->addDependenceOn(invalidator);
}
......@@ -1299,7 +1355,7 @@ TypeRecorder* Rewriter::getTypeRecorder() {
Rewriter::Rewriter(ICSlotRewrite* rewrite, int num_args, const std::vector<int>& live_outs)
: rewrite(rewrite), assembler(rewrite->getAssembler()), return_location(rewrite->returnRegister()),
added_changing_action(false), last_guard_action(-1), done_guarding(false), ndecisions(0), decision_path(1) {
added_changing_action(false), marked_inside_ic(false), last_guard_action(-1), done_guarding(false) {
initPhaseCollecting();
#ifndef NDEBUG
......
......@@ -346,6 +346,9 @@ private:
actions.emplace_back(action);
}
bool added_changing_action;
bool marked_inside_ic;
std::vector<void**> mark_addr_addrs;
int last_guard_action;
bool done_guarding;
......@@ -374,10 +377,7 @@ private:
// Do the bookkeeping to say that var is no longer in location l
void removeLocationFromVar(RewriterVar* var, Location l);
void finishAssembly(int continue_offset) override;
int ndecisions;
uint64_t decision_path;
void finishAssembly(ICSlotInfo* picked_slot, int continue_offset) override;
void _trap();
void _loadConst(RewriterVar* result, int64_t val, Location loc);
......@@ -461,8 +461,6 @@ public:
static Rewriter* createRewriter(void* rtn_addr, int num_args, const char* debug_name);
void addDecision(int way);
friend class RewriterVar;
};
......
......@@ -300,7 +300,10 @@ static void _doFree(GCAllocation* al) {
if (al->kind_id == GCKind::PYTHON) {
Box* b = (Box*)al->user_data;
ASSERT(b->cls->tp_dealloc == NULL, "%s", getTypeName(b));
if (b->cls->simple_destructor)
b->cls->simple_destructor(b);
}
}
......
......@@ -298,8 +298,8 @@ void BoxedClass::freeze() {
BoxedClass::BoxedClass(BoxedClass* base, gcvisit_func gc_visit, int attrs_offset, int instance_size,
bool is_user_defined)
: BoxVar(0), gc_visit(gc_visit), attrs_offset(attrs_offset), is_constant(false), is_user_defined(is_user_defined),
is_pyston_class(true) {
: BoxVar(0), gc_visit(gc_visit), simple_destructor(NULL), attrs_offset(attrs_offset), is_constant(false),
is_user_defined(is_user_defined), is_pyston_class(true) {
// Zero out the CPython tp_* slots:
memset(&tp_name, 0, (char*)(&tp_version_tag + 1) - (char*)(&tp_name));
......@@ -2400,6 +2400,7 @@ Box* callFunc(BoxedFunctionBase* func, CallRewriteArgs* rewrite_args, ArgPassSpe
} else {
rewrite_args->obj->addGuard((intptr_t)func);
}
rewrite_args->rewriter->addDependenceOn(func->dependent_ics);
}
}
......@@ -2731,9 +2732,6 @@ Box* runtimeCallInternal(Box* obj, CallRewriteArgs* rewrite_args, ArgPassSpec ar
}
rewrite_args->args_guarded = true;
}
rewrite_args->rewriter->addDecision(obj->cls == function_cls || obj->cls == builtin_function_or_method_cls ? 1
: 0);
}
if (obj->cls == function_cls || obj->cls == builtin_function_or_method_cls) {
......
......@@ -304,7 +304,6 @@ extern "C" BoxedFunctionBase::BoxedFunctionBase(CLFunction* f, std::initializer_
assert(f->num_defaults == ndefaults);
}
// This probably belongs in dict.cpp?
extern "C" void functionGCHandler(GCVisitor* v, Box* b) {
boxGCHandler(v, b);
......@@ -324,6 +323,14 @@ extern "C" void functionGCHandler(GCVisitor* v, Box* b) {
}
}
// "Simple destructor" for function objects (installed as cls->simple_destructor):
// called by the GC when a dead function/builtin-function object is freed.
// NOTE(review): per the simple_destructor contract, this must not touch other
// gc-managed memory, since neighboring objects may already be dead.
static void functionDtor(Box* b) {
assert(isSubclass(b->cls, function_cls) || isSubclass(b->cls, builtin_function_or_method_cls));
BoxedFunctionBase* self = static_cast<BoxedFunctionBase*>(b);
// Invalidate all ICs that guarded on this exact function object, so a future
// allocation reusing this address cannot incorrectly pass those guards.
self->dependent_ics.invalidateAll();
// The GC does not run C++ destructors, so destroy the ICInvalidator explicitly
// (must come after invalidateAll, which still uses its state).
self->dependent_ics.~ICInvalidator();
}
BoxedModule::BoxedModule(const std::string& name, const std::string& fn) : fn(fn) {
this->giveAttr("__name__", boxString(name));
this->giveAttr("__file__", boxString(fn));
......@@ -1144,6 +1151,8 @@ void setupRuntime() {
builtin_function_or_method_cls
= new BoxedHeapClass(object_cls, &functionGCHandler, offsetof(BoxedBuiltinFunctionOrMethod, attrs),
sizeof(BoxedBuiltinFunctionOrMethod), false, "builtin_function_or_method");
function_cls->simple_destructor = builtin_function_or_method_cls->simple_destructor = functionDtor;
instancemethod_cls = new BoxedHeapClass(object_cls, &instancemethodGCHandler, 0, sizeof(BoxedInstanceMethod), false,
"instancemethod");
list_cls = new BoxedHeapClass(object_cls, &listGCHandler, 0, sizeof(BoxedList), false, "list");
......
......@@ -196,6 +196,13 @@ public:
gcvisit_func gc_visit;
// A "simple" destructor -- one that is allowed to be called at any point after the object is dead.
// In particular, this means that it can't touch any Python objects or other gc-managed memory,
// since it will be in an undefined state.
// (Context: in Python destructors are supposed to be called in topological order, due to reference counting.
// We don't support that yet, but still want some simple ability to run code when an object gets freed.)
void (*simple_destructor)(Box*);
// Offset of the HCAttrs object or 0 if there are no hcattrs.
// Analogous to tp_dictoffset
const int attrs_offset;
......@@ -447,6 +454,8 @@ public:
int ndefaults;
GCdArray* defaults;
ICInvalidator dependent_ics;
// Accessed via member descriptor
Box* modname; // __module__
......
# Regression test: this triggers a bug in the way we guard for boxedfunctions and their closures.
# In particular, we guard on the specific value of a BoxedFunction to say that it is the same, but
# it's possible for the function to get destructed and then a new (and different) boxedfunction to
# get allocated in the same address, which would pass the guard.
import gc
def f():
    # Each call allocates a fresh closure cell (x) and a fresh inner function
    # object, so function objects get created and destroyed at a high rate.
    x = []
    def inner():
        x.append(1)
    inner()

for i in xrange(30000):
    # Progress marker so a hang/crash point is visible in the output.
    if i % 1000 == 0:
        print i
    f()
    # Collect periodically so dead function objects actually get freed and their
    # addresses can be reused -- the reuse is what used to defeat the IC guards.
    if i % 50 == 0:
        gc.collect()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment