Commit 96027cdb authored by Marius Wachtler

bjit: omit frame pointer + use R12 for ASTInterpreter*

This reduces the number of stack accesses: previously, whenever we accessed a field of the interpreter,
we first had to load its pointer from the stack into a register - now we have a dedicated register for it.
This is currently only a very small perf change, but when #736 lands this becomes more important.
parent 07f0a1aa
...@@ -31,10 +31,27 @@ namespace pyston { ...@@ -31,10 +31,27 @@ namespace pyston {
static llvm::DenseSet<CFGBlock*> blocks_aborted; static llvm::DenseSet<CFGBlock*> blocks_aborted;
static llvm::DenseMap<CFGBlock*, std::vector<void*>> block_patch_locations; static llvm::DenseMap<CFGBlock*, std::vector<void*>> block_patch_locations;
// The EH table is copied from the one clang++ generated for:
//
// long foo(char* c);
// void bjit() {
// asm volatile ("" ::: "r12");
// char scratch[256+16];
// foo(scratch);
// }
//
// It omits the frame pointer but saves R12
const unsigned char eh_info[]
= { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x7a, 0x52, 0x00, 0x01, 0x78, 0x10,
0x01, 0x1b, 0x0c, 0x07, 0x08, 0x90, 0x01, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x1c, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x42, 0x0e, 0x10, 0x47,
0x0e, 0xa0, 0x02, 0x8c, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
static_assert(JitCodeBlock::num_stack_args == 2, "have to update EH table!");
static_assert(JitCodeBlock::scratch_size == 256, "have to update EH table!");
JitCodeBlock::JitCodeBlock(llvm::StringRef name) JitCodeBlock::JitCodeBlock(llvm::StringRef name)
: frame_manager(false /* don't omit frame pointers */), : code(new uint8_t[code_size]),
code(new uint8_t[code_size]), eh_frame(new uint8_t[sizeof(eh_info)]),
entry_offset(0), entry_offset(0),
a(code.get(), code_size), a(code.get(), code_size),
is_currently_writing(false), is_currently_writing(false),
...@@ -45,24 +62,28 @@ JitCodeBlock::JitCodeBlock(llvm::StringRef name) ...@@ -45,24 +62,28 @@ JitCodeBlock::JitCodeBlock(llvm::StringRef name)
num_jit_total_bytes.log(code_size); num_jit_total_bytes.log(code_size);
// emit prolog // emit prolog
a.push(assembler::RBP); a.push(assembler::R12);
a.mov(assembler::RSP, assembler::RBP); static_assert(sp_adjustment % 16 == 0, "stack isn't aligned");
a.sub(assembler::Immediate(sp_adjustment), assembler::RSP);
static_assert(scratch_size % 16 == 0, "stack aligment code depends on this"); a.mov(assembler::RDI, assembler::R12); // interpreter pointer
// subtract scratch size + 8bytes to align stack after the push.
a.sub(assembler::Immediate(scratch_size + 8), assembler::RSP);
a.push(assembler::RDI); // push interpreter pointer
// subtract space in order to be able to pass additional args on the stack without having to adjusting the SP when
// calling functions with more than 6 args.
a.sub(assembler::Immediate(num_stack_args * sizeof(void*)), assembler::RSP);
a.jmp(assembler::Indirect(assembler::RSI, offsetof(CFGBlock, code))); // jump to block a.jmp(assembler::Indirect(assembler::RSI, offsetof(CFGBlock, code))); // jump to block
entry_offset = a.bytesWritten(); entry_offset = a.bytesWritten();
// generate eh frame... // generate the eh frame...
frame_manager.writeAndRegister(code.get(), code_size); const int size = sizeof(eh_info);
void* eh_frame_addr = eh_frame.get();
memcpy(eh_frame_addr, eh_info, size);
int32_t* offset_ptr = (int32_t*)((uint8_t*)eh_frame_addr + 0x20);
int32_t* size_ptr = (int32_t*)((uint8_t*)eh_frame_addr + 0x24);
int64_t offset = (int8_t*)code.get() - (int8_t*)offset_ptr;
assert(offset >= INT_MIN && offset <= INT_MAX);
*offset_ptr = offset;
*size_ptr = code_size;
registerDynamicEhFrame((uint64_t)code.get(), code_size, (uint64_t)eh_frame_addr, size - 4);
registerEHFrames((uint8_t*)eh_frame_addr, (uint64_t)eh_frame_addr, size);
g.func_addr_registry.registerFunction(("bjit_" + name).str(), code.get(), code_size, NULL); g.func_addr_registry.registerFunction(("bjit_" + name).str(), code.get(), code_size, NULL);
} }
...@@ -73,7 +94,7 @@ std::unique_ptr<JitFragmentWriter> JitCodeBlock::newFragment(CFGBlock* block, in ...@@ -73,7 +94,7 @@ std::unique_ptr<JitFragmentWriter> JitCodeBlock::newFragment(CFGBlock* block, in
is_currently_writing = true; is_currently_writing = true;
int scratch_offset = num_stack_args * 8 + 8 /* ASTInterpreter* */ + 8 /* alignment */; int scratch_offset = num_stack_args * 8;
StackInfo stack_info(scratch_size, scratch_offset); StackInfo stack_info(scratch_size, scratch_offset);
std::unordered_set<int> live_outs; std::unordered_set<int> live_outs;
...@@ -115,7 +136,7 @@ JitFragmentWriter::JitFragmentWriter(CFGBlock* block, std::unique_ptr<ICInfo> ic ...@@ -115,7 +136,7 @@ JitFragmentWriter::JitFragmentWriter(CFGBlock* block, std::unique_ptr<ICInfo> ic
interp(0), interp(0),
ic_info(std::move(ic_info)) { ic_info(std::move(ic_info)) {
interp = createNewVar(); interp = createNewVar();
addLocationToVar(interp, Location(Location::Stack, JitCodeBlock::interpreter_ptr_offset)); addLocationToVar(interp, assembler::R12);
interp->setAttr(ASTInterpreterJitInterface::getCurrentBlockOffset(), imm(block)); interp->setAttr(ASTInterpreterJitInterface::getCurrentBlockOffset(), imm(block));
} }
...@@ -671,7 +692,8 @@ void JitFragmentWriter::_emitJump(CFGBlock* b, RewriterVar* block_next, int& siz ...@@ -671,7 +692,8 @@ void JitFragmentWriter::_emitJump(CFGBlock* b, RewriterVar* block_next, int& siz
} else { } else {
int num_bytes = assembler->bytesWritten(); int num_bytes = assembler->bytesWritten();
block_next->getInReg(assembler::RAX, true); block_next->getInReg(assembler::RAX, true);
assembler->leave(); assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::R12);
assembler->retq(); assembler->retq();
// make sure we have at least 'min_patch_size' of bytes available. // make sure we have at least 'min_patch_size' of bytes available.
...@@ -696,7 +718,8 @@ void JitFragmentWriter::_emitOSRPoint(RewriterVar* result, RewriterVar* node_var ...@@ -696,7 +718,8 @@ void JitFragmentWriter::_emitOSRPoint(RewriterVar* result, RewriterVar* node_var
{ {
assembler::ForwardJump je(*assembler, assembler::COND_EQUAL); assembler::ForwardJump je(*assembler, assembler::COND_EQUAL);
assembler->mov(assembler::Immediate(0ul), assembler::RAX); // TODO: use xor assembler->mov(assembler::Immediate(0ul), assembler::RAX); // TODO: use xor
assembler->leave(); assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::R12);
assembler->retq(); assembler->retq();
} }
...@@ -765,7 +788,8 @@ void JitFragmentWriter::_emitPPCall(RewriterVar* result, void* func_addr, const ...@@ -765,7 +788,8 @@ void JitFragmentWriter::_emitPPCall(RewriterVar* result, void* func_addr, const
void JitFragmentWriter::_emitReturn(RewriterVar* return_val) { void JitFragmentWriter::_emitReturn(RewriterVar* return_val) {
return_val->getInReg(assembler::RDX, true); return_val->getInReg(assembler::RDX, true);
assembler->mov(assembler::Immediate(0ul), assembler::RAX); // TODO: use xor assembler->mov(assembler::Immediate(0ul), assembler::RAX); // TODO: use xor
assembler->leave(); assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::R12);
assembler->retq(); assembler->retq();
return_val->bumpUse(); return_val->bumpUse();
} }
......
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "codegen/ast_interpreter.h" #include "codegen/ast_interpreter.h"
#include "codegen/patchpoints.h" #include "codegen/patchpoints.h"
#include "gc/heap.h" #include "gc/heap.h"
#include "runtime/ics.h"
namespace pyston { namespace pyston {
...@@ -88,11 +87,10 @@ class JitFragmentWriter; ...@@ -88,11 +87,10 @@ class JitFragmentWriter;
// //
// Basic layout of generated code block is: // Basic layout of generated code block is:
// entry_code: // entry_code:
// push %rbp ; setup frame pointer // push %r12 ; save r12
// mov %rsp,%rbp // sub $0x110,%rsp ; setup scratch, 0x110 = scratch_size + 16 = space for two func args passed on the
// sub $0x108,%rsp ; setup scratch, 0x108 = scratch_size + 8 (=stack alignment) // stack
// push %rdi ; save the pointer to ASTInterpreter instance // mov %rdi,%r12 ; copy the pointer to ASTInterpreter instance into r12
// sub $0x16,%rsp ; space for two func args passed on the stack
// jmpq *0x8(%rsi) ; jump to block->code // jmpq *0x8(%rsi) ; jump to block->code
// possible values: first_JitFragment, second_JitFragment,... // possible values: first_JitFragment, second_JitFragment,...
// //
...@@ -103,7 +101,8 @@ class JitFragmentWriter; ...@@ -103,7 +101,8 @@ class JitFragmentWriter;
// cmp %rax,%rcx ; rax == True // cmp %rax,%rcx ; rax == True
// jne end_side_exit // jne end_side_exit
// movabs $0x215bb60,%rax ; rax = CFGBlock* to interpret next (rax is the 1. return reg) // movabs $0x215bb60,%rax ; rax = CFGBlock* to interpret next (rax is the 1. return reg)
// leave // add $0x110,%rsp ; restore stack pointer
// pop %r12 ; restore r12
// ret ; exit to the interpreter which will interpret the specified CFGBLock* // ret ; exit to the interpreter which will interpret the specified CFGBLock*
// end_side_exit: // end_side_exit:
// .... // ....
...@@ -114,7 +113,8 @@ class JitFragmentWriter; ...@@ -114,7 +113,8 @@ class JitFragmentWriter;
// mov $0,%rax ; rax contains the next block to interpret. // mov $0,%rax ; rax contains the next block to interpret.
// in this case 0 which means we are finished // in this case 0 which means we are finished
// movabs $0x1270014108,%rdx ; rdx must contain the Box* value to return // movabs $0x1270014108,%rdx ; rdx must contain the Box* value to return
// leave // add $0x110,%rsp ; restore stack pointer
// pop %r12 ; restore r12
// ret // ret
// //
// nth_JitFragment: // nth_JitFragment:
...@@ -127,12 +127,14 @@ public: ...@@ -127,12 +127,14 @@ public:
static constexpr int scratch_size = 256; static constexpr int scratch_size = 256;
static constexpr int code_size = 32768; static constexpr int code_size = 32768;
static constexpr int num_stack_args = 2; static constexpr int num_stack_args = 2;
static constexpr int interpreter_ptr_offset = num_stack_args * 8;
// scratch size + space for passing additional args on the stack without having to adjust the SP when calling
// functions with more than 6 args.
static constexpr int sp_adjustment = scratch_size + num_stack_args * 8;
private: private:
EHFrameManager frame_manager;
std::unique_ptr<uint8_t[]> code; std::unique_ptr<uint8_t[]> code;
std::unique_ptr<uint8_t[]> eh_frame;
int entry_offset; int entry_offset;
assembler::Assembler a; assembler::Assembler a;
bool is_currently_writing; bool is_currently_writing;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment