Commit f0e39c1b authored by Marius Wachtler

bjit: use a bitset for register tracking in the rewriter and use r12, r15,...

bjit: use a bitset for register tracking in the rewriter and use r12, r15, rbx, rbp in PPs and the bjit
Keeping the available registers in a bitset makes it more memory efficient and
also easier and more performant to calculate a subset of the registers.
I will soon implement the 'otherThan' functionality on top of it, which will fix the current limitation that only a single register can be excluded.
parent a0b70207
......@@ -241,7 +241,7 @@ int ICInfo::calculateSuggestedSize() {
if (!times_rewritten)
return slots[0].size;
int additional_space_per_slot = 30;
int additional_space_per_slot = 50;
// if there are fewer rewrites than slots we can give a very accurate estimate
if (times_rewritten < slots.size()) {
// add up the sizes of all used slots
......@@ -316,7 +316,8 @@ static llvm::DenseMap<void*, ICInfo*> ics_by_return_addr;
ICInfo::ICInfo(void* start_addr, void* slowpath_rtn_addr, void* continue_addr, StackInfo stack_info, int size,
llvm::CallingConv::ID calling_conv, LiveOutSet _live_outs, assembler::GenericRegister return_register,
TypeRecorder* type_recorder, std::vector<Location> ic_global_decref_locations)
TypeRecorder* type_recorder, std::vector<Location> ic_global_decref_locations,
assembler::RegisterSet allocatable_registers)
: next_slot_to_try(0),
stack_info(stack_info),
calling_conv(calling_conv),
......@@ -326,6 +327,7 @@ ICInfo::ICInfo(void* start_addr, void* slowpath_rtn_addr, void* continue_addr, S
retry_in(0),
retry_backoff(1),
times_rewritten(0),
allocatable_registers(allocatable_registers),
ic_global_decref_locations(std::move(ic_global_decref_locations)),
start_addr(start_addr),
slowpath_rtn_addr(slowpath_rtn_addr),
......@@ -387,7 +389,7 @@ std::unique_ptr<ICInfo> registerCompiledPatchpoint(uint8_t* start_addr, uint8_t*
ICInfo* icinfo
= new ICInfo(start_addr, slowpath_rtn_addr, continue_addr, stack_info, ic->size, ic->getCallingConvention(),
std::move(live_outs), return_register, ic->type_recorder, decref_info);
std::move(live_outs), return_register, ic->type_recorder, decref_info, ic->allocatable_regs);
assert(!ics_by_return_addr.count(slowpath_rtn_addr));
ics_by_return_addr[slowpath_rtn_addr] = icinfo;
......
......@@ -94,6 +94,7 @@ private:
TypeRecorder* const type_recorder;
int retry_in, retry_backoff;
int times_rewritten;
assembler::RegisterSet allocatable_registers;
DecrefInfo slowpath_decref_info;
// This is a vector of locations which always need to get decrefed inside this IC.
......@@ -107,7 +108,8 @@ private:
public:
ICInfo(void* start_addr, void* slowpath_rtn_addr, void* continue_addr, StackInfo stack_info, int size,
llvm::CallingConv::ID calling_conv, LiveOutSet live_outs, assembler::GenericRegister return_register,
TypeRecorder* type_recorder, std::vector<Location> ic_global_decref_locations);
TypeRecorder* type_recorder, std::vector<Location> ic_global_decref_locations,
assembler::RegisterSet allocatable_registers = assembler::RegisterSet::stdAllocatable());
~ICInfo();
void* const start_addr, *const slowpath_rtn_addr, *const continue_addr;
......@@ -133,6 +135,8 @@ public:
int percentBackedoff() const { return retry_backoff; }
int timesRewritten() const { return times_rewritten; }
assembler::RegisterSet getAllocatableRegs() const { return allocatable_registers; }
friend class ICSlotRewrite;
static ICInfo* getICInfoForNode(AST* node);
......
......@@ -646,10 +646,9 @@ void Rewriter::_cmp(RewriterVar* result, RewriterVar* v1, AST_TYPE::AST_TYPE cmp
v1->bumpUseEarlyIfPossible();
v2->bumpUseEarlyIfPossible();
// sete and setne has special register requirements (can't use r8-r15)
const assembler::Register valid_registers[] = {
assembler::RAX, assembler::RCX, assembler::RDX, assembler::RSI, assembler::RDI,
};
// sete and setne have special register requirements
auto set_inst_valid_registers = assembler::RAX | assembler::RBX | assembler::RCX | assembler::RDX;
auto valid_registers = set_inst_valid_registers & allocatable_regs;
assembler::Register newvar_reg = allocReg(dest, Location::any(), valid_registers);
result->initializeInReg(newvar_reg);
assembler->cmp(v1_reg, v2_reg);
......@@ -849,6 +848,9 @@ assembler::Register RewriterVar::getInReg(Location dest, bool allow_constant_in_
Location l(*locations.begin());
assembler::Register reg = rewriter->allocReg(dest, otherThan);
if (rewriter->failed)
return reg;
assert(rewriter->vars_by_location.count(reg) == 0);
if (l.type == Location::Scratch || l.type == Location::Stack) {
......@@ -1979,9 +1981,10 @@ void Rewriter::spillRegister(assembler::Register reg, Location preserve) {
}
// First, try to spill into a callee-save register:
for (assembler::Register new_reg : allocatable_regs) {
if (!new_reg.isCalleeSave())
continue;
auto callee_save_allocatable_regs = allocatable_regs & assembler::RegisterSet::getCalleeSave();
for (assembler::Register new_reg : callee_save_allocatable_regs) {
assert(new_reg.isCalleeSave());
if (vars_by_location.count(new_reg))
continue;
if (Location(new_reg) == preserve)
......@@ -2023,8 +2026,7 @@ assembler::Register Rewriter::allocReg(Location dest, Location otherThan) {
return allocReg(dest, otherThan, allocatable_regs);
}
assembler::Register Rewriter::allocReg(Location dest, Location otherThan,
llvm::ArrayRef<assembler::Register> valid_registers) {
assembler::Register Rewriter::allocReg(Location dest, Location otherThan, assembler::RegisterSet valid_registers) {
assertPhaseEmitting();
if (dest.type == Location::AnyReg) {
......@@ -2063,7 +2065,7 @@ assembler::Register Rewriter::allocReg(Location dest, Location otherThan,
assert(failed || vars_by_location.count(best_reg) == 0);
return best_reg;
} else if (dest.type == Location::Register) {
assert(std::find(valid_registers.begin(), valid_registers.end(), dest.asRegister()) != valid_registers.end());
assert(valid_registers.isInside(dest.asRegister()));
assembler::Register reg(dest.regnum);
if (vars_by_location.count(reg)) {
......@@ -2220,7 +2222,7 @@ Rewriter::Rewriter(std::unique_ptr<ICSlotRewrite> rewrite, int num_args, const L
marked_inside_ic(false),
done_guarding(false),
last_guard_action(-1),
allocatable_regs(std_allocatable_regs) {
allocatable_regs(this->rewrite->getICInfo()->getAllocatableRegs()) {
initPhaseCollecting();
finished = false;
......
......@@ -516,8 +516,7 @@ protected:
// Allocates a register. dest must be of type Register or AnyReg
// If otherThan is a register, guaranteed to not use that register.
assembler::Register allocReg(Location dest, Location otherThan = Location::any());
assembler::Register allocReg(Location dest, Location otherThan,
llvm::ArrayRef<assembler::Register> valid_registers);
assembler::Register allocReg(Location dest, Location otherThan, assembler::RegisterSet valid_registers);
assembler::XMMRegister allocXMMReg(Location dest, Location otherThan = Location::any());
// Allocates an 8-byte region in the scratch space
Location allocScratch();
......@@ -609,7 +608,7 @@ protected:
#endif
}
llvm::ArrayRef<assembler::Register> allocatable_regs;
assembler::RegisterSet allocatable_regs;
public:
// This should be called exactly once for each argument
......
......@@ -35,6 +35,7 @@ namespace assembler {
class Assembler;
struct RegisterSet;
struct Register {
int regnum;
......@@ -52,27 +53,80 @@ struct Register {
static Register fromDwarf(int dwarf_regnum);
static constexpr int numRegs() { return 16; }
constexpr RegisterSet operator|(Register b) const;
};
const Register RAX(0);
const Register RCX(1);
const Register RDX(2);
const Register RBX(3);
const Register RSP(4);
const Register RBP(5);
const Register RSI(6);
const Register RDI(7);
const Register R8(8);
const Register R9(9);
const Register R10(10);
const Register R11(11);
const Register R12(12);
const Register R13(13);
const Register R14(14);
const Register R15(15);
constexpr Register RAX(0);
constexpr Register RCX(1);
constexpr Register RDX(2);
constexpr Register RBX(3);
constexpr Register RSP(4);
constexpr Register RBP(5);
constexpr Register RSI(6);
constexpr Register RDI(7);
constexpr Register R8(8);
constexpr Register R9(9);
constexpr Register R10(10);
constexpr Register R11(11);
constexpr Register R12(12);
constexpr Register R13(13);
constexpr Register R14(14);
constexpr Register R15(15);
// A set of general purpose registers stored as a bitmask: bit N of 'regs' is set when the
// register with regnum N is a member of the set.
struct RegisterSet {
    using Regs = unsigned int;
    Regs regs;

    // Builds a set directly from a raw bitmask.
    constexpr explicit RegisterSet(Regs regs) : regs(regs) {}
    // Builds a set that contains exactly 'reg'. Deliberately not explicit: expressions such as
    // 'RAX | RCX | RDX' rely on the implicit Register -> RegisterSet conversion.
    constexpr RegisterSet(Register reg) : regs(1ul << reg.regnum) {}

    // The registers treated as callee-save (this is what Register::isCalleeSave() tests against).
    static constexpr RegisterSet getCalleeSave() { return RBX | RSP | RBP | R12 | R13 | R14 | R15; }
    // The default set of registers which are available for allocation.
    static constexpr RegisterSet stdAllocatable() { return RAX | RCX | RDX | RDI | RSI | R8 | R9 | R10 | R11; }

    // Returns true if 'reg' is a member of this set.
    constexpr bool isInside(Register reg) const { return (regs & (1ul << reg.regnum)) != 0; }
    // Returns true if no register is a member of this set.
    constexpr bool empty() const { return regs == 0; }

    // Forward iterator which visits the members of the set in ascending regnum order.
    // It keeps a reference to the set, so the set has to outlive the iterator.
    class iterator {
    public:
        const RegisterSet& set;
        int i; // regnum of the current register, or Register::numRegs() for the end iterator

        iterator(const RegisterSet& set, int i) : set(set), i(i) {}

        // Advances to the next register contained in the set (or to the end position).
        iterator& operator++() {
            do {
                i++;
            } while (i < Register::numRegs() && !set.isInside(Register(i)));
            // Incrementing the end iterator steps past numRegs(); clamp it so that it still
            // compares equal to end().
            if (i > Register::numRegs())
                i = Register::numRegs();
            return *this;
        }

        // Iterators are compared by position only; comparing iterators of different sets is
        // not meaningful.
        bool operator==(const iterator& rhs) const { return i == rhs.i; }
        bool operator!=(const iterator& rhs) const { return !(*this == rhs); }

        Register operator*() const { return Register(i); }
    };

    // Iterator to the member with the lowest regnum, or end() for an empty set.
    iterator begin() const {
        // __builtin_ctz() has an undefined result for 0, so the empty set is handled separately.
        if (empty())
            return end();
        return iterator(*this, __builtin_ctz(regs));
    }
    iterator end() const { return iterator(*this, Register::numRegs()); }

    // Set union and intersection.
    constexpr RegisterSet operator|(RegisterSet b) const { return RegisterSet(regs | b.regs); }
    constexpr RegisterSet operator&(RegisterSet b) const { return RegisterSet(regs & b.regs); }

    // In-place union and intersection; they return *this so that they can be chained like the
    // built-in compound assignment operators.
    RegisterSet& operator|=(RegisterSet b) {
        regs |= b.regs;
        return *this;
    }
    RegisterSet& operator&=(RegisterSet b) {
        regs &= b.regs;
        return *this;
    }
};
// Combines two single registers into a RegisterSet which contains both of them.
// The operator is only declared inside Register (where RegisterSet is just forward declared);
// it is defined here, after the RegisterSet definition.
constexpr RegisterSet Register::operator|(Register b) const {
return RegisterSet(*this) | RegisterSet(b);
}
inline bool Register::isCalleeSave() {
return *this == RBX || *this == RSP || *this == RBP || regnum >= 12;
return RegisterSet::getCalleeSave().isInside(*this);
}
struct Indirect {
......
......@@ -43,6 +43,8 @@ static llvm::DenseMap<CFGBlock*, std::vector<void*>> block_patch_locations;
// asm volatile ("" ::: "r14");
// asm volatile ("" ::: "r13");
// asm volatile ("" ::: "r12");
// asm volatile ("" ::: "rbx");
// asm volatile ("" ::: "rbp");
// char scratch[256+16];
// foo(scratch);
// }
......@@ -50,15 +52,16 @@ static llvm::DenseMap<CFGBlock*, std::vector<void*>> block_patch_locations;
// It omits the frame pointer but saves rbx, rbp, r12, r13, r14 and r15
// use 'objdump -s -j .eh_frame <obj.file>' to dump it
const unsigned char eh_info[]
= { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x7a, 0x52, 0x00, 0x01, 0x78, 0x10, 0x01,
0x1b, 0x0c, 0x07, 0x08, 0x90, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x00, 0x42, 0x0e, 0x10, 0x42, 0x0e, 0x18, 0x42,
0x0e, 0x20, 0x42, 0x0e, 0x28, 0x47, 0x0e, 0xc0, 0x02, 0x8c, 0x05, 0x8d, 0x04, 0x8e, 0x03, 0x8f,
0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
= { 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03, 0x7a, 0x52, 0x00, 0x01, 0x78, 0x10, 0x01, 0x1b,
0x0c, 0x07, 0x08, 0x90, 0x01, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x00, 0x41, 0x0e, 0x10, 0x42, 0x0e, 0x18, 0x42, 0x0e, 0x20, 0x42,
0x0e, 0x28, 0x42, 0x0e, 0x30, 0x41, 0x0e, 0x38, 0x47, 0x0e, 0xd0, 0x02, 0x83, 0x07, 0x8c, 0x06, 0x8d,
0x05, 0x8e, 0x04, 0x8f, 0x03, 0x86, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
static_assert(JitCodeBlock::num_stack_args == 2, "have to update EH table!");
static_assert(JitCodeBlock::scratch_size == 256, "have to update EH table!");
constexpr int code_size = JitCodeBlock::memory_size - sizeof(eh_info);
constexpr assembler::RegisterSet JitCodeBlock::additional_regs;
JitCodeBlock::MemoryManager::MemoryManager() {
int protection = PROT_READ | PROT_WRITE | PROT_EXEC;
......@@ -86,10 +89,12 @@ JitCodeBlock::JitCodeBlock(llvm::StringRef name)
uint8_t* code = a.curInstPointer();
// emit prolog
a.push(assembler::RBP);
a.push(assembler::R15);
a.push(assembler::R14);
a.push(assembler::R13);
a.push(assembler::R12);
a.push(assembler::RBX);
static_assert(sp_adjustment % 16 == 8, "stack isn't aligned");
a.sub(assembler::Immediate(sp_adjustment), assembler::RSP);
a.mov(assembler::RDI, assembler::R13); // interpreter pointer
......@@ -131,9 +136,10 @@ std::unique_ptr<JitFragmentWriter> JitCodeBlock::newFragment(CFGBlock* block, in
void* fragment_start = a.curInstPointer() - patch_jump_offset;
long fragment_offset = a.bytesWritten() - patch_jump_offset;
long bytes_left = a.bytesLeft() + patch_jump_offset;
constexpr assembler::RegisterSet bjit_allocatable_regs = assembler::RegisterSet::stdAllocatable() | additional_regs;
std::unique_ptr<ICInfo> ic_info(new ICInfo(fragment_start, nullptr, nullptr, stack_info, bytes_left,
llvm::CallingConv::C, live_outs, assembler::RAX, 0,
std::vector<Location>()));
std::vector<Location>(), bjit_allocatable_regs));
std::unique_ptr<ICSlotRewrite> rewrite = ic_info->startRewrite("");
return std::unique_ptr<JitFragmentWriter>(new JitFragmentWriter(
......@@ -156,13 +162,6 @@ void JitCodeBlock::fragmentFinished(int bytes_written, int num_bytes_overlapping
ic_info.appendDecrefInfosTo(decref_infos);
}
static const assembler::Register bjit_allocatable_regs[]
= { assembler::RAX, assembler::RCX, assembler::RDX,
// no RSP
// no RBP
assembler::RDI, assembler::RSI, assembler::R8, assembler::R9,
assembler::R10, assembler::R11, assembler::R12, assembler::R15 };
JitFragmentWriter::JitFragmentWriter(CFGBlock* block, std::unique_ptr<ICInfo> ic_info,
std::unique_ptr<ICSlotRewrite> rewrite, int code_offset, int num_bytes_overlapping,
void* entry_code, JitCodeBlock& code_block)
......@@ -175,7 +174,6 @@ JitFragmentWriter::JitFragmentWriter(CFGBlock* block, std::unique_ptr<ICInfo> ic
code_block(code_block),
interp(0),
ic_info(std::move(ic_info)) {
allocatable_regs = bjit_allocatable_regs;
added_changing_action = true;
......@@ -1062,10 +1060,12 @@ void JitFragmentWriter::_emitJump(CFGBlock* b, RewriterVar* block_next, ExitInfo
exit_info.exit_start = assembler->curInstPointer();
block_next->getInReg(assembler::RAX, true);
assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::RBX);
assembler->pop(assembler::R12);
assembler->pop(assembler::R13);
assembler->pop(assembler::R14);
assembler->pop(assembler::R15);
assembler->pop(assembler::RBP);
assembler->retq();
// make sure we have at least 'min_patch_size' of bytes available.
......@@ -1097,10 +1097,12 @@ void JitFragmentWriter::_emitOSRPoint() {
assembler->clear_reg(assembler::RAX); // = next block to execute
assembler->mov(assembler::Immediate(ASTInterpreterJitInterface::osr_dummy_value), assembler::RDX);
assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::RBX);
assembler->pop(assembler::R12);
assembler->pop(assembler::R13);
assembler->pop(assembler::R14);
assembler->pop(assembler::R15);
assembler->pop(assembler::RBP);
assembler->retq();
}
interp->bumpUse();
......@@ -1134,7 +1136,14 @@ void JitFragmentWriter::_emitPPCall(RewriterVar* result, void* func_addr, llvm::
uint8_t* pp_end = rewrite->getSlotStart() + assembler->bytesWritten();
assert(assembler->hasFailed() || (pp_start + pp_size + call_size == pp_end));
std::unique_ptr<ICSetupInfo> setup_info(ICSetupInfo::initialize(true, pp_size, ICSetupInfo::Generic, NULL));
assembler::RegisterSet regs = assembler::RegisterSet::stdAllocatable();
for (assembler::Register reg : JitCodeBlock::additional_regs) {
if (vars_by_location.count(reg) == 0)
regs |= assembler::RegisterSet(reg);
}
std::unique_ptr<ICSetupInfo> setup_info(ICSetupInfo::initialize(true, pp_size, ICSetupInfo::Generic, NULL, regs));
// calculate available scratch space
int pp_scratch_size = 0;
......@@ -1190,10 +1199,12 @@ void JitFragmentWriter::_emitReturn(RewriterVar* return_val) {
return_val->getInReg(assembler::RDX, true);
assembler->clear_reg(assembler::RAX);
assembler->add(assembler::Immediate(JitCodeBlock::sp_adjustment), assembler::RSP);
assembler->pop(assembler::RBX);
assembler->pop(assembler::R12);
assembler->pop(assembler::R13);
assembler->pop(assembler::R14);
assembler->pop(assembler::R15);
assembler->pop(assembler::RBP);
assembler->retq();
return_val->bumpUse();
}
......
......@@ -73,9 +73,9 @@ class JitFragmentWriter;
// register or stack slot but we aren't if it outlives the block - we have to store it in the interpreter instance.
//
// We use the following callee-save regs to speed up the generated code:
// r12, r15: temporary values
// r13: pointer to ASTInterpreter instance
// r14: pointer to the vregs array
// rbx, rbp, r12, r15: temporary values
// r13 : pointer to ASTInterpreter instance
// r14 : pointer to the vregs array
//
// To execute a specific CFGBlock one has to call:
// CFGBlock* block;
......@@ -94,10 +94,12 @@ class JitFragmentWriter;
//
// Basic layout of generated code block is:
// entry_code:
// push %rbp ; save rbp
// push %r15 ; save r15
// push %r14 ; save r14
// push %r13 ; save r13
// push %r12 ; save r12
// push %rbx ; save rbx
// sub $0x118,%rsp ; setup scratch, 0x118 = scratch_size + 16 (space for two func args passed on the
// stack) + 8 bytes for stack alignment
// mov %rdi,%r13 ; copy the pointer to ASTInterpreter instance into r13
......@@ -113,10 +115,12 @@ class JitFragmentWriter;
// jne end_side_exit
// movabs $0x215bb60,%rax ; rax = CFGBlock* to interpret next (rax is the 1. return reg)
// add $0x118,%rsp ; restore stack pointer
// pop %rbx ; restore rbx
// pop %r12 ; restore r12
// pop %r13 ; restore r13
// pop %r14 ; restore r14
// pop %r15 ; restore r15
// pop %rbp ; restore rbp
// ret ; exit to the interpreter which will interpret the specified CFGBlock*
// end_side_exit:
// ....
......@@ -128,10 +132,12 @@ class JitFragmentWriter;
// in this case 0 which means we are finished
// movabs $0x1270014108,%rdx ; rdx must contain the Box* value to return
// add $0x118,%rsp ; restore stack pointer
// pop %rbx ; restore rbx
// pop %r12 ; restore r12
// pop %r13 ; restore r13
// pop %r14 ; restore r14
// pop %r15 ; restore r15
// pop %rbp ; restore rbp
// ret
//
// nth_JitFragment:
......@@ -148,6 +154,8 @@ public:
// scratch size + space for passing additional args on the stack without having to adjust the SP when calling
// functions with more than 6 args.
static constexpr int sp_adjustment = scratch_size + num_stack_args * 8 + 8 /* = alignment */;
static constexpr assembler::RegisterSet additional_regs = assembler::RBX | assembler::RBP | assembler::R12
| assembler::R15;
private:
struct MemoryManager {
......
......@@ -44,8 +44,10 @@ int ICSetupInfo::totalSize() const {
static std::vector<std::pair<PatchpointInfo*, void* /* addr of func to call */>> new_patchpoints;
ICSetupInfo* ICSetupInfo::initialize(bool has_return_value, int size, ICType type, TypeRecorder* type_recorder) {
ICSetupInfo* rtn = new ICSetupInfo(type, size, has_return_value, type_recorder);
ICSetupInfo* ICSetupInfo::initialize(bool has_return_value, int size, ICType type, TypeRecorder* type_recorder,
assembler::RegisterSet allocatable_regs) {
ICSetupInfo* rtn = new ICSetupInfo(type, size, has_return_value, type_recorder, allocatable_regs);
// We use size == CALL_ONLY_SIZE to imply that the call isn't patchable
assert(rtn->totalSize() > CALL_ONLY_SIZE);
......
......@@ -20,6 +20,7 @@
#include "llvm/IR/CallingConv.h"
#include "asm_writing/types.h"
#include "codegen/stackmaps.h"
#include "core/common.h"
......@@ -64,14 +65,20 @@ public:
};
private:
ICSetupInfo(ICType type, int size, bool has_return_value, TypeRecorder* type_recorder)
: type(type), size(size), has_return_value(has_return_value), type_recorder(type_recorder) {}
ICSetupInfo(ICType type, int size, bool has_return_value, TypeRecorder* type_recorder,
assembler::RegisterSet allocatable_regs)
: type(type),
size(size),
has_return_value(has_return_value),
type_recorder(type_recorder),
allocatable_regs(allocatable_regs) {}
public:
const ICType type;
const int size;
const bool has_return_value;
TypeRecorder* const type_recorder;
assembler::RegisterSet allocatable_regs;
int totalSize() const;
bool hasReturnValue() const { return has_return_value; }
......@@ -90,7 +97,8 @@ public:
return llvm::CallingConv::C;
}
static ICSetupInfo* initialize(bool has_return_value, int size, ICType type, TypeRecorder* type_recorder);
static ICSetupInfo* initialize(bool has_return_value, int size, ICType type, TypeRecorder* type_recorder,
assembler::RegisterSet allocatable_regs = assembler::RegisterSet::stdAllocatable());
};
struct PatchpointInfo {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment