diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt
index 73c3ab54722e..52c796518ac0 100644
--- a/bolt/CMakeLists.txt
+++ b/bolt/CMakeLists.txt
@@ -82,7 +82,8 @@ endforeach()
 
 set(BOLT_ENABLE_RUNTIME_default OFF)
 if ((CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64"
-      OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
+      OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$"
+      OR CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
     AND (CMAKE_SYSTEM_NAME STREQUAL "Linux"
       OR CMAKE_SYSTEM_NAME STREQUAL "Darwin")
     AND (NOT CMAKE_CROSSCOMPILING))
diff --git a/bolt/lib/Core/Relocation.cpp b/bolt/lib/Core/Relocation.cpp
index ff1681f82398..f099dfa46f3d 100644
--- a/bolt/lib/Core/Relocation.cpp
+++ b/bolt/lib/Core/Relocation.cpp
@@ -123,6 +123,7 @@ static bool isSupportedRISCV(uint32_t Type) {
   case ELF::R_RISCV_LO12_S:
   case ELF::R_RISCV_64:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
   case ELF::R_RISCV_TPREL_HI20:
   case ELF::R_RISCV_TPREL_ADD:
   case ELF::R_RISCV_TPREL_LO12_I:
@@ -236,6 +237,7 @@ static size_t getSizeForTypeRISCV(uint32_t Type) {
   case ELF::R_RISCV_64:
   case ELF::R_RISCV_GOT_HI20:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
     // See extractValueRISCV for why this is necessary.
     return 8;
   }
@@ -491,6 +493,7 @@ static uint64_t extractValueRISCV(uint32_t Type, uint64_t Contents,
     return extractBImmRISCV(Contents);
   case ELF::R_RISCV_GOT_HI20:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
     // We need to know the exact address of the GOT entry so we extract the
     // value from both the AUIPC and L[D|W]. We cannot rely on the symbol in the
     // relocation for this since it simply refers to the object that is stored
@@ -707,6 +710,7 @@ static bool isPCRelativeRISCV(uint32_t Type) {
   case ELF::R_RISCV_RVC_BRANCH:
   case ELF::R_RISCV_32_PCREL:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
     return true;
   }
 }
diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp
index a03df3de39e8..37dcfa868c21 100644
--- a/bolt/lib/Rewrite/RewriteInstance.cpp
+++ b/bolt/lib/Rewrite/RewriteInstance.cpp
@@ -2926,12 +2926,12 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection,
 
   if (BinaryData *BD = BC->getBinaryDataContainingAddress(SymbolAddress)) {
     // Note: this assertion is trying to check sanity of BinaryData objects
-    // but AArch64 has inferred and incomplete object locations coming from
-    // GOT/TLS or any other non-trivial relocation (that requires creation
-    // of sections and whose symbol address is not really what should be
-    // encoded in the instruction). So we essentially disabled this check
+    // but AArch64 and RISCV have inferred and incomplete object locations
+    // coming from GOT/TLS or any other non-trivial relocation (that requires
+    // creation of sections and whose symbol address is not really what should
+    // be encoded in the instruction). So we essentially disabled this check
     // for AArch64 and live with bogus names for objects.
-    assert((IsAArch64 || IsSectionRelocation ||
+    assert((IsAArch64 || BC->isRISCV() || IsSectionRelocation ||
            BD->nameStartsWith(SymbolName) ||
            BD->nameStartsWith("PG" + SymbolName) ||
            (BD->nameStartsWith("ANONYMOUS") &&
diff --git a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
index 4320c679acd5..0e27d29019e9 100644
--- a/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
+++ b/bolt/lib/Target/RISCV/RISCVMCPlusBuilder.cpp
@@ -14,7 +14,9 @@
 #include "MCTargetDesc/RISCVMCTargetDesc.h"
 #include "bolt/Core/MCPlusBuilder.h"
 #include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -72,6 +74,7 @@ public:
   case ELF::R_RISCV_LO12_I:
   case ELF::R_RISCV_LO12_S:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
     return true;
   default:
     llvm_unreachable("Unexpected RISCV relocation type in code");
@@ -252,6 +255,11 @@ public:
     return createCall(RISCV::PseudoCALL, Inst, Target, Ctx);
   }
 
+  void createLongTailCall(InstructionListType &Seq, const MCSymbol *Target,
+                          MCContext *Ctx) override {
+    createShortJmp(Seq, Target, Ctx, /*IsTailCall*/ true);
+  }
+
   void createTailCall(MCInst &Inst, const MCSymbol *Target,
                       MCContext *Ctx) override {
     return createCall(RISCV::PseudoTAIL, Inst, Target, Ctx);
@@ -424,6 +432,7 @@ public:
     return Expr;
   case ELF::R_RISCV_GOT_HI20:
   case ELF::R_RISCV_TLS_GOT_HI20:
+  case ELF::R_RISCV_TLS_GD_HI20:
     // The GOT is reused so no need to create GOT relocations
   case ELF::R_RISCV_PCREL_HI20:
     return RISCVMCExpr::create(Expr, RISCVMCExpr::VK_PCREL_HI, Ctx);
@@ -483,6 +492,375 @@ public:
       return 2;
     return 4;
   }
+
+  void createStackPointerIncrement(
+      MCInst &Inst, int imm,
+      bool NoFlagsClobber = false /*unused for RISCV*/) const override {
+    Inst = MCInstBuilder(RISCV::ADDI)
+               .addReg(RISCV::X2)
+               .addReg(RISCV::X2)
+               .addImm(-imm);
+  }
+
+  void createStackPointerDecrement(
+      MCInst &Inst, int imm,
+      bool NoFlagsClobber = false /*unused for RISCV*/) const override {
+    Inst = MCInstBuilder(RISCV::ADDI)
+               .addReg(RISCV::X2)
+               .addReg(RISCV::X2)
+               .addImm(imm);
+  }
+
+  void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From,
+               int64_t offset) const {
+    Inst = MCInstBuilder(RISCV::LD).addReg(To).addReg(From).addImm(offset);
+  }
+
+  void storeReg(MCInst &Inst, MCPhysReg From, MCPhysReg To,
+                int64_t offset) const {
+    Inst = MCInstBuilder(RISCV::SD).addReg(From).addReg(To).addImm(offset);
+  }
+
+  void spillRegs(InstructionListType &Insts,
+                 const SmallVector<MCPhysReg> &Regs) const {
+    Insts.emplace_back();
+    createStackPointerIncrement(Insts.back(), Regs.size() * 8);
+
+    int64_t Offset = 0;
+    for (auto Reg : Regs) {
+      Insts.emplace_back();
+      storeReg(Insts.back(), Reg, RISCV::X2, Offset);
+      Offset += 8;
+    }
+  }
+
+  void reloadRegs(InstructionListType &Insts,
+                  const SmallVector<MCPhysReg> &Regs) const {
+    int64_t Offset = 0;
+    for (auto Reg : Regs) {
+      Insts.emplace_back();
+      loadReg(Insts.back(), Reg, RISCV::X2, Offset);
+      Offset += 8;
+    }
+
+    Insts.emplace_back();
+    createStackPointerDecrement(Insts.back(), Regs.size() * 8);
+  }
+
+  void atomicAdd(MCInst &Inst, MCPhysReg RegAtomic, MCPhysReg RegTo,
+                 MCPhysReg RegCnt) const {
+    Inst = MCInstBuilder(RISCV::AMOADD_D)
+               .addReg(RegAtomic)
+               .addReg(RegTo)
+               .addReg(RegCnt);
+  }
+
+  InstructionListType createCmpJE(MCPhysReg RegNo, MCPhysReg RegTmp,
+                                  const MCSymbol *Target,
+                                  MCContext *Ctx) const {
+    InstructionListType Insts;
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::SUB).addReg(RegTmp).addReg(RegNo).addReg(RegNo));
+    Insts.emplace_back(MCInstBuilder(RISCV::BEQ)
+                           .addReg(RegNo)
+                           .addReg(RegTmp)
+                           .addExpr(MCSymbolRefExpr::create(
+                               Target, MCSymbolRefExpr::VK_None, *Ctx)));
+    return Insts;
+  }
+
+  void createTrap(MCInst &Inst) const override {
+    Inst.clear();
+    Inst.setOpcode(RISCV::EBREAK);
+  }
+
+  void createShortJmp(InstructionListType &Seq, const MCSymbol *Target,
+                      MCContext *Ctx, bool IsTailCall) override {
+    // The sequence of instructions we create here is the following:
+    //   auipc x5, hi20(Target)
+    //   addi x5, x5, low12(Target)
+    //   jr x5 => jalr x0, x5, 0
+    MCPhysReg Reg = RISCV::X5;
+    InstructionListType Insts = materializeAddress(Target, Ctx, Reg);
+    Insts.emplace_back();
+    MCInst &Inst = Insts.back();
+    Inst.clear();
+    Inst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Reg).addImm(0);
+    if (IsTailCall)
+      setTailCall(Inst);
+    Seq.swap(Insts);
+  }
+
+  InstructionListType createGetter(MCContext *Ctx, const char *name) const {
+    InstructionListType Insts(4);
+    MCSymbol *Locs = Ctx->getOrCreateSymbol(name);
+    InstructionListType Addr = materializeAddress(Locs, Ctx, RISCV::X10);
+    std::copy(Addr.begin(), Addr.end(), Insts.begin());
+    loadReg(Insts[2], RISCV::X10, RISCV::X10, 0);
+    createReturn(Insts[3]);
+    return Insts;
+  }
+
+  InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegCnt,
+                                      MCPhysReg RegAtomic) const {
+    InstructionListType Insts;
+    Insts.emplace_back();
+    Insts.back() =
+        MCInstBuilder(RISCV::ADDI).addReg(RegCnt).addReg(RegAtomic).addImm(1);
+    Insts.emplace_back();
+    atomicAdd(Insts.back(), RegAtomic, RegTo, RegCnt);
+    return Insts;
+  }
+
+  InstructionListType materializeAddress(const MCSymbol *Target,
+                                         MCContext *Ctx, MCPhysReg RegName,
+                                         int64_t Addend = 0) const override {
+    // Get the symbol address by auipc + addi
+    InstructionListType Insts(2);
+    MCSymbol *AuipcLabel = Ctx->createNamedTempSymbol("pcrel_hi");
+    Insts[0] = MCInstBuilder(RISCV::AUIPC).addReg(RegName).addImm(0);
+    setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, Addend, Ctx,
+                          ELF::R_RISCV_PCREL_HI20);
+    setInstLabel(Insts[0], AuipcLabel);
+
+    Insts[1] =
+        MCInstBuilder(RISCV::ADDI).addReg(RegName).addReg(RegName).addImm(0);
+    setOperandToSymbolRef(Insts[1], /* OpNum */ 2, AuipcLabel, Addend, Ctx,
+                          ELF::R_RISCV_PCREL_LO12_I);
+    return Insts;
+  }
+
+  InstructionListType
+  createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
+                       unsigned CodePointerSize) const override {
+    // We need 2 scratch registers: one for the target address (x10), and one
+    // for the increment value (x11).
+    // The emitted sequence is:
+    //   addi sp, sp, -16
+    //   sd x10, 0(sp)
+    //   sd x11, 8(sp)
+    //   la x10, target  # 1: auipc x10, %pcrel_hi(target)
+    //                   #    addi x10, x10, %pcrel_lo(1b)
+    //   li x11, 1       # addi x11, zero, 1
+    //   amoadd.d zero, x10, x11
+    //   ld x10, 0(sp)
+    //   ld x11, 8(sp)
+    //   addi sp, sp, 16
+
+    InstructionListType Insts;
+    spillRegs(Insts, {RISCV::X10, RISCV::X11});
+    InstructionListType Addr = materializeAddress(Target, Ctx, RISCV::X10);
+    Insts.insert(Insts.end(), Addr.begin(), Addr.end());
+    InstructionListType IncInsts =
+        createIncMemory(RISCV::X10, RISCV::X11, RISCV::X0);
+    Insts.insert(Insts.end(), IncInsts.begin(), IncInsts.end());
+    reloadRegs(Insts, {RISCV::X10, RISCV::X11});
+    return Insts;
+  }
+
+  void createDirectCall(MCInst &Inst, const MCSymbol *Target, MCContext *Ctx,
+                        bool IsTailCall) override {
+    Inst.setOpcode(RISCV::JAL);
+    Inst.clear();
+    if (IsTailCall) {
+      Inst.addOperand(MCOperand::createReg(RISCV::X0));
+      Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+          Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+          *Ctx, 0)));
+      convertJmpToTailCall(Inst);
+    } else {
+      Inst.addOperand(MCOperand::createReg(RISCV::X1));
+      Inst.addOperand(MCOperand::createExpr(getTargetExprFor(
+          Inst, MCSymbolRefExpr::create(Target, MCSymbolRefExpr::VK_None, *Ctx),
+          *Ctx, 0)));
+    }
+  }
+
+  void createIndirectCallInst(MCInst &Inst, bool IsTailCall, MCPhysReg Reg,
+                              int64_t Disp) const {
+    Inst.clear();
+    Inst.setOpcode(RISCV::JALR);
+    if (IsTailCall) {
+      Inst.addOperand(MCOperand::createReg(RISCV::X0));
+      Inst.addOperand(MCOperand::createReg(Reg));
+      Inst.addOperand(MCOperand::createImm(Disp));
+    } else {
+      Inst.addOperand(MCOperand::createReg(RISCV::X1));
+      Inst.addOperand(MCOperand::createReg(Reg));
+      Inst.addOperand(MCOperand::createImm(Disp));
+    }
+  }
+
+  InstructionListType
+  createInstrumentedIndCallHandlerEntryBB(const MCSymbol *InstrTrampoline,
+                                          const MCSymbol *IndCallHandler,
+                                          MCContext *Ctx) override {
+    // Code sequence used to check whether InstrTrampoline was initialized
+    // and call it if so; returns via IndCallHandler:
+    //   addi sp, sp, -16
+    //   sd x10, 0(sp)
+    //   sd x11, 8(sp)
+    //   la x10, InstrTrampoline  # auipc + addi
+    //   ld x10, 0(x10)
+    //   beq x10, x11, IndCallHandler
+    //   addi sp, sp, -16
+    //   sd x1, 0(sp)
+    //   jalr x1, x10, 0
+    //   ld x1, 0(sp)
+    //   addi sp, sp, 16
+    //   jal x0, IndCallHandler
+
+    InstructionListType Insts;
+    spillRegs(Insts, {RISCV::X10, RISCV::X11});
+    InstructionListType Addr =
+        materializeAddress(InstrTrampoline, Ctx, RISCV::X10);
+    Insts.insert(Insts.end(), Addr.begin(), Addr.end());
+    Insts.emplace_back();
+    loadReg(Insts.back(), RISCV::X10, RISCV::X10, 0);
+    InstructionListType cmpJmp =
+        createCmpJE(RISCV::X10, RISCV::X11, IndCallHandler, Ctx);
+    Insts.insert(Insts.end(), cmpJmp.begin(), cmpJmp.end());
+    Insts.emplace_back();
+    createStackPointerIncrement(Insts.back(), 16);
+    Insts.emplace_back();
+    storeReg(Insts.back(), RISCV::X1, RISCV::X2, 0);
+    Insts.emplace_back();
+    createIndirectCallInst(Insts.back(), /*IsTailCall*/ false, RISCV::X10, 0);
+    Insts.emplace_back();
+    loadReg(Insts.back(), RISCV::X1, RISCV::X2, 0);
+    Insts.emplace_back();
+    createStackPointerDecrement(Insts.back(), 16);
+    Insts.emplace_back();
+    createDirectCall(Insts.back(), IndCallHandler, Ctx, /*IsTailCall*/ true);
+    return Insts;
+  }
+
+  InstructionListType createInstrumentedIndCallHandlerExitBB() const override {
+    InstructionListType Insts;
+    reloadRegs(Insts, {RISCV::X10, RISCV::X11});
+    Insts.emplace_back();
+    loadReg(Insts.back(), RISCV::X5, RISCV::X2, 0);
+    Insts.emplace_back();
+    createStackPointerDecrement(Insts.back(), 16);
+    reloadRegs(Insts, {RISCV::X10, RISCV::X11});
+    Insts.emplace_back();
+    createIndirectCallInst(Insts.back(), /*IsTailCall*/ true, RISCV::X5, 0);
+    return Insts;
+  }
+
+  InstructionListType
+  createInstrumentedIndTailCallHandlerExitBB() const override {
+    return createInstrumentedIndCallHandlerExitBB();
+  }
+
+  std::vector<MCInst> createSymbolTrampoline(const MCSymbol *TgtSym,
+                                             MCContext *Ctx) override {
+    std::vector<MCInst> Insts;
+    createShortJmp(Insts, TgtSym, Ctx, /*IsTailCall*/ true);
+    return Insts;
+  }
+
+  InstructionListType createNumCountersGetter(MCContext *Ctx) const override {
+    return createGetter(Ctx, "__bolt_num_counters");
+  }
+
+  InstructionListType
+  createInstrLocationsGetter(MCContext *Ctx) const override {
+    return createGetter(Ctx, "__bolt_instr_locations");
+  }
+
+  InstructionListType createInstrTablesGetter(MCContext *Ctx) const override {
+    return createGetter(Ctx, "__bolt_instr_tables");
+  }
+
+  InstructionListType
+  createInstrNumFuncsGetter(MCContext *Ctx) const override {
+    return createGetter(Ctx, "__bolt_instr_num_funcs");
+  }
+
+  void convertIndirectCallToLoad(MCInst &Inst, MCPhysReg Reg,
+                                 MCPhysReg ZeroReg) const {
+    bool IsTailCall = isTailCall(Inst);
+    if (IsTailCall)
+      removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
+    Inst.setOpcode(RISCV::ADD);
+    Inst.insert(Inst.begin(), MCOperand::createReg(Reg));
+    Inst.insert(Inst.begin() + 1, MCOperand::createReg(ZeroReg));
+    return;
+  }
+
+  InstructionListType createLoadImmediate(const MCPhysReg Dest,
+                                          uint64_t Imm) const override {
+    InstructionListType Insts;
+    // Build the higher 32 bits of Imm.
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::LUI).addReg(Dest).addImm((Imm >> 44) & 0xFFFFF));
+    Insts.emplace_back(MCInstBuilder(RISCV::LUI)
+                           .addReg(RISCV::X5)
+                           .addImm((Imm >> 32) & 0xFFF));
+    Insts.emplace_back(MCInstBuilder(RISCV::SRLI)
+                           .addReg(RISCV::X5)
+                           .addReg(RISCV::X5)
+                           .addImm(12));
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::OR).addReg(Dest).addReg(Dest).addReg(RISCV::X5));
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::SLLI).addReg(Dest).addReg(Dest).addImm(32));
+
+    // Build the lower 32 bits of Imm.
+    Insts.emplace_back(MCInstBuilder(RISCV::LUI)
+                           .addReg(RISCV::X5)
+                           .addImm((Imm >> 12) & 0xFFFFF));
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::LUI).addReg(RISCV::X6).addImm(Imm & 0xFFF));
+    Insts.emplace_back(MCInstBuilder(RISCV::SRLI)
+                           .addReg(RISCV::X6)
+                           .addReg(RISCV::X6)
+                           .addImm(12));
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::OR).addReg(RISCV::X5).addReg(RISCV::X5).addReg(
+            RISCV::X6));
+
+    // Combine into the full 64-bit Imm.
+    Insts.emplace_back(
+        MCInstBuilder(RISCV::OR).addReg(Dest).addReg(Dest).addReg(RISCV::X5));
+    return Insts;
+  }
+
+  InstructionListType createInstrumentedIndirectCall(MCInst &&CallInst,
+                                                     MCSymbol *HandlerFuncAddr,
+                                                     int CallSiteID,
+                                                     MCContext *Ctx) override {
+    // Code sequence used to enter indirect call instrumentation helper:
+    //   addi sp, sp, -0x10
+    //   sd a0, 0x0(sp)
+    //   sd a1, 0x8(sp)
+    //   mov call target to a0   (convertIndirectCallToLoad -> add a0, zero, target)
+    //   mov CallSiteID to a1    (createLoadImmediate)
+    //   addi sp, sp, -0x10
+    //   sd a0, 0x0(sp)
+    //   sd a1, 0x8(sp)
+    //   la x5, *HandlerFuncAddr (auipc + addi)
+    //   jalr x5
+
+    InstructionListType Insts;
+    spillRegs(Insts, {RISCV::X10, RISCV::X11});
+    Insts.emplace_back(CallInst);
+    convertIndirectCallToLoad(Insts.back(), RISCV::X10, RISCV::X0);
+    InstructionListType LoadImm = createLoadImmediate(RISCV::X11, CallSiteID);
+    Insts.insert(Insts.end(), LoadImm.begin(), LoadImm.end());
+    spillRegs(Insts, {RISCV::X10, RISCV::X11});
+    InstructionListType Addr =
+        materializeAddress(HandlerFuncAddr, Ctx, RISCV::X5);
+    Insts.insert(Insts.end(), Addr.begin(), Addr.end());
+    Insts.emplace_back();
+    createIndirectCallInst(Insts.back(), isTailCall(CallInst), RISCV::X5, 0);
+
+    // Carry over metadata including tail call marker if present.
+    stripAnnotations(Insts.back());
+    moveAnnotations(std::move(CallInst), Insts.back());
+
+    return Insts;
+  }
 };
 
 } // end anonymous namespace
diff --git a/bolt/runtime/CMakeLists.txt b/bolt/runtime/CMakeLists.txt
index 0deb69a27d43..87cc44812da1 100644
--- a/bolt/runtime/CMakeLists.txt
+++ b/bolt/runtime/CMakeLists.txt
@@ -35,15 +35,21 @@ set(BOLT_RT_FLAGS
   -fno-exceptions
   -fno-rtti
   -fno-stack-protector
-  -fPIC
-  -mgeneral-regs-only)
+  -fPIC)
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64")
-  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-sse")
+  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS}
+    -mno-sse
+    -mgeneral-regs-only)
+endif()
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+  set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS})
 endif()
 if (CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
   check_cxx_compiler_flag("-mno-outline-atomics" CXX_SUPPORTS_OUTLINE_ATOMICS)
   if (CXX_SUPPORTS_OUTLINE_ATOMICS)
-    set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS} "-mno-outline-atomics")
+    set(BOLT_RT_FLAGS ${BOLT_RT_FLAGS}
+      -mno-outline-atomics
+      -mgeneral-regs-only)
   endif()
 endif()
diff --git a/bolt/runtime/common.h b/bolt/runtime/common.h
index 27d083007106..3461f8c545f9 100644
--- a/bolt/runtime/common.h
+++ b/bolt/runtime/common.h
@@ -153,10 +153,12 @@ struct timespec {
 
 #if defined(__aarch64__) || defined(__arm64__)
 #include "sys_aarch64.h"
+#elif defined(__riscv)
+#include "sys_riscv64.h"
 #elif defined(__x86_64__)
 #include "sys_x86_64.h"
 #else
-#error "For AArch64/ARM64 and X86_64 only."
+#error "For AArch64/ARM64, X86_64 and RISCV64 only."
 #endif
 
 constexpr uint32_t BufSize = 10240;
diff --git a/bolt/runtime/instr.cpp b/bolt/runtime/instr.cpp
index d1f8a216badc..ae356e71cbe4 100644
--- a/bolt/runtime/instr.cpp
+++ b/bolt/runtime/instr.cpp
@@ -1674,6 +1674,19 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_call()
                        "ret\n"
                        :::);
   // clang-format on
+#elif defined(__riscv)
+  // clang-format off
+  __asm__ __volatile__(
+      SAVE_ALL
+      "addi sp, sp, 288\n"
+      "ld x10, 0(sp)\n"
+      "ld x11, 8(sp)\n"
+      "addi sp, sp, -288\n"
+      "jal x1, instrumentIndirectCall\n"
+      RESTORE_ALL
+      "ret\n"
+      :::);
+  // clang-format on
 #else
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
@@ -1698,6 +1711,18 @@ extern "C" __attribute((naked)) void __bolt_instr_indirect_tailcall()
                        "ret\n"
                        :::);
   // clang-format on
+#elif defined(__riscv)
+  // clang-format off
+  __asm__ __volatile__(SAVE_ALL
+                       "addi sp, sp, 288\n"
+                       "ld x10, 0(sp)\n"
+                       "ld x11, 8(sp)\n"
+                       "addi sp, sp, -288\n"
+                       "jal x1, instrumentIndirectCall\n"
+                       RESTORE_ALL
+                       "ret\n"
+                       :::);
+  // clang-format on
 #else
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
@@ -1724,6 +1749,18 @@ extern "C" __attribute((naked)) void __bolt_instr_start()
                        "br x16\n"
                        :::);
   // clang-format on
+#elif defined(__riscv)
+  // clang-format off
+  __asm__ __volatile__(
+      SAVE_ALL
+      "jal x1, __bolt_instr_setup\n"
+      RESTORE_ALL
+      "setup_symbol:\n"
+      "auipc x5, %%pcrel_hi(__bolt_start_trampoline)\n"
+      "addi x5, x5, %%pcrel_lo(setup_symbol)\n"
+      "jr x5\n"
+      :::);
+  // clang-format on
 #else
   // clang-format off
   __asm__ __volatile__(SAVE_ALL
@@ -1746,6 +1783,17 @@ extern "C" void __bolt_instr_fini() {
                        RESTORE_ALL
                        :::);
   // clang-format on
+#elif defined(__riscv)
+  // clang-format off
+  __asm__ __volatile__(
+      SAVE_ALL
+      "fini_symbol:\n"
+      "auipc x5, %%pcrel_hi(__bolt_fini_trampoline)\n"
+      "addi x5, x5, %%pcrel_lo(fini_symbol)\n"
+      "jalr x1, 0(x5)\n"
+      RESTORE_ALL
+      :::);
+  // clang-format on
 #else
   __asm__ __volatile__("call __bolt_fini_trampoline\n" :::);
 #endif
diff --git a/bolt/runtime/sys_riscv64.h b/bolt/runtime/sys_riscv64.h
new file mode 100644
index 000000000000..00a21e4945f0
--- /dev/null
+++ b/bolt/runtime/sys_riscv64.h
@@ -0,0 +1,460 @@
+#ifndef LLVM_TOOLS_LLVM_BOLT_SYS_RISCV
+#define LLVM_TOOLS_LLVM_BOLT_SYS_RISCV
+
+// Save all registers while keeping 16B stack alignment
+#define SAVE_ALL \
+  "addi sp, sp, -16\n" \
+  "sd x0, 0(sp)\n" \
+  "sd x1, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x2, 0(sp)\n" \
+  "sd x3, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x4, 0(sp)\n" \
+  "sd x5, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x6, 0(sp)\n" \
+  "sd x7, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x8, 0(sp)\n" \
+  "sd x9, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x10, 0(sp)\n" \
+  "sd x11, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x12, 0(sp)\n" \
+  "sd x13, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x14, 0(sp)\n" \
+  "sd x15, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x16, 0(sp)\n" \
+  "sd x17, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x18, 0(sp)\n" \
+  "sd x19, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x20, 0(sp)\n" \
+  "sd x21, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x22, 0(sp)\n" \
+  "sd x23, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x24, 0(sp)\n" \
+  "sd x25, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x26, 0(sp)\n" \
+  "sd x27, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x28, 0(sp)\n" \
+  "sd x29, 8(sp)\n" \
+  "addi sp, sp, -16\n" \
+  "sd x30, 0(sp)\n" \
+  "sd x31, 8(sp)\n"
+// Mirrors SAVE_ALL
+#define RESTORE_ALL \
+  "ld x30, 0(sp)\n" \
+  "ld x31, 8(sp)\n" \
+  "addi sp, sp, 16\n" \
+  "ld x28, 0(sp)\n" \
+  "ld x29, 8(sp)\n" \
\ + "addi sp, sp, 16\n" \ + "ld x26, 0(sp)\n" \ + "ld x27, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x24, 0(sp)\n" \ + "ld x25, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x22, 0(sp)\n" \ + "ld x23, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x20, 0(sp)\n" \ + "ld x21, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x18, 0(sp)\n" \ + "ld x19, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x16, 0(sp)\n" \ + "ld x17, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x14, 0(sp)\n" \ + "ld x15, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x12, 0(sp)\n" \ + "ld x13, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x10, 0(sp)\n" \ + "ld x11, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x8, 0(sp)\n" \ + "ld x9, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x6, 0(sp)\n" \ + "ld x7, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x4, 0(sp)\n" \ + "ld x5, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x2, 0(sp)\n" \ + "ld x3, 8(sp)\n" \ + "addi sp, sp, 16\n" \ + "ld x0, 0(sp)\n" \ + "ld x1, 8(sp)\n" \ + "addi sp, sp, 16\n" + +// Anonymous namespace covering everything but our library entry point +namespace { + +// Get the difference between runtime addrress of .text section and +// static address in section header table. Can be extracted from arbitrary +// pc value recorded at runtime to get the corresponding static address, which +// in turn can be used to search for indirect call description. Needed because +// indirect call descriptions are read-only non-relocatable data. +uint64_t getTextBaseAddress() { + uint64_t DynAddr; + uint64_t StaticAddr; + __asm__ volatile("lla %0, __hot_end\n\t" + "lui %1, %%hi(__hot_end)\n\t" + "addi %1, %1, %%lo(__hot_end)\n\t" + : "=r"(DynAddr), "=r"(StaticAddr)); + return DynAddr - StaticAddr; +} + +uint64_t __read(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t a0 __asm__("a0") = fd; + register const void *a1 __asm__("a1") = buf; + register uint64_t a2 __asm__("a2") = count; + register uint64_t a7 __asm__("a7") = + 63; // Assuming 63 is the syscall number for read + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret) + : "r"(a0), "r"(a1), "r"(a2), "r"(a7) + : "memory"); + return ret; +} + +uint64_t __write(uint64_t fd, const void *buf, uint64_t count) { + uint64_t ret; + register uint64_t a0 __asm__("a0") = fd; + register const void *a1 __asm__("a1") = buf; + register uint64_t a2 __asm__("a2") = count; + register uint32_t a7 __asm__("a7") = + 64; // Assuming 64 is the syscall number for write + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret) + : "r"(a0), "r"(a1), "r"(a2), "r"(a7) + : "memory"); + return ret; +} + +void *__mmap(uint64_t addr, uint64_t size, uint64_t prot, uint64_t flags, + uint64_t fd, uint64_t offset) { + void *ret; + register uint64_t a0 __asm__("a0") = addr; + register uint64_t a1 __asm__("a1") = size; + register uint64_t a2 __asm__("a2") = prot; + register uint64_t a3 __asm__("a3") = flags; + register uint64_t a4 __asm__("a4") = fd; + register uint64_t a5 __asm__("a5") = offset; + register uint32_t a7 __asm__("a7") = + 222; // Assuming 222 is the syscall number for mmap + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret) + : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a4), "r"(a5), + "r"(a7) + : "memory"); + return ret; +} + +uint64_t __munmap(void *addr, uint64_t size) { + uint64_t ret; + register void *a0 __asm__("a0") = addr; + register uint64_t a1 __asm__("a1") = size; + register uint32_t a7 __asm__("a7") = 215; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a7) + : "memory"); + return ret; +} + 
+uint64_t __exit(uint64_t code) {
+  uint64_t ret;
+  register uint64_t a0 __asm__("a0") = code;
+  register uint32_t a7 __asm__("a7") = 94;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0)
+                       : "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+uint64_t __open(const char *pathname, uint64_t flags, uint64_t mode) {
+  uint64_t ret;
+  register int a0 __asm__("a0") = -100; // AT_FDCWD, so openat acts like open
+  register const char *a1 __asm__("a1") = pathname;
+  register uint64_t a2 __asm__("a2") = flags;
+  register uint64_t a3 __asm__("a3") = mode;
+  register uint64_t a7 __asm__("a7") = 56; // syscall number for openat
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret)
+                       : "r"(a0), "r"(a1), "r"(a2), "r"(a3), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+long __getdents64(unsigned int fd, dirent64 *dirp, size_t count) {
+  long ret;
+  register unsigned int a0 __asm__("a0") = fd;
+  register dirent64 *a1 __asm__("a1") = dirp;
+  register size_t a2 __asm__("a2") = count;
+  register uint32_t a7 __asm__("a7") = 61;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0), "+r"(a1)
+                       : "r"(a2), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+uint64_t __readlink(const char *pathname, char *buf, size_t bufsize) {
+  uint64_t ret;
+  register int a0 __asm__("a0") = -100; // AT_FDCWD
+  register const char *a1 __asm__("a1") = pathname;
+  register char *a2 __asm__("a2") = buf;
+  register size_t a3 __asm__("a3") = bufsize;
+  register uint32_t a7 __asm__("a7") = 78; // readlinkat
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0), "+r"(a1)
+                       : "r"(a2), "r"(a3), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+uint64_t __lseek(uint64_t fd, uint64_t pos, uint64_t whence) {
+  uint64_t ret;
+  register uint64_t a0 __asm__("a0") = fd;
+  register uint64_t a1 __asm__("a1") = pos;
+  register uint64_t a2 __asm__("a2") = whence;
+  register uint32_t a7 __asm__("a7") = 62;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0), "+r"(a1)
+                       : "r"(a2), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+int __ftruncate(uint64_t fd, uint64_t length) {
+  int ret;
+  register uint64_t a0 __asm__("a0") = fd;
+  register uint64_t a1 __asm__("a1") = length;
+  register uint32_t a7 __asm__("a7") = 46;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0), "+r"(a1)
+                       : "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+int __close(uint64_t fd) {
+  int ret;
+  register uint64_t a0 __asm__("a0") = fd;
+  register uint32_t a7 __asm__("a7") = 57; // syscall number for close
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret)
+                       : "r"(a0), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+int __madvise(void *addr, size_t length, int advice) {
+  int ret;
+  register void *a0 __asm__("a0") = addr;
+  register size_t a1 __asm__("a1") = length;
+  register int a2 __asm__("a2") = advice;
+  register uint32_t a7 __asm__("a7") = 233;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0), "+r"(a1)
+                       : "r"(a2), "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+int __uname(struct UtsNameTy *buf) {
+  int ret;
+  register UtsNameTy *a0 __asm__("a0") = buf;
+  register uint32_t a7 __asm__("a7") = 160;
+  __asm__ __volatile__("ecall\n\t"
+                       "mv %0, a0"
+                       : "=r"(ret), "+r"(a0)
+                       : "r"(a7)
+                       : "memory");
+  return ret;
+}
+
+uint64_t __nanosleep(const timespec *req, timespec *rem) {
+  uint64_t ret;
+  register const timespec *a0 __asm__("a0") = req;
+  register timespec *a1 __asm__("a1") = rem;
+  register uint32_t a7 __asm__("a7") = 101;
+  __asm__ __volatile__("ecall\n\t"
__volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a7) + : "memory"); + return ret; +} + +int64_t __fork() { + uint64_t ret; + // clone instead of fork with flags + // "CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD" + register uint64_t a0 __asm__("a0") = 0x1200011; + register uint64_t a1 __asm__("a1") = 0; + register uint64_t a2 __asm__("a2") = 0; + register uint64_t a3 __asm__("a3") = 0; + register uint64_t a4 __asm__("a4") = 0; + register uint32_t a7 __asm__("a7") = 220; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a2), "r"(a3), "r"(a4), "r"(a7) + : "memory"); + return ret; +} + +int __mprotect(void *addr, size_t len, int prot) { + int ret; + register void *a0 __asm__("a0") = addr; + register size_t a1 __asm__("a1") = len; + register int a2 __asm__("a2") = prot; + register uint32_t a7 __asm__("a7") = 226; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a2), "r"(a7) + : "memory"); + return ret; +} + +uint64_t __getpid() { + uint64_t ret; + register uint32_t a7 __asm__("a7") = 172; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret) + : "r"(a7) + : "memory"); + return ret; +} + +uint64_t __getppid() { + uint64_t ret; + register uint32_t a7 __asm__("a7") = 173; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret) + : "r"(a7) + : "memory"); + return ret; +} + +int __setpgid(uint64_t pid, uint64_t pgid) { + int ret; + register uint64_t a0 __asm__("a0") = pid; + register uint64_t a1 __asm__("a1") = pgid; + register uint32_t a7 __asm__("a7") = 154; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a7) + : "memory"); + return ret; +} + +uint64_t __getpgid(uint64_t pid) { + uint64_t ret; + register uint64_t a0 __asm__("a0") = pid; + register uint32_t a7 __asm__("a7") = 155; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0) + : "r"(a7) + : "memory"); + return ret; +} + +int __kill(uint64_t pid, int sig) { + int ret; + register uint64_t a0 __asm__("a0") = pid; + register int a1 __asm__("a1") = sig; + register uint32_t a7 __asm__("a7") = 129; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a7) + : "memory"); + return ret; +} + +int __fsync(int fd) { + int ret; + register int a0 __asm__("a0") = fd; + register uint32_t a7 __asm__("a7") = 82; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0) + : "r"(a7) + : "memory"); + return ret; +} + +uint64_t __sigprocmask(int how, const void *set, void *oldset) { + uint64_t ret; + register int a0 __asm__("a0") = how; + register const void *a1 __asm__("a1") = set; + register void *a2 __asm__("a2") = oldset; + register long a3 asm("a3") = 8; + register uint32_t a7 __asm__("a7") = 135; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a2), "r"(a3), "r"(a7) + : "memory"); + return ret; +} + +int __prctl(int option, unsigned long arg2, unsigned long arg3, + unsigned long arg4, unsigned long arg5) { + int ret; + register int a0 __asm__("a0") = option; + register unsigned long a1 __asm__("a1") = arg2; + register unsigned long a2 __asm__("a2") = arg3; + register unsigned long a3 __asm__("a3") = arg4; + register unsigned long a4 __asm__("a4") = arg5; + register uint32_t a7 __asm__("a7") = 167; + __asm__ __volatile__("ecall\n\t" + "mv %0, a0" + : "=r"(ret), "+r"(a0), "+r"(a1) + : "r"(a2), "r"(a3), "r"(a4), "r"(a7) + : "cc", "memory"); + return ret; +} +} // anonymous 
+
+#endif
\ No newline at end of file
diff --git a/bolt/test/runtime/RISCV/basic-instrumentation.s b/bolt/test/runtime/RISCV/basic-instrumentation.s
new file mode 100644
index 000000000000..e926f98cef43
--- /dev/null
+++ b/bolt/test/runtime/RISCV/basic-instrumentation.s
@@ -0,0 +1,33 @@
+# REQUIRES: system-linux,bolt-runtime
+
+# RUN: %clang %cflags -Wl,-q -o %t.exe %s
+# RUN: llvm-bolt --instrument --instrumentation-file=%t.fdata -o %t.instr %t.exe
+
+## Run the profiled binary and check that the profile reports at least that `f`
+## has been called.
+# RUN: rm -f %t.fdata
+# RUN: %t.instr
+# RUN: cat %t.fdata | FileCheck %s
+# CHECK: f 0 0 1{{$}}
+
+## Check BOLT works with this profile
+# RUN: llvm-bolt --data %t.fdata --reorder-blocks=cache -o %t.bolt %t.exe
+
+    .text
+    .globl main
+    .type main, @function
+main:
+    addi sp, sp, -8
+    sd ra, 0(sp)
+    call f
+    ld ra, 0(sp)
+    addi sp, sp, 8
+    li a0, 0
+    ret
+    .size main, .-main
+
+    .globl f
+    .type f, @function
+f:
+    ret
+    .size f, .-f
diff --git a/bolt/test/runtime/RISCV/instrumentation-ind-call.c b/bolt/test/runtime/RISCV/instrumentation-ind-call.c
new file mode 100644
index 000000000000..1fd49a774c94
--- /dev/null
+++ b/bolt/test/runtime/RISCV/instrumentation-ind-call.c
@@ -0,0 +1,39 @@
+/*
+REQUIRES: system-linux,bolt-runtime
+
+RUN: %clang %cflags %s -o %t.exe -Wl,-q
+
+RUN: llvm-bolt %t.exe --instrument --instrumentation-file=%t.fdata \
+RUN:   -o %t.instrumented
+
+# Instrumented program needs to finish returning zero
+RUN: %t.instrumented | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+# Test that the instrumented data makes sense
+RUN: llvm-bolt %t.exe -o %t.bolted --data %t.fdata \
+RUN:   --reorder-blocks=ext-tsp --reorder-functions=hfsort+ \
+RUN:   --print-only=main --print-finalized | FileCheck %s
+
+RUN: %t.bolted | FileCheck %s -check-prefix=CHECK-OUTPUT
+
+CHECK-OUTPUT: The sum is: 30
+
+# Check that our indirect call has 1 hit recorded in the fdata file and that
+# this was processed correctly by BOLT
+CHECK: jalr a2 # CallProfile: 1 (0 misses) :
+CHECK-NEXT: { add: 1 (0 misses) }
+*/
+
+#include <stdio.h>
+
+typedef int (*func_ptr)(int, int);
+
+int add(int a, int b) { return a + b; }
+
+int main() {
+  func_ptr fun;
+  fun = add;
+  int sum = fun(10, 20); // indirect call to 'add'
+  printf("The sum is: %d\n", sum);
+  return 0;
+}
diff --git a/bolt/test/runtime/RISCV/lit.local.cfg b/bolt/test/runtime/RISCV/lit.local.cfg
new file mode 100644
index 000000000000..c0627d905ab3
--- /dev/null
+++ b/bolt/test/runtime/RISCV/lit.local.cfg
@@ -0,0 +1,2 @@
+if config.host_arch not in ["riscv64"]:
+    config.unsupported = True
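
Not part of the patch: a minimal sketch of how the instrumentation flow enabled by this change is typically exercised on a native riscv64 Linux host. The build directory, binary name, and paths are illustrative assumptions; the llvm-bolt flags mirror the RUN lines in the new tests above.

# Configure and build BOLT; BOLT_ENABLE_RUNTIME now defaults to ON for
# native riscv64 Linux builds after the bolt/CMakeLists.txt change above.
cmake -S llvm -B build -G Ninja -DCMAKE_BUILD_TYPE=Release \
      -DLLVM_ENABLE_PROJECTS="bolt;clang;lld" -DLLVM_TARGETS_TO_BUILD="RISCV"
ninja -C build bolt

# Instrument a binary (built with -Wl,-q), run it to collect a profile,
# then re-optimize the original binary with that profile.
build/bin/llvm-bolt app -o app.instr --instrument --instrumentation-file=app.fdata
./app.instr
build/bin/llvm-bolt app -o app.bolt --data=app.fdata --reorder-blocks=ext-tsp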