From 52e0d8d473afc389a7f8753fa16a2c7344a1b08d Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Thu, 19 Oct 2023 21:53:57 +1000
Subject: [PATCH] CPU/Recompiler/AArch32: Load membase on demand

---
 src/core/cpu_newrec_compiler_aarch64.cpp      | 31 +++++++++---------
 src/core/cpu_recompiler_code_generator.h      |  2 ++
 .../cpu_recompiler_code_generator_aarch32.cpp | 32 +++++++++++--------
 src/core/cpu_recompiler_types.h               |  1 -
 4 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/src/core/cpu_newrec_compiler_aarch64.cpp b/src/core/cpu_newrec_compiler_aarch64.cpp
index 82b015877..53cf75ae9 100644
--- a/src/core/cpu_newrec_compiler_aarch64.cpp
+++ b/src/core/cpu_newrec_compiler_aarch64.cpp
@@ -382,7 +382,6 @@ void CPU::NewRec::AArch64Compiler::EndAndLinkBlock(const std::optional<u32>& new
   DebugAssert(!m_dirty_pc);
 
   // TODO: try extracting this to a function
-  // TODO: move the cycle flush in here..
 
   // save cycles for event test
   const TickCount cycles = std::exchange(m_cycles, 0);
@@ -621,7 +620,12 @@ void CPU::NewRec::AArch64Compiler::Flush(u32 flags)
   if (flags & FLUSH_INSTRUCTION_BITS)
   {
     // This sucks, but it's only used for fallbacks.
-    Panic("Not implemented");
+    EmitMov(RWARG1, inst->bits);
+    EmitMov(RWARG2, m_current_instruction_pc);
+    EmitMov(RWARG3, m_current_instruction_branch_delay_slot);
+    armAsm->str(RWARG1, PTR(&g_state.current_instruction.bits));
+    armAsm->str(RWARG2, PTR(&g_state.current_instruction_pc));
+    armAsm->strb(RWARG3, PTR(&g_state.current_instruction_in_branch_delay_slot));
   }
 
   if (flags & FLUSH_LOAD_DELAY_FROM_STATE && m_load_delay_dirty)
@@ -699,26 +703,23 @@ void CPU::NewRec::AArch64Compiler::Compile_Fallback()
 {
   Flush(FLUSH_FOR_INTERPRETER);
 
-#if 0
-  cg->call(&CPU::Recompiler::Thunks::InterpretInstruction);
+  EmitCall(armAsm, &CPU::Recompiler::Thunks::InterpretInstruction);
 
   // TODO: make me less garbage
   // TODO: this is wrong, it flushes the load delay on the same cycle when we return.
   // but nothing should be going through here..
   Label no_load_delay;
-  cg->movzx(RWARG1, cg->byte[PTR(&g_state.next_load_delay_reg)]);
-  cg->cmp(RWARG1, static_cast<u8>(Reg::count));
-  cg->je(no_load_delay, CodeGenerator::T_SHORT);
-  cg->mov(RWARG2, cg->dword[PTR(&g_state.next_load_delay_value)]);
-  cg->mov(cg->byte[PTR(&g_state.load_delay_reg)], RWARG1);
-  cg->mov(cg->dword[PTR(&g_state.load_delay_value)], RWARG2);
-  cg->mov(cg->byte[PTR(&g_state.next_load_delay_reg)], static_cast<u8>(Reg::count));
-  cg->L(no_load_delay);
+  armAsm->ldrb(RWARG1, PTR(&g_state.next_load_delay_reg));
+  armAsm->cmp(RWARG1, static_cast<u8>(Reg::count));
+  armAsm->b(&no_load_delay, eq);
+  armAsm->ldr(RWARG2, PTR(&g_state.next_load_delay_value));
+  armAsm->strb(RWARG1, PTR(&g_state.load_delay_reg));
+  armAsm->str(RWARG2, PTR(&g_state.load_delay_value));
+  EmitMov(RWARG1, static_cast<u8>(Reg::count));
+  armAsm->strb(RWARG1, PTR(&g_state.next_load_delay_reg));
+  armAsm->bind(&no_load_delay);
 
   m_load_delay_dirty = EMULATE_LOAD_DELAYS;
-#else
-  Panic("Fixme");
-#endif
 }
 
 void CPU::NewRec::AArch64Compiler::CheckBranchTarget(const vixl::aarch64::WRegister& pcreg)
diff --git a/src/core/cpu_recompiler_code_generator.h b/src/core/cpu_recompiler_code_generator.h
index e8d11da8b..68ffc31c9 100644
--- a/src/core/cpu_recompiler_code_generator.h
+++ b/src/core/cpu_recompiler_code_generator.h
@@ -122,6 +122,7 @@ public:
                                    const Value& address, RegSize size, const Value& value);
   void EmitStoreGuestMemorySlowmem(Instruction instruction, const CodeCache::InstructionInfo& info,
                                    const Value& address, RegSize size, const Value& value, bool in_far_code);
+  void EnsureMembaseLoaded();
   void EmitUpdateFastmemBase();
 
   // Unconditional branch to pointer. May allocate a scratch register.
@@ -291,6 +292,7 @@ private:
   bool m_load_delay_dirty = false;
   bool m_next_load_delay_dirty = false;
   bool m_gte_busy_cycles_dirty = false;
+  bool m_membase_loaded = false;
 
   //////////////////////////////////////////////////////////////////////////
   // Speculative Constants
diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp
index dccd1ed75..8ab2bf6d0 100644
--- a/src/core/cpu_recompiler_code_generator_aarch32.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp
@@ -224,13 +224,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
 #undef RARG3
 #undef RARG4
 #undef RSCRATCH
-#undef RMEMBASE
 #undef RSTATE
 
 namespace CPU::Recompiler {
 
 constexpr HostReg RCPUPTR = 4;
-constexpr HostReg RMEMBASEPTR = 5;
+constexpr HostReg RMEMBASEPTR = 3;
 constexpr HostReg RRETURN = 0;
 constexpr HostReg RARG1 = 0;
 constexpr HostReg RARG2 = 1;
@@ -385,14 +384,6 @@ void CodeGenerator::EmitBeginBlock(bool allocate_registers /* = true */)
     // m_emit->Mov(GetCPUPtrReg(), reinterpret_cast<uintptr_t>(&g_state));
     DebugAssert(cpu_reg_allocated);
     UNREFERENCED_VARIABLE(cpu_reg_allocated);
-
-    // If there's loadstore instructions, preload the fastmem base.
-    if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
-    {
-      const bool fastmem_reg_allocated = m_register_cache.AllocateHostReg(RMEMBASEPTR);
-      Assert(fastmem_reg_allocated);
-      m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
-    }
   }
 }
 
@@ -400,9 +391,6 @@ void CodeGenerator::EmitEndBlock(bool free_registers /* = true */, const void* j
 {
   if (free_registers)
   {
-    if (m_block->HasFlag(CodeCache::BlockFlags::ContainsLoadStoreInstructions))
-      m_register_cache.FreeHostReg(RMEMBASEPTR);
-
     m_register_cache.FreeHostReg(RCPUPTR);
     m_register_cache.FreeHostReg(14);
     m_register_cache.PopCalleeSavedRegisters(true);
@@ -1058,6 +1046,7 @@ void CodeGenerator::EmitSetConditionResult(HostReg to_reg, RegSize to_size, Cond
 u32 CodeGenerator::PrepareStackForCall()
 {
   m_register_cache.PushCallerSavedRegisters();
+  m_membase_loaded = false;
   return 0;
 }
 
@@ -1351,13 +1340,24 @@ void CodeGenerator::EmitAddCPUStructField(u32 offset, const Value& value)
   }
 }
 
+void CodeGenerator::EnsureMembaseLoaded()
+{
+  if (m_membase_loaded)
+    return;
+
+  m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
+  m_membase_loaded = true;
+}
+
 void CodeGenerator::EmitUpdateFastmemBase()
 {
-  m_emit->Ldr(GetFastmemBasePtrReg(), a32::MemOperand(GetCPUPtrReg(), offsetof(State, fastmem_base)));
+  m_membase_loaded = false;
 }
 
 void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size, Value& result)
 {
+  EnsureMembaseLoaded();
+
   HostReg address_reg;
   if (address.IsConstant())
   {
@@ -1396,6 +1396,8 @@ void CodeGenerator::EmitLoadGuestRAMFastmem(const Value& address, RegSize size,
 void CodeGenerator::EmitLoadGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
                                                const Value& address, RegSize size, Value& result)
 {
+  EnsureMembaseLoaded();
+
   HostReg address_reg;
   if (address.IsConstant())
   {
@@ -1538,6 +1540,8 @@ void CodeGenerator::EmitLoadGuestMemorySlowmem(Instruction instruction, const Co
 void CodeGenerator::EmitStoreGuestMemoryFastmem(Instruction instruction, const CodeCache::InstructionInfo& info,
                                                 const Value& address, RegSize size, const Value& value)
 {
+  EnsureMembaseLoaded();
+
   Value actual_value = GetValueInHostRegister(value);
 
   HostReg address_reg;
diff --git a/src/core/cpu_recompiler_types.h b/src/core/cpu_recompiler_types.h
index 1b89317a8..ec70cf02e 100644
--- a/src/core/cpu_recompiler_types.h
+++ b/src/core/cpu_recompiler_types.h
@@ -84,7 +84,6 @@ constexpr u32 MAX_FAR_HOST_BYTES_PER_INSTRUCTION = 128;
 #define RARG4 vixl::aarch32::r3
 #define RSCRATCH vixl::aarch32::r12
 #define RSTATE vixl::aarch32::r4
-#define RMEMBASE vixl::aarch32::r5
 
 s32 armGetPCDisplacement(const void* current, const void* target);
 bool armIsPCDisplacementInImmediateRange(s32 displacement);
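
A note on the pattern: the patch replaces the eager per-block membase load in
EmitBeginBlock() with lazy materialization. EmitUpdateFastmemBase() no longer
reloads the register; it only clears m_membase_loaded, and the next fastmem
access pulls the base back in through EnsureMembaseLoaded(). The sketch below
shows that idea in isolation; Emitter, InvalidateMembase(), EmitGuestLoad() and
the mock instruction strings are hypothetical stand-ins for illustration, not
the DuckStation API.

  #include <cstdio>

  struct Emitter
  {
    // Stand-in for m_emit->Ldr(membase, MemOperand(cpu, offsetof(State, fastmem_base))).
    void LoadMembase() { std::printf("ldr r3, [rstate, #fastmem_base]\n"); }
    // Stand-in for an actual fastmem guest access through the base register.
    void FastmemAccess() { std::printf("ldr r0, [r3, r1]\n"); }
  };

  class CodeGen
  {
  public:
    // Materialize the base only when a memory access actually needs it, so
    // blocks without loads/stores never pay for the ldr, and the register
    // stays free for other uses until then.
    void EnsureMembaseLoaded()
    {
      if (m_membase_loaded)
        return;
      m_emit.LoadMembase();
      m_membase_loaded = true;
    }

    // Mirrors EmitUpdateFastmemBase()/PrepareStackForCall(): the cached value
    // may be stale or clobbered, so forget it and reload lazily next time.
    void InvalidateMembase() { m_membase_loaded = false; }

    void EmitGuestLoad()
    {
      EnsureMembaseLoaded(); // first access in a block emits the base load
      m_emit.FastmemAccess();
    }

  private:
    Emitter m_emit;
    bool m_membase_loaded = false;
  };

  int main()
  {
    CodeGen cg;
    cg.EmitGuestLoad();     // emits base load + access
    cg.EmitGuestLoad();     // base already cached: access only
    cg.InvalidateMembase(); // e.g. after a call that may clobber the register
    cg.EmitGuestLoad();     // reloads the base first
    return 0;
  }

Moving RMEMBASEPTR from r5 to r3 fits this scheme: r3 is a caller-saved
argument register in the AArch32 ABI, which suits a value that is simply
reloaded on demand after calls, while the callee-saved r5 is left available.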