diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 1b5c77e9d..40d17d6b7 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -46,29 +46,141 @@ alignas(Recompiler::CODE_STORAGE_ALIGNMENT) static u8
 #endif
 
 static JitCodeBuffer s_code_buffer;
 
+static FastMapTable s_fast_map[FAST_MAP_TABLE_COUNT];
+static std::unique_ptr<CodeBlock::HostCodePointer[]> s_fast_map_pointers;
-std::array<CodeBlock::HostCodePointer, FAST_MAP_TOTAL_SLOT_COUNT> s_fast_map;
 DispatcherFunction s_asm_dispatcher;
 SingleBlockDispatcherFunction s_single_block_asm_dispatcher;
 
-ALWAYS_INLINE static u32 GetFastMapIndex(u32 pc)
+static FastMapTable DecodeFastMapPointer(u32 slot, FastMapTable ptr)
 {
-  return ((pc & PHYSICAL_MEMORY_ADDRESS_MASK) >= Bus::BIOS_BASE) ?
-           (FAST_MAP_RAM_SLOT_COUNT + ((pc & Bus::BIOS_MASK) >> 2)) :
-           ((pc & Bus::g_ram_mask) >> 2);
+  if constexpr (sizeof(void*) == 8)
+    return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) + (static_cast<u64>(slot) << 17));
+  else
+    return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) + (slot << 16));
+}
+
+static FastMapTable EncodeFastMapPointer(u32 slot, FastMapTable ptr)
+{
+  if constexpr (sizeof(void*) == 8)
+    return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) - (static_cast<u64>(slot) << 17));
+  else
+    return reinterpret_cast<FastMapTable>(reinterpret_cast<u8*>(ptr) - (slot << 16));
+}
+
+static CodeBlock::HostCodePointer* OffsetFastMapPointer(FastMapTable fake_ptr, u32 pc)
+{
+  u8* fake_byte_ptr = reinterpret_cast<u8*>(fake_ptr);
+  if constexpr (sizeof(void*) == 8)
+    return reinterpret_cast<CodeBlock::HostCodePointer*>(fake_byte_ptr + (static_cast<u64>(pc) << 1));
+  else
+    return reinterpret_cast<CodeBlock::HostCodePointer*>(fake_byte_ptr + pc);
 }
 
 static void CompileDispatcher();
 static void FastCompileBlockFunction();
+static void InvalidCodeFunction();
+
+static constexpr u32 GetTableCount(u32 start, u32 end)
+{
+  return ((end >> FAST_MAP_TABLE_SHIFT) - (start >> FAST_MAP_TABLE_SHIFT)) + 1;
+}
+
+static void AllocateFastMapTables(u32 start, u32 end, FastMapTable& table_ptr)
+{
+  const u32 start_slot = start >> FAST_MAP_TABLE_SHIFT;
+  const u32 count = GetTableCount(start, end);
+  for (u32 i = 0; i < count; i++)
+  {
+    const u32 slot = start_slot + i;
+
+    s_fast_map[slot] = EncodeFastMapPointer(slot, table_ptr);
+    table_ptr += FAST_MAP_TABLE_SIZE;
+  }
+}
+
+static void AllocateFastMap()
+{
+  static constexpr VirtualMemoryAddress ranges[][2] = {
+    {0x00000000, 0x00800000}, // RAM
+    {0x1F000000, 0x1F800000}, // EXP1
+    {0x1FC00000, 0x1FC80000}, // BIOS
+
+    {0x80000000, 0x80800000}, // RAM
+    {0x9F000000, 0x9F800000}, // EXP1
+    {0x9FC00000, 0x9FC80000}, // BIOS
+
+    {0xA0000000, 0xA0800000}, // RAM
+    {0xBF000000, 0xBF800000}, // EXP1
+    {0xBFC00000, 0xBFC80000}  // BIOS
+  };
+
+  u32 num_tables = 1; // unreachable table
+  for (u32 i = 0; i < countof(ranges); i++)
+    num_tables += GetTableCount(ranges[i][0], ranges[i][1]);
+
+  const u32 num_slots = FAST_MAP_TABLE_SIZE * num_tables;
+  if (!s_fast_map_pointers)
+    s_fast_map_pointers = std::make_unique<CodeBlock::HostCodePointer[]>(num_slots);
+
+  FastMapTable table_ptr = s_fast_map_pointers.get();
+  FastMapTable table_ptr_end = table_ptr + num_slots;
+
+  // Fill the first table with invalid/unreachable.
+  for (u32 i = 0; i < FAST_MAP_TABLE_SIZE; i++)
+    table_ptr[i] = InvalidCodeFunction;
+
+  // And the remaining with block compile pointers.
+  for (u32 i = FAST_MAP_TABLE_SIZE; i < num_slots; i++)
+    table_ptr[i] = FastCompileBlockFunction;
+
+  // Mark everything as unreachable to begin with.
+  for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++)
+    s_fast_map[i] = EncodeFastMapPointer(i, table_ptr);
+  table_ptr += FAST_MAP_TABLE_SIZE;
+
+  // Allocate ranges.
+  for (u32 i = 0; i < countof(ranges); i++)
+    AllocateFastMapTables(ranges[i][0], ranges[i][1], table_ptr);
+
+  Assert(table_ptr == table_ptr_end);
+}
 
 static void ResetFastMap()
 {
-  s_fast_map.fill(FastCompileBlockFunction);
+  if (!s_fast_map_pointers)
+    return;
+
+  for (u32 i = 0; i < FAST_MAP_TABLE_COUNT; i++)
+  {
+    FastMapTable ptr = DecodeFastMapPointer(i, s_fast_map[i]);
+    if (ptr == s_fast_map_pointers.get())
+      continue;
+
+    for (u32 j = 0; j < FAST_MAP_TABLE_SIZE; j++)
+      ptr[j] = FastCompileBlockFunction;
+  }
+}
+
+static void FreeFastMap()
+{
+  std::memset(s_fast_map, 0, sizeof(s_fast_map));
+  s_fast_map_pointers.reset();
 }
 
 static void SetFastMap(u32 pc, CodeBlock::HostCodePointer function)
 {
-  s_fast_map[GetFastMapIndex(pc)] = function;
+  if (!s_fast_map_pointers)
+    return;
+
+  const u32 slot = pc >> FAST_MAP_TABLE_SHIFT;
+  FastMapTable encoded_ptr = s_fast_map[slot];
+
+  const FastMapTable table_ptr = DecodeFastMapPointer(slot, encoded_ptr);
+  Assert(table_ptr != nullptr && table_ptr != s_fast_map_pointers.get());
+
+  CodeBlock::HostCodePointer* ptr = OffsetFastMapPointer(encoded_ptr, pc);
+  *ptr = function;
 }
 
 #endif
@@ -138,11 +250,13 @@ void Initialize()
       Panic("Failed to initialize code space");
     }
 
+    AllocateFastMap();
+
     if (g_settings.IsUsingFastmem() && !InitializeFastmem())
       Panic("Failed to initialize fastmem");
 
-    ResetFastMap();
     CompileDispatcher();
+    ResetFastMap();
   }
 #endif
 }
@@ -169,6 +283,7 @@ void Shutdown()
   ClearState();
 #ifdef WITH_RECOMPILER
   ShutdownFastmem();
+  FreeFastMap();
   s_code_buffer.Destroy();
 #endif
 }
@@ -305,9 +420,9 @@ void CompileDispatcher()
   s_code_buffer.WriteProtect(true);
 }
 
-CodeBlock::HostCodePointer* GetFastMapPointer()
+FastMapTable* GetFastMapPointer()
 {
-  return s_fast_map.data();
+  return s_fast_map;
 }
 
 void ExecuteRecompiler()
@@ -334,8 +449,7 @@ void ExecuteRecompiler()
       const u32 pc = g_state.regs.pc;
       g_state.current_instruction_pc = pc;
 
-      const u32 fast_map_index = GetFastMapIndex(pc);
-      s_single_block_asm_dispatcher(s_fast_map[fast_map_index]);
+      s_single_block_asm_dispatcher(s_fast_map[pc >> 16][pc >> 2]);
     }
 
     TimingEvents::RunEvents();
@@ -503,7 +617,7 @@ recompile:
     if (block->recompile_count >= RECOMPILE_COUNT_TO_FALL_BACK_TO_INTERPRETER)
    {
       Log_PerfPrintf("Block 0x%08X has been recompiled %u times in %u frames, falling back to interpreter",
-                     block->GetPC(), block->recompile_count, frame_diff);
+                      block->GetPC(), block->recompile_count, frame_diff);
 
       FallbackExistingBlockToInterpreter(block);
       return false;
@@ -683,11 +797,36 @@ void FastCompileBlockFunction()
 {
   CodeBlock* block = LookupBlock(GetNextBlockKey());
   if (block)
+  {
     s_single_block_asm_dispatcher(block->host_code);
+  }
   else if (g_settings.gpu_pgxp_enable)
-    InterpretUncachedBlock<PGXPMode::Memory>();
+  {
+    if (g_settings.gpu_pgxp_cpu)
+      InterpretUncachedBlock<PGXPMode::CPU>();
+    else
+      InterpretUncachedBlock<PGXPMode::Memory>();
+  }
   else
+  {
     InterpretUncachedBlock<PGXPMode::Disabled>();
+  }
+}
+
+void InvalidCodeFunction()
+{
+  Log_ErrorPrintf("Trying to execute invalid code at 0x%08X", g_state.regs.pc);
+  if (g_settings.gpu_pgxp_enable)
+  {
+    if (g_settings.gpu_pgxp_cpu)
+      InterpretUncachedBlock<PGXPMode::CPU>();
+    else
+      InterpretUncachedBlock<PGXPMode::Memory>();
+  }
+  else
+  {
+    InterpretUncachedBlock<PGXPMode::Disabled>();
+  }
 }
 
 #endif
diff --git a/src/core/cpu_code_cache.h b/src/core/cpu_code_cache.h
index d2fcd0375..a32bd944d 100644
--- a/src/core/cpu_code_cache.h
+++ b/src/core/cpu_code_cache.h
@@ -16,13 +16,6 @@
 
 namespace CPU {
 
-enum : u32
-{
-  FAST_MAP_RAM_SLOT_COUNT = Bus::RAM_8MB_SIZE / 4,
-  FAST_MAP_BIOS_SLOT_COUNT = Bus::BIOS_SIZE / 4,
-  FAST_MAP_TOTAL_SLOT_COUNT = FAST_MAP_RAM_SLOT_COUNT + FAST_MAP_BIOS_SLOT_COUNT,
-};
-
 union CodeBlockKey
 {
   u32 bits;
@@ -107,6 +100,15 @@ struct CodeBlock
 
 namespace CodeCache {
 
+enum : u32
+{
+  FAST_MAP_TABLE_COUNT = 0x10000,
+  FAST_MAP_TABLE_SIZE = 0x10000 / 4, // 16384
+  FAST_MAP_TABLE_SHIFT = 16,
+};
+
+using FastMapTable = CodeBlock::HostCodePointer*;
+
 void Initialize();
 void Shutdown();
 void Execute();
@@ -115,7 +117,7 @@ void Execute();
 using DispatcherFunction = void (*)();
 using SingleBlockDispatcherFunction = void(*)(const CodeBlock::HostCodePointer);
 
-CodeBlock::HostCodePointer* GetFastMapPointer();
+FastMapTable* GetFastMapPointer();
 
 void ExecuteRecompiler();
 #endif
diff --git a/src/core/cpu_recompiler_code_generator_aarch32.cpp b/src/core/cpu_recompiler_code_generator_aarch32.cpp
index 3ab3014c0..723f7ee42 100644
--- a/src/core/cpu_recompiler_code_generator_aarch32.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch32.cpp
@@ -2028,29 +2028,18 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
 
   // time to lookup the block
   // r0 <- pc
-  m_emit->Mov(a32::r3, Bus::BIOS_BASE);
   m_emit->ldr(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, regs.pc)));
 
-  // current_instruction_pc <- pc (eax)
+  // r1 <- s_fast_map[pc >> 16]
+  EmitLoadGlobalAddress(2, CodeCache::GetFastMapPointer());
+  m_emit->lsr(a32::r1, a32::r0, 16);
+  m_emit->ldr(a32::r1, a32::MemOperand(a32::r2, a32::r1, a32::LSL, 2));
+
+  // current_instruction_pc <- pc (r0)
   m_emit->str(a32::r0, a32::MemOperand(GetHostReg32(RCPUPTR), offsetof(State, current_instruction_pc)));
 
-  // r1 <- (pc & RAM_MASK) >> 2
-  m_emit->and_(a32::r1, a32::r0, Bus::g_ram_mask);
-  m_emit->lsr(a32::r1, a32::r1, 2);
-
-  // r2 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
-  m_emit->and_(a32::r2, a32::r0, Bus::BIOS_MASK);
-  m_emit->lsr(a32::r2, a32::r2, 2);
-  m_emit->add(a32::r2, a32::r2, FAST_MAP_RAM_SLOT_COUNT);
-
-  // if ((r0 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use r2 as index }
-  m_emit->and_(a32::r0, a32::r0, PHYSICAL_MEMORY_ADDRESS_MASK);
-  m_emit->cmp(a32::r0, a32::r3);
-  m_emit->mov(a32::ge, a32::r1, a32::r2);
-
-  // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
-  EmitLoadGlobalAddress(0, CodeCache::GetFastMapPointer());
-  m_emit->ldr(a32::r0, a32::MemOperand(a32::r0, a32::r1, a32::LSL, 2));
+  // blx(r1[pc]) (fast_map[pc >> 2])
+  m_emit->ldr(a32::r0, a32::MemOperand(a32::r1, a32::r0));
   m_emit->blx(a32::r0);
 
   // end while
diff --git a/src/core/cpu_recompiler_code_generator_aarch64.cpp b/src/core/cpu_recompiler_code_generator_aarch64.cpp
index 76f542637..96ff6f9d6 100644
--- a/src/core/cpu_recompiler_code_generator_aarch64.cpp
+++ b/src/core/cpu_recompiler_code_generator_aarch64.cpp
@@ -2239,29 +2239,19 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
 
   // time to lookup the block
   // w8 <- pc
-  m_emit->Mov(a64::w11, Bus::BIOS_BASE);
   m_emit->ldr(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, regs.pc)));
 
-  // current_instruction_pc <- pc (eax)
+  // x9 <- s_fast_map[pc >> 16]
+  EmitLoadGlobalAddress(10, CodeCache::GetFastMapPointer());
+  m_emit->lsr(a64::w9, a64::w8, 16);
+  m_emit->ldr(a64::x9, a64::MemOperand(a64::x10, a64::x9, a64::LSL, 3));
+
+  // current_instruction_pc <- pc (w8)
   m_emit->str(a64::w8, a64::MemOperand(GetHostReg64(RCPUPTR), offsetof(State, current_instruction_pc)));
 
-  // w9 <- (pc & RAM_MASK) >> 2
-  m_emit->and_(a64::w9, a64::w8, Bus::g_ram_mask);
-  m_emit->lsr(a64::w9, a64::w9, 2);
-
-  // w10 <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
-  m_emit->and_(a64::w10, a64::w8, Bus::BIOS_MASK);
-  m_emit->lsr(a64::w10, a64::w10, 2);
-  m_emit->add(a64::w10, a64::w10, FAST_MAP_RAM_SLOT_COUNT);
-
-  // if ((w8 (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use w10 as index }
-  m_emit->and_(a64::w8, a64::w8, PHYSICAL_MEMORY_ADDRESS_MASK);
-  m_emit->cmp(a64::w8, a64::w11);
-  m_emit->csel(a64::w8, a64::w9, a64::w10, a64::lt);
-
-  // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
-  EmitLoadGlobalAddress(9, CodeCache::GetFastMapPointer());
-  m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3));
+  // blr(x9[pc * 2]) (fast_map[pc >> 2])
+  m_emit->lsr(a64::w8, a64::w8, 2);
+  m_emit->ldr(a64::x8, a64::MemOperand(a64::x9, a64::x8, a64::LSL, 3));
   m_emit->blr(a64::x8);
 
   // end while
diff --git a/src/core/cpu_recompiler_code_generator_x64.cpp b/src/core/cpu_recompiler_code_generator_x64.cpp
index 746282fb1..a65aa370e 100644
--- a/src/core/cpu_recompiler_code_generator_x64.cpp
+++ b/src/core/cpu_recompiler_code_generator_x64.cpp
@@ -2996,29 +2996,18 @@ CodeCache::DispatcherFunction CodeGenerator::CompileDispatcher()
   // eax <- pc
   m_emit->mov(m_emit->eax, m_emit->dword[m_emit->rbp + offsetof(State, regs.pc)]);
 
-  // ebx <- (pc & RAM_MASK) >> 2
-  m_emit->mov(m_emit->ebx, m_emit->eax);
-  m_emit->and_(m_emit->ebx, Bus::g_ram_mask);
-  m_emit->shr(m_emit->ebx, 2);
-
-  // ecx <- ((pc & BIOS_MASK) >> 2) + FAST_MAP_RAM_SLOT_COUNT
-  m_emit->mov(m_emit->ecx, m_emit->eax);
-  m_emit->and_(m_emit->ecx, Bus::BIOS_MASK);
-  m_emit->shr(m_emit->ecx, 2);
-  m_emit->add(m_emit->ecx, FAST_MAP_RAM_SLOT_COUNT);
-
   // current_instruction_pc <- pc (eax)
   m_emit->mov(m_emit->dword[m_emit->rbp + offsetof(State, current_instruction_pc)], m_emit->eax);
 
-  // if ((eax (pc) & PHYSICAL_MEMORY_ADDRESS_MASK) >= BIOS_BASE) { use ecx as index }
-  m_emit->and_(m_emit->eax, PHYSICAL_MEMORY_ADDRESS_MASK);
-  m_emit->cmp(m_emit->eax, Bus::BIOS_BASE);
-  m_emit->cmovge(m_emit->ebx, m_emit->ecx);
+  // rcx <- s_fast_map[pc >> 16]
+  EmitLoadGlobalAddress(Xbyak::Operand::RBX, CodeCache::GetFastMapPointer());
+  m_emit->mov(m_emit->ecx, m_emit->eax);
+  m_emit->shr(m_emit->ecx, 16);
+  m_emit->mov(m_emit->rcx, m_emit->qword[m_emit->rbx + m_emit->rcx * 8]);
+
+  // call(rcx[pc * 2]) (fast_map[pc >> 2])
+  m_emit->call(m_emit->qword[m_emit->rcx + m_emit->rax * 2]);
 
-  // ebx contains our index, rax <- fast_map[ebx * 8], rax(), continue
-  EmitLoadGlobalAddress(Xbyak::Operand::RAX, CodeCache::GetFastMapPointer());
-  m_emit->mov(m_emit->rax, m_emit->qword[m_emit->rax + m_emit->rbx * 8]);
-  m_emit->call(m_emit->rax);
   m_emit->jmp(main_loop);
 
   // end while
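
Note on the lookup scheme: each top-level s_fast_map entry stores its page's table pointer biased by -(slot << 17) on 64-bit hosts (-(slot << 16) on 32-bit), so the dispatchers can index with the raw pc and the page bits cancel out, eliminating the old RAM/BIOS range compare. Below is a minimal standalone C++ sketch of that arithmetic for a single page; it is not part of the patch, assumes a 64-bit host, and all names in it are illustrative.

// sketch_fast_map.cpp -- illustrative reconstruction of the biased-pointer
// trick used by the two-level fast map above. Not DuckStation code.
#include <cassert>
#include <cstdint>
#include <cstdio>

using u8 = uint8_t;
using u32 = uint32_t;
using u64 = uint64_t;
using HostCodePointer = void (*)();

constexpr u32 TABLE_SHIFT = 16;         // one table per 64KiB page
constexpr u32 TABLE_SIZE = 0x10000 / 4; // one entry per 4-byte instruction

static void Stub()
{
  std::puts("block dispatched");
}

// Bias the table pointer by -(slot << 17): with 8-byte entries, indexing the
// biased pointer with (pc >> 2) touches byte offset (pc >> 2) * 8 == pc * 2,
// and pc * 2 == (slot << 17) + ((pc & 0xFFFF) << 1), so the bias cancels the
// page bits and the access lands at table + ((pc & 0xFFFF) >> 2) * 8.
static HostCodePointer* EncodeTablePointer(u32 slot, HostCodePointer* table)
{
  return reinterpret_cast<HostCodePointer*>(reinterpret_cast<u8*>(table) -
                                            (static_cast<u64>(slot) << 17));
}

int main()
{
  static HostCodePointer table[TABLE_SIZE]; // table for the 0x8001xxxx page
  const u32 pc = 0x80010008u;               // word-aligned pc in that page
  const u32 slot = pc >> TABLE_SHIFT;

  HostCodePointer* encoded = EncodeTablePointer(slot, table);
  table[(pc & 0xFFFFu) >> 2] = &Stub;

  // The dispatcher's whole lookup: two dependent loads, no masking and no
  // RAM-vs-BIOS range comparison.
  assert(encoded[pc >> 2] == &Stub);
  encoded[pc >> 2]();
  return 0;
}

The trade is one extra dependent load per dispatch in place of the mask/compare/cmov (or csel) sequence, and a pc in an unmapped page now resolves to InvalidCodeFunction through the shared unreachable table instead of indexing a RAM-sized array out of range.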